diff --git a/graphengine b/graphengine index b56450bde6d5afa1c557437ebf154487afe355f0..236001806129e36c0f48b240c4f61b2e1d92c470 160000 --- a/graphengine +++ b/graphengine @@ -1 +1 @@ -Subproject commit b56450bde6d5afa1c557437ebf154487afe355f0 +Subproject commit 236001806129e36c0f48b240c4f61b2e1d92c470 diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/logit.h b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/logit.h index 1df19e6d37d1ac8096e0ce439cd9bb8a65468100..8daaa4c12d2c0714cf65c0d54d7b18163f2f2a11 100644 --- a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/logit.h +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/logit.h @@ -24,10 +24,8 @@ class LogitCpuKernel : public CpuKernel { public: LogitCpuKernel() = default; ~LogitCpuKernel() override = default; - protected: uint32_t Compute(CpuKernelContext &ctx) override; - private: template <typename T> uint32_t LogitCompute(CpuKernelContext &ctx); diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/attention_parameter.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/attention_parameter.h index c3b600b234ff591bcf1a385c2eea08d862854a72..aabd3121c43ef07606af24bc6e64ca60706812e0 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/attention_parameter.h +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/attention_parameter.h @@ -23,6 +23,7 @@ typedef struct AttentionParameter { int head_num_; int head_size_; bool cross_; + float scale_; } AttentionParameter; typedef struct RelativePositionAttentionParameter { diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/decoder_layer_parameter.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/decoder_layer_parameter.h new file mode 100644 index 0000000000000000000000000000000000000000..4c3254fc2f5652e584e3254d5978aa420c933c78 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/decoder_layer_parameter.h @@ -0,0 +1,38 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_NNACL_DECODER_LAYER_PARAMETER_H_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_NNACL_DECODER_LAYER_PARAMETER_H_ + +#include "nnacl/op_base.h" + +typedef struct DecoderLayerParameter { + OpParameter op_parameter_; + int head_num_; + int head_size_; + bool post_layernorm_; + float eps_layernorm1_; + float eps_layernorm2_; + float eps_layernorm3_; + int ffn_hidden_size_; + bool position_bias1_; + bool position_bias2_; + float scale1_; + float scale2_; + ActType act_type_; +} DecoderLayerParameter; + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_NNACL_DECODER_LAYER_PARAMETER_H_ diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.c new file mode 100644 index 0000000000000000000000000000000000000000..401acdea7ab089ddc3e77a37aaac3268236403c7 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.c @@ -0,0 +1,37 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include "nnacl/infer/decoder_layer_infer.h" +#include "nnacl/infer/infer_register.h" +#include "nnacl/decoder_layer_parameter.h" + +int DecoderLayerInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size, + OpParameter *parameter) { + int check_ret = CheckAugmentWithMinSize(inputs, inputs_size, outputs, outputs_size, parameter, C16NUM, C1NUM); + if (check_ret != NNACL_OK) { + return check_ret; + } + const TensorC *input = inputs[FIRST_INPUT]; + TensorC *output0 = outputs[FIRST_INPUT]; + SetDataTypeFormat(output0, input); + if (!InferFlag(inputs, inputs_size)) { + return NNACL_INFER_INVALID; + } + SetShapeTensor(output0, input); + return NNACL_OK; +} + +REG_INFER(DecoderLayer, PrimType_Inner_DecoderLayer, DecoderLayerInferShape) diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.h new file mode 100644 index 0000000000000000000000000000000000000000..facdcc50662785ff9a86cab7e49980b08f245311 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.h @@ -0,0 +1,31 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_NNACL_INFER_DECODER_LAYER_INFER_H_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_NNACL_INFER_DECODER_LAYER_INFER_H_ + +#include "nnacl/infer/common_infer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +int DecoderLayerInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size, + OpParameter *parameter); + +#ifdef __cplusplus +} +#endif +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_NNACL_INFER_DECODER_LAYER_INFER_H_ diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/infer_register.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/infer_register.c index f9ddf940dd2be79af5b7cb1ad6213b9a21ce73d4..042ccf149dcaaa385c20051258ec6a086e8c9ef4 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/infer_register.c +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/infer_register.c @@ -42,6 +42,8 @@ #include "nnacl/infer/common_infer.h" #include "nnacl/infer/concat_infer.h" #include "nnacl/infer/constant_of_shape_infer.h" +#include "nnacl/infer/decoder_layer_infer.h" + #ifdef MSLITE_ENABLE_CONTROLFLOW #include "nnacl/infer/control/tensor_array_infer.h" #include "nnacl/infer/control/tensor_array_read_infer.h" @@ -404,7 +406,7 @@ void RegAllInferFunc5() { #ifndef RUNTIME_PASS_CLIP g_inner_op_infer_func[PrimType_Inner_ShapeFusion - PrimType_InnerOpMin] = ShapeFusionInferShape; g_inner_op_infer_func[PrimType_Inner_EncoderLayer - PrimType_InnerOpMin] = EncoderLayerInferShape; - + g_inner_op_infer_func[PrimType_Inner_DecoderLayer - PrimType_InnerOpMin] = DecoderLayerInferShape; #endif g_inner_op_infer_func[PrimType_Inner_ToFormat - PrimType_InnerOpMin] = NULL; } diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h index ad88e3cc7053a7218dbdcf8c0cd09b8f4ef897ee..2ed8cb8cdbdb61a52a0e2f9a884ed7ca436bbeea 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h @@ -41,9 +41,15 @@ #define C12NUM 12 #define C13NUM 13 #define C14NUM 14 +#define C15NUM 15 #define C16NUM 16 +#define C17NUM 17 +#define C18NUM 18 +#define C19NUM 19 #define C20NUM 20 #define C21NUM 21 +#define C22NUM 22 +#define C23NUM 23 #define C24NUM 24 #define C28NUM 28 #define C32NUM 32 @@ -536,6 +542,7 @@ enum PrimType { PrimType_Inner_GraphKernel = 10004, PrimType_Inner_SplitReduceConcatFusion = 10005, PrimType_Inner_EncoderLayer = 10006, + PrimType_Inner_DecoderLayer = 10007, PrimType_InnerOpMax, PrimType_InnerOpMin = PrimType_Inner_ToFormat }; @@ -654,7 +661,7 @@ typedef struct QuantMulArg { } QuantMulArg; typedef enum ReductionType { Reduction_Sum, Reduction_Mean, Reduction_None } ReductionType; -typedef enum ActType { ActType_No, ActType_Relu, ActType_Sigmod, ActType_Relu6, ActType_Prelu } ActType; +typedef enum ActType { ActType_No, ActType_Relu, ActType_Sigmod, ActType_Relu6, ActType_Prelu, ActType_Gelu } ActType; typedef enum PadMode { Pad_pad, Pad_same, Pad_valid } PadMode; typedef enum RoundingMode { Rounding_No, Rounding_Away_from_zero, Rounding_Up } RoundingMode; typedef enum CalFixedMultiplierMode { diff --git a/mindspore/core/ops/attention.cc b/mindspore/core/ops/attention.cc index e6f26cbb28a66945ad0036ca65b71d1bdbe2ca59..715bb985e285e20099527b848b58112b02087601 100644 --- a/mindspore/core/ops/attention.cc +++ b/mindspore/core/ops/attention.cc @@ -34,7 +34,7 @@ void Attention::set_cross(bool cross) { (void)this->AddAttr(kCross, 
api::MakeVal void Attention::set_position_bias(bool position_bias) { (void)this->AddAttr(kPositionBias, api::MakeValue(position_bias)); } - +void Attention::set_scale(float scale) { (void)this->AddAttr(kScale, api::MakeValue(scale)); } int64_t Attention::get_head_num() const { auto value_ptr = this->GetAttr(kAttentionNumHeads); return GetValue(value_ptr); @@ -54,12 +54,16 @@ bool Attention::get_position_bias() const { auto value_ptr = this->GetAttr(kPositionBias); return GetValue(value_ptr); } - -void Attention::Init(int64_t head_num, int64_t head_size, bool position_bias, bool cross) { +float Attention::get_scale() const { + auto value_ptr = this->GetAttr(kScale); + return GetValue(value_ptr); +} +void Attention::Init(int64_t head_num, int64_t head_size, bool position_bias, bool cross, float scale) { this->set_head_num(head_num); this->set_head_size(head_size); this->set_cross(cross); this->set_position_bias(position_bias); + this->set_scale(scale); } REGISTER_PRIMITIVE_C(kNameAttention, Attention); } // namespace mindspore::ops diff --git a/mindspore/core/ops/attention.h b/mindspore/core/ops/attention.h index 24b0a98f3f62edfd59b955bc4788042e8ab73d6f..838c04a381715ce30682c6c18bee637dceba7c4c 100644 --- a/mindspore/core/ops/attention.h +++ b/mindspore/core/ops/attention.h @@ -41,15 +41,17 @@ class MIND_API Attention : public BaseOperator { /// \param[in] head_size Define size per head. /// \param[in] cross Define is cross attention. Default false. /// \param[in] position_bias Define is position bias attention. - void Init(int64_t head_num, int64_t head_size, bool position_bias, bool cross = false); + void Init(int64_t head_num, int64_t head_size, bool position_bias, bool cross = false, float scale = 1.0f); void set_head_num(int64_t head_num); void set_head_size(int64_t head_size); void set_cross(bool cross); void set_position_bias(bool position_bias); + void set_scale(float scale); int64_t get_head_num() const; int64_t get_head_size() const; bool get_cross() const; bool get_position_bias() const; + float get_scale() const; }; } // namespace ops } // namespace mindspore diff --git a/mindspore/core/ops/decoder_layer.cc b/mindspore/core/ops/decoder_layer.cc new file mode 100644 index 0000000000000000000000000000000000000000..91d725c5b1522b24be5515fa767e3078087019a7 --- /dev/null +++ b/mindspore/core/ops/decoder_layer.cc @@ -0,0 +1,129 @@ + +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "ops/decoder_layer.h" +#include "ops/primitive_c.h" +#include "ops/op_utils.h" +#include "mindapi/src/helper.h" + +namespace mindspore::ops { +MIND_API_OPERATOR_IMPL(DecoderLayer, BaseOperator); + +void DecoderLayer::set_head_num(int64_t head_num) { + (void)this->AddAttr(kDecoderLayerNumHeads, api::MakeValue(head_num)); +} + +void DecoderLayer::set_head_size(int64_t head_size) { + (void)this->AddAttr(kDecoderLayerSizePerHead, api::MakeValue(head_size)); +} + +void DecoderLayer::set_post_layernorm(bool post_layernorm) { + (void)this->AddAttr(kDecoderLayerPostLayernorm, api::MakeValue(post_layernorm)); +} +void DecoderLayer::set_eps_layernorm1(float eps_layernorm1) { + (void)this->AddAttr(kDecoderLayerEpsLayerNorm1, api::MakeValue(eps_layernorm1)); +} +void DecoderLayer::set_eps_layernorm2(float eps_layernorm2) { + (void)this->AddAttr(kDecoderLayerEpsLayerNorm2, api::MakeValue(eps_layernorm2)); +} +void DecoderLayer::set_eps_layernorm3(float eps_layernorm3) { + (void)this->AddAttr(kDecoderLayerEpsLayerNorm3, api::MakeValue(eps_layernorm3)); +} +void DecoderLayer::set_ffn_hidden_size(int64_t ffn_hidden_size) { + (void)this->AddAttr(kDecoderLayerFfnHiddenSize, api::MakeValue(ffn_hidden_size)); +} +void DecoderLayer::set_position_bias1(bool position_bias1) { + (void)this->AddAttr(kDecoderLayerPositionBias1, api::MakeValue(position_bias1)); +} +void DecoderLayer::set_position_bias2(bool position_bias2) { + (void)this->AddAttr(kDecoderLayerPositionBias2, api::MakeValue(position_bias2)); +} +void DecoderLayer::set_scale1(float scale1) { (void)this->AddAttr(kDecoderLayerScale1, api::MakeValue(scale1)); } +void DecoderLayer::set_scale2(float scale2) { (void)this->AddAttr(kDecoderLayerScale2, api::MakeValue(scale2)); } +void DecoderLayer::set_act_type(ActType act_type) { (void)this->AddAttr(kActivationType, api::MakeValue(act_type)); } +int64_t DecoderLayer::get_head_num() const { + auto value_ptr = this->GetAttr(kDecoderLayerNumHeads); + return GetValue(value_ptr); +} + +int64_t DecoderLayer::get_head_size() const { + auto value_ptr = this->GetAttr(kDecoderLayerSizePerHead); + return GetValue(value_ptr); +} + +bool DecoderLayer::get_post_layernorm() const { + auto value_ptr = this->GetAttr(kDecoderLayerPostLayernorm); + return GetValue(value_ptr); +} +float DecoderLayer::get_eps_layernorm1() const { + auto value_ptr = this->GetAttr(kDecoderLayerEpsLayerNorm1); + return GetValue(value_ptr); +} +float DecoderLayer::get_eps_layernorm2() const { + auto value_ptr = this->GetAttr(kDecoderLayerEpsLayerNorm2); + return GetValue(value_ptr); +} +float DecoderLayer::get_eps_layernorm3() const { + auto value_ptr = this->GetAttr(kDecoderLayerEpsLayerNorm3); + return GetValue(value_ptr); +} +int64_t DecoderLayer::get_ffn_hidden_size() const { + auto value_ptr = this->GetAttr(kDecoderLayerFfnHiddenSize); + return GetValue(value_ptr); +} +bool DecoderLayer::get_position_bias1() const { + auto value_ptr = this->GetAttr(kDecoderLayerPositionBias1); + return GetValue(value_ptr); +} +bool DecoderLayer::get_position_bias2() const { + auto value_ptr = this->GetAttr(kDecoderLayerPositionBias2); + return GetValue(value_ptr); +} +float DecoderLayer::get_scale1() const { + auto value_ptr = this->GetAttr(kDecoderLayerScale1); + return GetValue(value_ptr); +} +float DecoderLayer::get_scale2() const { + auto value_ptr = this->GetAttr(kDecoderLayerScale2); + return GetValue(value_ptr); +} +ActType DecoderLayer::get_act_type() const { + auto value_ptr = GetAttr(kActivationType); + if (value_ptr == nullptr) { + 
return ActType::ActType_No; + } + return ActType(GetValue(value_ptr)); +} + +void DecoderLayer::Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, + float eps_layernorm3, int64_t ffn_hidden_size, bool position_bias1, bool position_bias2, + bool post_layernorm, float scale1, float scale2, ActType act_type) { + this->set_head_num(head_num); + this->set_head_size(head_size); + this->set_post_layernorm(post_layernorm); + this->set_eps_layernorm1(eps_layernorm1); + this->set_eps_layernorm2(eps_layernorm2); + this->set_eps_layernorm3(eps_layernorm3); + this->set_ffn_hidden_size(ffn_hidden_size); + this->set_position_bias1(position_bias1); + this->set_position_bias2(position_bias2); + this->set_act_type(act_type); + this->set_scale1(scale1); + this->set_scale2(scale2); +} +REGISTER_PRIMITIVE_C(kNameDecoderLayer, DecoderLayer); +} // namespace mindspore::ops diff --git a/mindspore/core/ops/decoder_layer.h b/mindspore/core/ops/decoder_layer.h new file mode 100644 index 0000000000000000000000000000000000000000..b196689eb2f37d575abd34409e2f7f070439a93e --- /dev/null +++ b/mindspore/core/ops/decoder_layer.h @@ -0,0 +1,103 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_CORE_OPS_DECODER_LAYER_H_ +#define MINDSPORE_CORE_OPS_DECODER_LAYER_H_ +#include +#include +#include +#include +#include "nnacl/op_base.h" + +#include "ops/base_operator.h" +#include "mindapi/base/types.h" + +namespace mindspore { +namespace ops { +constexpr auto kNameDecoderLayer = "DecoderLayer"; +/// \brief MultiHead-Attention op in MindIR. +class MIND_API DecoderLayer : public BaseOperator { + public: + MIND_API_BASE_MEMBER(DecoderLayer); + /// \brief Constructor. + DecoderLayer() : BaseOperator(kNameDecoderLayer) { + InitIOName({"input", + "gamma1", + "beta1", + "weight_qkv", + "bias_attn_qkv", + "input_mask", + "weight_attn_o", + "bias_attn_o", + "gamma2", + "beta2", + "encoder_output", + "weight_attn_q", + "weight_attn_kv", + "bias_attn_cross_qkv", + "cross_mask", + "weight_attn_cross_o", + "bias_attn_cross_o", + "gamma3", + "beta3", + "weight_m", + "bias_m", + "weight_p", + "bias_p"}, + {"output"}); + } + /// \brief Initialize DecoderLayer op. + /// \param[in] head_num Define head number. + /// \param[in] head_size Define size per head. + /// \param[in] eps_layernorm1 Define eps layernorm1. + /// \param[in] eps_layernorm2 Define eps layernorm2. + /// \param[in] eps_layernorm3 Define eps layernorm3. + /// \param[in] ffn_hidden_size Define ffn hidden size. + /// \param[in] position_bias1 Define position_bias1. + /// \param[in] position_bias2 Define position_bias2. + /// \param[in] scale1 Define scale1. + /// \param[in] scale2 Define scale2. + /// \param[in] act_type Define act_type. 
+ void Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, float eps_layernorm3, + int64_t ffn_hidden_size, bool position_bias1, bool position_bias2, bool post_layernorm, float scale1 = 1.0f, + float scale2 = 1.0f, ActType act_type = ActType::ActType_Gelu); + void set_head_num(int64_t head_num); + void set_head_size(int64_t head_size); + void set_post_layernorm(bool post_layernorm); + void set_eps_layernorm1(float eps_layernorm1); + void set_eps_layernorm2(float eps_layernorm2); + void set_eps_layernorm3(float eps_layernorm2); + void set_ffn_hidden_size(int64_t ffn_hidden_size); + void set_position_bias1(bool position_bias1); + void set_position_bias2(bool position_bias2); + void set_scale1(float scale1); + void set_scale2(float scale2); + void set_act_type(ActType act_type); + int64_t get_head_num() const; + int64_t get_head_size() const; + bool get_post_layernorm() const; + float get_eps_layernorm1() const; + float get_eps_layernorm2() const; + float get_eps_layernorm3() const; + int64_t get_ffn_hidden_size() const; + bool get_position_bias1() const; + bool get_position_bias2() const; + float get_scale1() const; + float get_scale2() const; + ActType get_act_type() const; +}; +} // namespace ops +} // namespace mindspore +#endif // MINDSPORE_CORE_OPS_DECODER_LAYER_H_ diff --git a/mindspore/core/ops/encoder_layer.cc b/mindspore/core/ops/encoder_layer.cc index 1a2a9f0aa189f9f6209409912683e0f2fa682be0..276d10de2238aa95c752c2c90c0b6c2a234be503 100644 --- a/mindspore/core/ops/encoder_layer.cc +++ b/mindspore/core/ops/encoder_layer.cc @@ -46,7 +46,8 @@ void EncoderLayer::set_ffn_hidden_size(int64_t ffn_hidden_size) { void EncoderLayer::set_position_bias(bool position_bias) { (void)this->AddAttr(kPositionBias, api::MakeValue(position_bias)); } - +void EncoderLayer::set_scale(float scale) { (void)this->AddAttr(kScale, api::MakeValue(scale)); } +void EncoderLayer::set_act_type(ActType act_type) { (void)this->AddAttr(kActivationType, api::MakeValue(act_type));} int64_t EncoderLayer::get_head_num() const { auto value_ptr = this->GetAttr(kEncoderLayerNumHeads); return GetValue(value_ptr); @@ -77,9 +78,20 @@ bool EncoderLayer::get_position_bias() const { auto value_ptr = this->GetAttr(kPositionBias); return GetValue(value_ptr); } - +float EncoderLayer::get_scale() const { + auto value_ptr = this->GetAttr(kScale); + return GetValue(value_ptr); +} +ActType EncoderLayer::get_act_type() const { + auto value_ptr = GetAttr(kActivationType); + if (value_ptr == nullptr) { + return ActType::ActType_No; + } + return ActType(GetValue(value_ptr)); +} void EncoderLayer::Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, - int64_t ffn_hidden_size, bool position_bias, bool post_layernorm = false) { + int64_t ffn_hidden_size, bool position_bias, bool post_layernorm, float scale, + ActType act_type) { this->set_head_num(head_num); this->set_head_size(head_size); this->set_post_layernorm(post_layernorm); @@ -87,6 +99,8 @@ void EncoderLayer::Init(int64_t head_num, int64_t head_size, float eps_layernorm this->set_eps_layernorm2(eps_layernorm2); this->set_ffn_hidden_size(ffn_hidden_size); this->set_position_bias(position_bias); + this->set_act_type(act_type); + this->set_scale(scale); } REGISTER_PRIMITIVE_C(kNameEncoderLayer, EncoderLayer); } // namespace mindspore::ops diff --git a/mindspore/core/ops/encoder_layer.h b/mindspore/core/ops/encoder_layer.h index f2f2a9286136efb0a99aae0a720baf985d23b2a6..b0466be467af79f7d7cbea4ac5916ca8dc24a9b8 100644 --- 
a/mindspore/core/ops/encoder_layer.h +++ b/mindspore/core/ops/encoder_layer.h @@ -22,6 +22,7 @@ #include "ops/base_operator.h" #include "mindapi/base/types.h" +#include "nnacl/op_base.h" namespace mindspore { namespace ops { @@ -42,9 +43,11 @@ class MIND_API EncoderLayer : public BaseOperator { /// \param[in] eps_layernorm1 Define eps layernorm1. /// \param[in] eps_layernorm2 Define eps layernorm2. /// \param[in] ffn_hidden_size Define ffn hidden size. - /// \param[in] position_bias Define ffn position_bias. + /// \param[in] position_bias Define position_bias. + /// \param[in] scale Define scale. + /// \param[in] act_type Define act_type. void Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, int64_t ffn_hidden_size, - bool position_bias, bool post_layernorm); + bool position_bias, bool post_layernorm, float scale = 1.0f, ActType act_type = ActType::ActType_Gelu); void set_head_num(int64_t head_num); void set_head_size(int64_t head_size); void set_post_layernorm(bool post_layernorm); @@ -52,6 +55,8 @@ class MIND_API EncoderLayer : public BaseOperator { void set_eps_layernorm2(float eps_layernorm2); void set_ffn_hidden_size(int64_t ffn_hidden_size); void set_position_bias(bool position_bias); + void set_scale(float scale); + void set_act_type(ActType act_type); int64_t get_head_num() const; int64_t get_head_size() const; bool get_post_layernorm() const; @@ -59,6 +64,8 @@ class MIND_API EncoderLayer : public BaseOperator { float get_eps_layernorm2() const; int64_t get_ffn_hidden_size() const; bool get_position_bias() const; + float get_scale() const; + ActType get_act_type() const; }; } // namespace ops } // namespace mindspore diff --git a/mindspore/core/ops/op_name.h b/mindspore/core/ops/op_name.h index 4fd47b9f0fb424716fa37415e6aa0a39e3f26a89..caf14dc257aa0ddb524c31041716754b1661f3bb 100644 --- a/mindspore/core/ops/op_name.h +++ b/mindspore/core/ops/op_name.h @@ -378,12 +378,24 @@ constexpr auto kSampleNum = "sample_num"; constexpr auto kRoiEndMode = "roi_end_mode"; constexpr auto kUpper = "upper"; constexpr auto kConjugate = "conjugate"; +constexpr auto kScalar = "scalar"; constexpr auto kEncoderLayerNumHeads = "head_num"; constexpr auto kEncoderLayerSizePerHead = "head_size"; constexpr auto kEncoderLayerPostLayernorm = "post_layernorm"; constexpr auto kEncoderLayerFfnHiddenSize = "ffn_hidden_size"; constexpr auto kEncoderLayerEpsLayerNorm1 = "eps_layernorm1"; constexpr auto kEncoderLayerEpsLayerNorm2 = "eps_layernorm2"; +constexpr auto kDecoderLayerNumHeads = "head_num"; +constexpr auto kDecoderLayerSizePerHead = "head_size"; +constexpr auto kDecoderLayerPostLayernorm = "post_layernorm"; +constexpr auto kDecoderLayerFfnHiddenSize = "ffn_hidden_size"; +constexpr auto kDecoderLayerEpsLayerNorm1 = "eps_layernorm1"; +constexpr auto kDecoderLayerEpsLayerNorm2 = "eps_layernorm2"; +constexpr auto kDecoderLayerEpsLayerNorm3 = "eps_layernorm3"; +constexpr auto kDecoderLayerPositionBias1 = "position_bias1"; +constexpr auto kDecoderLayerPositionBias2 = "position_bias2"; +constexpr auto kDecoderLayerScale1 = "scale1"; +constexpr auto kDecoderLayerScale2 = "scale2"; constexpr auto kPositionBias = "position_bias"; constexpr auto KExclusive = "exclusive"; constexpr auto KReverse = "reverse"; diff --git a/mindspore/lite/schema/ops.fbs b/mindspore/lite/schema/ops.fbs index e04470c778b3d2f5bb77fe3db5c4d877aebfe68f..3a24ac6fc442ab1abba02277a072bf38a5a62c10 100644 --- a/mindspore/lite/schema/ops.fbs +++ b/mindspore/lite/schema/ops.fbs @@ -395,6 +395,7 @@ table Attention 
{ head_num: long; head_size: long; cross: bool; + scale: float; } table Conv2DBackpropFilterFusion { diff --git a/mindspore/lite/src/common/ops/ops_def.cc b/mindspore/lite/src/common/ops/ops_def.cc index 7a8fa84392dbd8f0e9bb26a94f18aa7313a65ee5..2de89604156a294ed803c48418d00379668a776b 100644 --- a/mindspore/lite/src/common/ops/ops_def.cc +++ b/mindspore/lite/src/common/ops/ops_def.cc @@ -395,6 +395,7 @@ OP_SCHEMA_DEF(Attention) OP_ATTR(head_num, long) OP_ATTR(head_size, long); OP_ATTR(cross, bool) +OP_ATTR(scale, float) OP_SCHEMA_DEF_END(Attention) OP_SCHEMA_DEF(Conv2DBackpropFilterFusion) diff --git a/mindspore/lite/src/common/ops/ops_func_declare.h b/mindspore/lite/src/common/ops/ops_func_declare.h index b00ed666fadd54c3ee807f6e9b123723f2b57c04..3b151b36caa415d62711a6108b7f9403d7ac622e 100644 --- a/mindspore/lite/src/common/ops/ops_func_declare.h +++ b/mindspore/lite/src/common/ops/ops_func_declare.h @@ -260,6 +260,8 @@ #include "ops/format_transpose.h" #include "ops/gather_d.h" #include "ops/tensor_scatter_add.h" +#include "ops/decoder_layer.h" +#include "ops/encoder_layer.h" #include "ops/scatter_elements.h" namespace mindspore::lite::ops { diff --git a/mindspore/lite/src/common/ops/populate/custom_populate.cc b/mindspore/lite/src/common/ops/populate/custom_populate.cc index 4f855ce999ad0d00faac597187556aca99e92d95..e35357ee67319b9728fa95b4d4bff63f72bdf837 100644 --- a/mindspore/lite/src/common/ops/populate/custom_populate.cc +++ b/mindspore/lite/src/common/ops/populate/custom_populate.cc @@ -107,6 +107,15 @@ OpParameter *PopulateCustomParameter(const void *prim) { memset(param, 0, sizeof(OpParameter)); param->type_ = PrimType_Inner_EncoderLayer; return reinterpret_cast<OpParameter *>(param); + } else if (type == "DecoderLayer") { + auto *param = reinterpret_cast<OpParameter *>(malloc(sizeof(OpParameter))); + if (param == nullptr) { + MS_LOG(ERROR) << "malloc DecoderLayer failed."; + return nullptr; + } + memset(param, 0, sizeof(OpParameter)); + param->type_ = PrimType_Inner_DecoderLayer; + return reinterpret_cast<OpParameter *>(param); } else { MS_LOG(ERROR) << "Unsupported custom type: " << type; } diff --git a/mindspore/lite/src/common/prim_util.cc b/mindspore/lite/src/common/prim_util.cc index b7276233b763a1294b8051b96d1e3b1af9488483..c5be3ff571493dbcf24acb73519c37fd3f4868fc 100644 --- a/mindspore/lite/src/common/prim_util.cc +++ b/mindspore/lite/src/common/prim_util.cc @@ -28,9 +28,9 @@ static std::set<schema::PrimitiveType> kTensorListOps = { schema::PrimitiveType_TensorListReserve, schema::PrimitiveType_TensorListSetItem, schema::PrimitiveType_TensorListStack}; -static const char *const kInnerOpNames[6] = { +static const char *const kInnerOpNames[8] = { "Inner_ToFormat", "Inner_GltextureToOpencl", "Inner_Identity", - "Inner_ShapeFusion", "Inner_GraphKernel", "Inner_SplitReduceConcatFusion", + "Inner_ShapeFusion", "Inner_GraphKernel", "Inner_SplitReduceConcatFusion", "Inner_EncoderLayer", "Inner_DecoderLayer", }; int GetPrimitiveType(const void *primitive, int schema_version) { if (primitive == nullptr) { diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc new file mode 100755 index 0000000000000000000000000000000000000000..22f0e43a091785902996e1679beec86cc89701fd --- /dev/null +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc @@ -0,0 +1,262 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); +
* you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h" +#include +#include +#include +#include +#include +#include +#include +#include "NvInferRuntimeCommon.h" +#include "ops/decoder_layer.h" +#include "src/fastertransformer/kernels/unfused_attention_kernels.h" +#include "src/fastertransformer/kernels/activation_kernels.h" +#include "src/fastertransformer/utils/cuda_utils.h" +#include "src/fastertransformer/utils/allocator.h" +#include "src/fastertransformer/kernels/layernorm_kernels.h" + +namespace mindspore::lite { +namespace { +constexpr std::size_t kTwo = 2; +} // namespace + +int DecoderTensorRT::IsSupport(const BaseOperatorPtr &base_operator, const std::vector &in_tensors, + const std::vector &out_tensors) { + if (in_tensors.size() != C23NUM && in_tensors.size() != C16NUM) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return RET_ERROR; + } + return RET_OK; +} +nvinfer1::ITensor *DecoderTensorRT::castTensor(TensorRTContext *ctx, const TensorInfo &ms_tensor, + const std::string &op_name) { + if (ctx == nullptr || ctx->network() == nullptr) { + MS_LOG(ERROR) << "context or network is null for ConvertConstantTensor"; + return nullptr; + } + nvinfer1::Dims dims = ConvertCudaDims(ms_tensor.Shape()); + if (dims.nbDims == -1) { + MS_LOG(INFO) << ms_tensor.Name() << " ConvertCudaDims failed, convert as scalar."; + dims.nbDims = 1; + dims.d[0] = 1; + } + nvinfer1::DataType data_type = ConvertDataType(ms_tensor.DataType()); + if (!ms_tensor.IsConst()) { + MS_LOG(ERROR) << "ConvertConstantTensor from a MSTensor with nullptr data: " << ms_tensor.Name(); + return nullptr; + } + nvinfer1::Weights weights{data_type, ms_tensor.Data(), ms_tensor.ElementNum()}; + if (data_type == nvinfer1::DataType::kFLOAT && is_ffn_fp16_) { + void *data_float16 = malloc(ms_tensor.ElementNum() * sizeof(float)); + if (data_float16 == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return nullptr; + } + auto src = static_cast(ms_tensor.Data()); + auto dst = static_cast(data_float16); + for (int i = 0; i < ms_tensor.ElementNum(); i++) { + dst[i] = static_cast(src[i]); + } + weights.values = data_float16; + } + nvinfer1::IConstantLayer *constant_tensor = ctx->network()->addConstant(dims, weights); + if (constant_tensor == nullptr) { + MS_LOG(ERROR) << "create constant_tensor failed."; + return nullptr; + } + ctx->RegisterLayer(constant_tensor, ms_tensor.Name() + "_" + op_name); + auto tensor_ptr = constant_tensor->getOutput(0); + return tensor_ptr; +} +int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (ctx == nullptr || ctx->network() == nullptr) { + MS_LOG(ERROR) << "context or network is invalid"; + return RET_ERROR; + } + auto decoder_op = AsOps(); + if (decoder_op == nullptr) { + MS_LOG(ERROR) << "op action convert failed"; + return RET_ERROR; + } + fastertransformer::decoderParamRun params; + cublasHandle_t 
cublas_handle = GetCublasHandle(); + params.common_param.cublas_handle = cublas_handle; + params.common_param.head_num = decoder_op->get_head_num(); + params.common_param.head_size = decoder_op->get_head_size(); + params.common_param.hidden_size = params.common_param.head_num * params.common_param.head_size; + params.decoder.layernorm_post = decoder_op->get_post_layernorm(); + params.decoder.eps1 = decoder_op->get_eps_layernorm1(); + params.decoder.eps2 = decoder_op->get_eps_layernorm2(); + params.decoder.eps3 = decoder_op->get_eps_layernorm3(); + params.ffn_param.ffn_param.ffn_hidden_size = decoder_op->get_ffn_hidden_size(); + params.ffn_param.ffn_param.ffn_fp16 = is_ffn_fp16_; + params.ffn_param.ffn_param.act_type = (fastertransformer::ActType)(decoder_op->get_act_type()); + params.attn1.attn.position_bias = decoder_op->get_position_bias1(); + params.ffn_param.ffn_param.ffn_bias = !params.attn1.attn.position_bias; + params.attn1.attn.qkv_bias = !params.attn1.attn.position_bias; + params.attn1.attn.projection_bias = !params.attn1.attn.position_bias; + params.attn1.attn.is_cross = false; + params.attn1.attn.scale = decoder_op->get_scale1(); + params.attn1.attn.mask = true; + params.attn2.attn.position_bias = decoder_op->get_position_bias2(); + params.attn2.attn.qkv_bias = !params.attn2.attn.position_bias; + params.attn2.attn.projection_bias = !params.attn2.attn.position_bias; + params.attn2.attn.is_cross = true; + params.attn2.attn.scale = decoder_op->get_scale2(); + params.attn2.attn.mask = true; + params.decoder.has_beta = !params.attn1.attn.position_bias; + auto compute_type = runtime_->GetRuntimePrecisionMode(); + if (is_ffn_fp16_) { + size_t start_fp16 = (params.attn1.attn.position_bias) ? C13NUM : C18NUM; + size_t end_fp16 = (params.attn1.attn.position_bias) ? 
C16NUM : C22NUM; + for (size_t i = 0; i < in_tensors_.size(); i++) { + auto in_tensor = input(ctx, i); + if (in_tensors_[i].IsConst() || in_tensor.trt_tensor_ == nullptr) { + if (i > start_fp16 && i < end_fp16) { + in_tensor.trt_tensor_ = castTensor(ctx, in_tensors_[i], op_name_); + ctx->RegisterTensor(in_tensor, in_tensors_[i].Name()); + } else { + in_tensor.trt_tensor_ = lite::ConvertConstantTensor(ctx, in_tensors_[i], op_name_); + ctx->RegisterTensor(in_tensor, in_tensors_[i].Name()); + } + } + } + } + nvinfer1::ITensor *input_tensor = input(ctx, 0).trt_tensor_; + auto plugin = std::make_shared(input_tensor->getName(), compute_type, params, device_id_); + const int input_number = inputs().size(); + nvinfer1::ITensor *inputTensors[input_number]; + for (int i = 0; i < input_number; i++) { + inputTensors[i] = input(ctx, i).trt_tensor_; + } + nvinfer1::IPluginV2Layer *decoder_layer = ctx->network()->addPluginV2(inputTensors, input_number, *plugin); + if (decoder_layer == nullptr) { + MS_LOG(ERROR) << "add decoder op failed for TensorRT."; + return RET_ERROR; + } + decoder_layer->setName((op_name_ + "plugin_decoder_layer").c_str()); + nvinfer1::ITensor *decoder_tensor = decoder_layer->getOutput(0); + ctx->RegisterTensor(ITensorHelper{decoder_tensor, Format::NCHW, true}, out_tensors_[0].Name()); + this->layer_ = decoder_layer; + return RET_OK; +} + +REGISTER_TENSORRT_PLUGIN(DecoderPluginCreater); +template class TensorRTPluginCreater; +template +nvinfer1::PluginFieldCollection TensorRTPluginCreater::field_collection_{}; +template +std::vector TensorRTPluginCreater::fields_; + +int DecoderPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, + cudaStream_t stream) noexcept { + if (compute_type_ == RuntimePrecisionMode_FP16) { + return RunCudaDecoder(inputDesc, outputDesc, inputs, outputs, workspace, stream, + CUBLAS_GEMM_DEFAULT_TENSOR_OP); + } else { + return RunCudaDecoder(inputDesc, outputDesc, inputs, outputs, workspace, stream, + CUBLAS_GEMM_DEFAULT_TENSOR_OP); + } +} +template +int DecoderPlugin::RunCudaDecoder(const nvinfer1::PluginTensorDesc *inputDesc, + const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, + void *const *outputs, void *workspace, cudaStream_t stream, cublasGemmAlgo_t algoId) { + params_.common_param.algo = algoId; + params_.common_param.stream = stream; + void *inputs_forward[num_of_inputs_]; + for (int i = 0; i < num_of_inputs_; i++) { + inputs_forward[i] = const_cast(inputs[i]); + } + void *outputs_forward[] = {outputs[0]}; + fastertransformer::forwardDecoder(inputs_forward, num_of_inputs_, outputs_forward, num_of_outputs_, ¶ms_, + workspace); + return RET_OK; +} + +bool DecoderPlugin::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *tensorsDesc, int nbInputs, + int nbOutputs) noexcept { + auto type = (compute_type_ == RuntimePrecisionMode_FP16) ? 
nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT; + for (int i = 0; i < pos; i++) { + if (tensorsDesc[pos].type != tensorsDesc[i].type) return false; + } + bool res = (tensorsDesc[pos].format == nvinfer1::TensorFormat::kLINEAR) && (tensorsDesc[pos].type == type); + return res; +} + +void DecoderPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc *out, int nbOutputs) noexcept { + const int request_batch_size = static_cast(in[0].desc.dims.d[0]); + const int request_src_seq_len = static_cast(in[0].desc.dims.d[1]); + const int request_tgt_seq_len = request_src_seq_len; + params_.common_param.batch_size = request_batch_size; + params_.common_param.src_seq_len = request_src_seq_len; + params_.common_param.tgt_seq_len = request_tgt_seq_len; + num_of_inputs_ = nbInputs; + num_of_outputs_ = nbOutputs; +} +size_t DecoderPlugin::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, + const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const noexcept { + if (compute_type_ == RuntimePrecisionMode_FP16) { + return fastertransformer::GetDecoderLayerWorkspaceSize(¶ms_); + } else { + return fastertransformer::GetDecoderLayerWorkspaceSize(¶ms_); + } +} + +nvinfer1::DimsExprs DecoderPlugin::getOutputDimensions(int32_t index, const nvinfer1::DimsExprs *inputs, + int nbInputDims, nvinfer1::IExprBuilder &exprBuilder) noexcept { + nvinfer1::DimsExprs dims; + if (index == 0) { + int num_dims = inputs[0].nbDims; + dims.nbDims = num_dims; + for (int i = 0; i < num_dims; i++) { + dims.d[i] = exprBuilder.constant(inputs[index].d[i]->getConstantValue()); + } + } + return dims; +} + +nvinfer1::IPluginV2DynamicExt *DecoderPlugin::clone() const noexcept { + auto *plugin = new DecoderPlugin(*this); + if (plugin == nullptr) { + MS_LOG(ERROR) << "plugin is null"; + return nullptr; + } + plugin->setPluginNamespace(name_space_.c_str()); + plugin->params_.attn1.common_param = &plugin->params_.common_param; + plugin->params_.attn2.common_param = &plugin->params_.common_param; + plugin->params_.ffn_param.common_param = &plugin->params_.common_param; + return plugin; +} + +size_t DecoderPlugin::getSerializationSize() const noexcept { + return sizeof(int) + sizeof(fastertransformer::decoderParamRun); +} + +void DecoderPlugin::serialize(void *buffer) const noexcept { + SerializeValue(&buffer, &compute_type_, sizeof(int)); + SerializeValue(&buffer, ¶ms_, sizeof(fastertransformer::decoderParamRun)); +} +REGISTER_TENSORRT_CREATOR(ops::kNameDecoderLayer, DecoderTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h new file mode 100644 index 0000000000000000000000000000000000000000..d9d5f458383c35a1033c70c07431140438f62b9a --- /dev/null +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h @@ -0,0 +1,110 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_EXTENDRT_DELEGATE_TENSORRT_OP_DECODER_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_EXTENDRT_DELEGATE_TENSORRT_OP_DECODER_TENSORRT_H_ + +#include +#include +#include "src/extendrt/delegate/tensorrt/op/tensorrt_op.h" +#include "src/extendrt/delegate/tensorrt/op/tensorrt_plugin.h" +#include "src/extendrt/delegate/tensorrt/cuda_impl/cudnn_utils.h" +#include "src/fastertransformer/layers/ms_layers/decoder.h" + +namespace mindspore::lite { +class DecoderTensorRT : public TensorRTOp { + public: + DecoderTensorRT(const BaseOperatorPtr &base_operator, const std::vector &in_tensors, + const std::vector &out_tensors, std::string name) + : TensorRTOp(base_operator, in_tensors, out_tensors, name) {} + + ~DecoderTensorRT() override = default; + bool IsWeightInputHanledInner() const override { return is_ffn_fp16_; } + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const BaseOperatorPtr &base_operator, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + nvinfer1::ITensor *castTensor(TensorRTContext *ctx, const TensorInfo &ms_tensor, const std::string &op_name); + bool is_ffn_fp16_ = false; +}; + +constexpr auto DECODER_PLUGIN_NAME{"DecoderPlugin"}; +class DecoderPlugin : public TensorRTPlugin { + public: + DecoderPlugin(const std::string name, int compute_type, fastertransformer::decoderParamRun params, uint32_t device_id) + : TensorRTPlugin(name, std::string(DECODER_PLUGIN_NAME), device_id), compute_type_(compute_type) { + params_ = params; + params_.attn1.common_param = ¶ms_.common_param; + params_.attn2.common_param = ¶ms_.common_param; + params_.ffn_param.common_param = ¶ms_.common_param; + } + + DecoderPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) + : TensorRTPlugin(std::string(name), std::string(DECODER_PLUGIN_NAME)) { + const nvinfer1::PluginField *fields = fc->fields; + compute_type_ = static_cast(fields[0].data)[0]; + params_ = static_cast(fields[1].data)[0]; + params_.attn1.common_param = ¶ms_.common_param; + params_.attn2.common_param = ¶ms_.common_param; + params_.ffn_param.common_param = ¶ms_.common_param; + } + + DecoderPlugin(const char *name, const void *serialData, size_t serialLength) + : TensorRTPlugin(std::string(name), std::string(DECODER_PLUGIN_NAME)) { + DeserializeValue(&serialData, &serialLength, &compute_type_, sizeof(int)); + DeserializeValue(&serialData, &serialLength, ¶ms_, sizeof(fastertransformer::decoderParamRun)); + params_.attn1.common_param = ¶ms_.common_param; + params_.attn2.common_param = ¶ms_.common_param; + params_.ffn_param.common_param = ¶ms_.common_param; + } + + DecoderPlugin() = delete; + + ~DecoderPlugin() override {} + + nvinfer1::IPluginV2DynamicExt *clone() const noexcept override; + int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override; + size_t getSerializationSize() const noexcept override; + void serialize(void *buffer) const noexcept override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, + const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const noexcept override; + nvinfer1::DimsExprs getOutputDimensions(int index, const nvinfer1::DimsExprs *inputs, int nbInputDims, + nvinfer1::IExprBuilder &exprBuilder) noexcept override; + void 
configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc *out, int nbOutputs) noexcept override; + bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *tensorsDesc, int nbInputs, + int nbOutputs) noexcept override; + + private: + std::string name_space_; + int compute_type_; + mutable fastertransformer::decoderParamRun params_; + int num_of_inputs_; + int num_of_outputs_; + template + int RunCudaDecoder(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream, + cublasGemmAlgo_t algoId); +}; +class DecoderPluginCreater : public TensorRTPluginCreater { + public: + DecoderPluginCreater() : TensorRTPluginCreater(std::string(DECODER_PLUGIN_NAME)) {} +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_EXTENDRT_DELEGATE_TENSORRT_OP_DECODER_TENSORRT_H_ \ No newline at end of file diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc old mode 100644 new mode 100755 index 398ccea3fb9ed0f121babb1b6f43b134bd1972f2..7eb3653cceb163a858b4f673208f523257b6f1aa --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc @@ -30,22 +30,26 @@ #include "src/fastertransformer/utils/cuda_utils.h" #include "src/fastertransformer/utils/allocator.h" #include "src/fastertransformer/kernels/layernorm_kernels.h" +#include "src/extendrt/delegate/tensorrt/op/tensorrt_op.h" namespace mindspore::lite { namespace { constexpr std::size_t kTwo = 2; -constexpr std::size_t kThree = 3; } // namespace -// Multi Head Attention TensorRT op int EncoderTensorRT::IsSupport(const BaseOperatorPtr &base_operator, const std::vector &in_tensors, const std::vector &out_tensors) { - if (in_tensors.size() != C14NUM) { + if (in_tensors.size() != C14NUM && in_tensors.size() != C9NUM && in_tensors.size() != C13NUM) { MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); return RET_ERROR; } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return RET_ERROR; + } return RET_OK; } + nvinfer1::ITensor *EncoderTensorRT::castTensor(TensorRTContext *ctx, const TensorInfo &ms_tensor, const std::string &op_name) { if (ctx == nullptr || ctx->network() == nullptr) { @@ -97,25 +101,38 @@ int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { MS_LOG(ERROR) << "op action convert failed"; return RET_ERROR; } - fastertransformer::encoderParamT params; - memset_s(¶ms, sizeof(params), 0, sizeof(params)); - params.head_num = encoder_op->get_head_num(); - params.head_size = encoder_op->get_head_size(); - params.layernorm_post = encoder_op->get_post_layernorm(); - params.eps1 = encoder_op->get_eps_layernorm1(); - params.eps2 = encoder_op->get_eps_layernorm2(); - params.ffn_hidden_size = encoder_op->get_ffn_hidden_size(); - params.is_cross = false; - params.ffn_fp16 = is_ffn_fp16_; - params.position_bias = encoder_op->get_position_bias(); - params.cublas_handle = GetCublasHandle(); - params.qkv_bias = !params.position_bias; - params.projection_bias = !params.position_bias; - params.hidden_size = params.head_num * params.head_size; + cublasHandle_t cublas_handle = GetCublasHandle(); + fastertransformer::encoderParamRun params; + //update commonparam + 
params.common_param.cublas_handle =cublas_handle; + params.common_param.head_num = encoder_op->get_head_num(); + params.common_param.head_size = encoder_op->get_head_size(); + params.common_param.hidden_size = params.common_param.head_num * params.common_param.head_size; + //connect commonparam to attention and ffn + + //update encoder_param_ + params.encoder.layernorm_post = encoder_op->get_post_layernorm(); + params.encoder.eps1 = encoder_op->get_eps_layernorm1(); + params.encoder.eps2 = encoder_op->get_eps_layernorm2(); + params.ffn_param.ffn_param.ffn_hidden_size = encoder_op->get_ffn_hidden_size(); + params.ffn_param.ffn_param.ffn_fp16 = is_ffn_fp16_; + params.attn.attn.is_cross = false; + params.attn.attn.position_bias = encoder_op->get_position_bias(); + params.attn.attn.projection_bias = !params.attn.attn.position_bias; + params.attn.attn.qkv_bias = !params.attn.attn.position_bias; + params.encoder.has_beta = !params.attn.attn.position_bias; + params.ffn_param.ffn_param.ffn_bias = !params.attn.attn.position_bias; + params.attn.attn.mask = true; + params.ffn_param.ffn_param.act_type = (fastertransformer::ActType)(encoder_op->get_act_type()); + params.attn.attn.scale = encoder_op->get_scale(); auto compute_type = runtime_->GetRuntimePrecisionMode(); if (is_ffn_fp16_) { - size_t start_fp16 = (params.layernorm_post) ? C7NUM : C9NUM; - size_t end_fp16 = (params.layernorm_post) ? C11NUM : C13NUM; + size_t start_fp16 = (params.encoder.layernorm_post) ? C7NUM : C9NUM; + size_t end_fp16 = (params.encoder.layernorm_post) ? C11NUM : C13NUM; + if (params.attn.attn.position_bias) { + start_fp16 = C6NUM; + end_fp16 = C9NUM; + } for (size_t i = 0; i < in_tensors_.size(); i++) { auto in_tensor = input(ctx, i); if (in_tensors_[i].IsConst() || in_tensor.trt_tensor_ == nullptr) { @@ -131,7 +148,7 @@ int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { } nvinfer1::ITensor *input_tensor = input(ctx, 0).trt_tensor_; auto plugin = - std::make_shared(input_tensor->getName(), compute_type, params, GetCublasLtHandle(), device_id_); + std::make_shared(input_tensor->getName(), compute_type, params, device_id_); const int input_number = inputs().size(); nvinfer1::ITensor *inputTensors[input_number]; for (int i = 0; i < input_number; i++) { @@ -172,14 +189,12 @@ template int EncoderPlugin::RunCudaEncoder(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream, cublasGemmAlgo_t algoId) { - params_.stream = stream; - params_.algo = algoId; - void *inputs_forward[] = { - const_cast(inputs[0]), const_cast(inputs[1]), const_cast(inputs[2]), - const_cast(inputs[3]), const_cast(inputs[4]), const_cast(inputs[5]), - const_cast(inputs[6]), const_cast(inputs[7]), const_cast(inputs[8]), - const_cast(inputs[9]), const_cast(inputs[10]), const_cast(inputs[11]), - const_cast(inputs[12]), const_cast(inputs[13])}; + params_.common_param.algo = algoId; + params_.common_param.stream = stream; + void *inputs_forward[num_of_inputs_]; + for (int i = 0; i < num_of_inputs_; i++) { + inputs_forward[i] = const_cast(inputs[i]); + } void *outputs_forward[] = {outputs[0]}; fastertransformer::forwardEncoder(inputs_forward, num_of_inputs_, outputs_forward, num_of_outputs_, ¶ms_, workspace); @@ -195,14 +210,15 @@ bool EncoderPlugin::supportsFormatCombination(int pos, const nvinfer1::PluginTen bool res = (tensorsDesc[pos].format == nvinfer1::TensorFormat::kLINEAR) && (tensorsDesc[pos].type == type); return res; } + void 
EncoderPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc *out, int nbOutputs) noexcept { const int request_batch_size = static_cast(in[0].desc.dims.d[0]); const int request_src_seq_len = static_cast(in[0].desc.dims.d[1]); const int request_tgt_seq_len = request_src_seq_len; - params_.batch_size = request_batch_size; - params_.src_seq_len = request_src_seq_len; - params_.tgt_seq_len = request_tgt_seq_len; + params_.common_param.batch_size = request_batch_size; + params_.common_param.src_seq_len = request_src_seq_len; + params_.common_param.tgt_seq_len = request_tgt_seq_len; num_of_inputs_ = nbInputs; num_of_outputs_ = nbOutputs; } @@ -221,13 +237,8 @@ nvinfer1::DimsExprs EncoderPlugin::getOutputDimensions(int32_t index, const nvin if (index == 0) { int num_dims = inputs[0].nbDims; dims.nbDims = num_dims; - if (num_dims == INPUT_SIZE2) { - dims.d[0] = exprBuilder.constant(inputs[0].d[0]->getConstantValue()); - dims.d[1] = exprBuilder.constant(inputs[0].d[1]->getConstantValue()); - } else if (num_dims == INPUT_SIZE3) { - dims.d[0] = exprBuilder.constant(inputs[0].d[0]->getConstantValue()); - dims.d[1] = exprBuilder.constant(inputs[0].d[1]->getConstantValue()); - dims.d[kTwo] = exprBuilder.constant(inputs[0].d[kTwo]->getConstantValue()); + for(int i = 0; i < num_dims; i++ ) { + dims.d[i] = exprBuilder.constant(inputs[index].d[i]->getConstantValue()); } } return dims; @@ -240,16 +251,18 @@ nvinfer1::IPluginV2DynamicExt *EncoderPlugin::clone() const noexcept { return nullptr; } plugin->setPluginNamespace(name_space_.c_str()); + plugin->params_.attn.common_param = &plugin->params_.common_param; + plugin->params_.ffn_param.common_param = &plugin->params_.common_param; return plugin; } size_t EncoderPlugin::getSerializationSize() const noexcept { - return sizeof(int) + sizeof(fastertransformer::encoderParamT); + return sizeof(int) + sizeof(fastertransformer::encoderParamRun); } void EncoderPlugin::serialize(void *buffer) const noexcept { SerializeValue(&buffer, &compute_type_, sizeof(int)); - SerializeValue(&buffer, ¶ms_, sizeof(fastertransformer::encoderParamT)); + SerializeValue(&buffer, ¶ms_, sizeof(fastertransformer::encoderParamRun)); } REGISTER_TENSORRT_CREATOR(ops::kNameEncoderLayer, EncoderTensorRT) } // namespace mindspore::lite diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h index 45da8ab88a5837eb5f3aef04b32f84c50310a852..ae6133c0ef40373b1cd58769f61175fe3b300da8 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h @@ -22,7 +22,8 @@ #include "src/extendrt/delegate/tensorrt/op/tensorrt_op.h" #include "src/extendrt/delegate/tensorrt/op/tensorrt_plugin.h" #include "src/extendrt/delegate/tensorrt/cuda_impl/cudnn_utils.h" -#include "src/fastertransformer/layers/encoder_layers/encoder.h" +#include "src/fastertransformer/layers/ms_layers/encoder.h" + namespace mindspore::lite { class EncoderTensorRT : public TensorRTOp { public: @@ -45,27 +46,29 @@ class EncoderTensorRT : public TensorRTOp { constexpr auto ENCODER_PLUGIN_NAME{"EncoderPlugin"}; class EncoderPlugin : public TensorRTPlugin { public: - EncoderPlugin(const std::string name, int compute_type, fastertransformer::encoderParamT params, - cublasLtHandle_t cublaslt_handle, uint32_t device_id) - : TensorRTPlugin(name, std::string(ENCODER_PLUGIN_NAME), device_id), - 
compute_type_(compute_type), - params_(params), - cublaslt_handle_(cublaslt_handle) {} + EncoderPlugin(const std::string name, int compute_type, fastertransformer::encoderParamRun params, uint32_t device_id) + : TensorRTPlugin(name, std::string(ENCODER_PLUGIN_NAME), device_id), compute_type_(compute_type) { + params_ = params; + params_.attn.common_param = ¶ms_.common_param; + params_.ffn_param.common_param = ¶ms_.common_param; + } EncoderPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) : TensorRTPlugin(std::string(name), std::string(ENCODER_PLUGIN_NAME)) { const nvinfer1::PluginField *fields = fc->fields; compute_type_ = static_cast(fields[0].data)[0]; - params_ = static_cast(fields[1].data)[0]; - cublaslt_handle_ = static_cast(fields[2].data)[0]; + params_ = static_cast(fields[1].data)[0]; + params_.attn.common_param = ¶ms_.common_param; + params_.ffn_param.common_param = ¶ms_.common_param; } EncoderPlugin(const char *name, const void *serialData, size_t serialLength) : TensorRTPlugin(std::string(name), std::string(ENCODER_PLUGIN_NAME)) { DeserializeValue(&serialData, &serialLength, &compute_type_, sizeof(int)); - DeserializeValue(&serialData, &serialLength, ¶ms_, sizeof(fastertransformer::encoderParamT)); + DeserializeValue(&serialData, &serialLength, ¶ms_, sizeof(fastertransformer::encoderParamRun)); + params_.attn.common_param = ¶ms_.common_param; + params_.ffn_param.common_param = ¶ms_.common_param; } - EncoderPlugin() = delete; ~EncoderPlugin() override {} @@ -85,11 +88,9 @@ class EncoderPlugin : public TensorRTPlugin { int nbOutputs) noexcept override; private: - const std::string layer_name_; std::string name_space_; int compute_type_; - mutable fastertransformer::encoderParamT params_; - cublasLtHandle_t cublaslt_handle_; + mutable fastertransformer::encoderParamRun params_; int num_of_inputs_; int num_of_outputs_; diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc index 859bf9dc426c5a1a59cb347c79c75b3522727754..75cb121d986924b271f08975724ddd28f7b3cf85 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc @@ -71,18 +71,22 @@ int MhaTensorRT::AddInnerOp(TensorRTContext *ctx) { bool is_cross = mha_op->get_cross(); bool is_position_bias = mha_op->get_position_bias(); nvinfer1::ITensor *input_tensor = input(ctx, 0).trt_tensor_; - fastertransformer::encoderParamT params; + fastertransformer::attentionParamRun params; + fastertransformer::CommonParam common_param; + memset_s(&common_param, sizeof(common_param), 0, sizeof(common_param)); memset_s(¶ms, sizeof(params), 0, sizeof(params)); - params.head_num = head_number; - params.head_size = head_size; - params.hidden_size = head_number * head_size; - params.cublas_handle = GetCublasHandle(); - params.qkv_bias = !is_position_bias; - params.projection_bias = !is_position_bias; - params.is_cross = is_cross; - params.position_bias = is_position_bias; - auto plugin = - std::make_shared(input_tensor->getName(), compute_type, params, GetCublasLtHandle(), device_id_); + cublasHandle_t cublas_handle = GetCublasHandle(); + common_param.cublas_handle = cublas_handle; + common_param.head_num = head_number; + common_param.head_size = head_size; + common_param.hidden_size = head_number * head_size; + params.attn.qkv_bias = !is_position_bias; + params.attn.projection_bias = !is_position_bias; + params.attn.is_cross = is_cross; + 
params.attn.position_bias = is_position_bias; + params.attn.scale = mha_op->get_scale(); + params.attn.mask = true; + auto plugin = std::make_shared(input_tensor->getName(), compute_type, params, common_param, device_id_); const int input_number = inputs().size(); nvinfer1::ITensor *inputTensors[input_number]; for (int i = 0; i < input_number; i++) { @@ -95,39 +99,8 @@ int MhaTensorRT::AddInnerOp(TensorRTContext *ctx) { } mha_layer->setName((op_name_ + "plugin_attention").c_str()); nvinfer1::ITensor *attn_tensor = mha_layer->getOutput(0); -#ifndef TEST_ ctx->RegisterTensor(ITensorHelper{attn_tensor, Format::NCHW, true}, out_tensors_[0].Name()); -#else /* TEST_ */ - ctx->RegisterTensor(ITensorHelper{attn_tensor, Format::NCHW, true}, out_tensors_[0].Name() + "attn"); -#endif /* TEST_ */ this->layer_ = mha_layer; -#ifdef TEST_ - auto weight_projection = input(ctx, 4).trt_tensor_; - auto bias_projection = input(ctx, 6).trt_tensor_; -#endif /* TEST_ */ - -#ifdef TEST_ - auto matmul_layer = ctx->network()->addMatrixMultiply(*attn_tensor, nvinfer1::MatrixOperation::kNONE, - *weight_projection, nvinfer1::MatrixOperation::kNONE); - if (matmul_layer == nullptr) { - MS_LOG(ERROR) << "failed to add matmul layer"; - return RET_ERROR; - } - matmul_layer->setName((op_name_ + "_matmul").c_str()); - auto matmul_tensor = matmul_layer->getOutput(0); - auto shuffle_layer = ctx->network()->addShuffle(*bias_projection); - const auto size = bias_projection->getDimensions().d[0]; - shuffle_layer->setReshapeDimensions(nvinfer1::Dims{2, {1, size}}); - auto shuffle_tensor = shuffle_layer->getOutput(0); - auto addbias = ctx->network()->addElementWise(*matmul_tensor, *shuffle_tensor, nvinfer1::ElementWiseOperation::kSUM); - if (addbias == nullptr) { - MS_LOG(ERROR) << "failed to add bias layer"; - return RET_ERROR; - } - addbias->setName((op_name_ + "_bias").c_str()); - auto bias_out = addbias->getOutput(0); - ctx->RegisterTensor(ITensorHelper{bias_out, Format::NCHW, true}, out_tensors_[0].Name()); -#endif /* TEST_ */ return RET_OK; } @@ -152,36 +125,36 @@ template int MhaPlugin::RunCudaMha(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream, cublasGemmAlgo_t algoId) { - int cross_tensor_offset = (params_.is_cross) ? 1 : 0; + int cross_tensor_offset = (params_.attn.is_cross) ? 
1 : 0; const int weight_projection_tensor_idx = 4 + cross_tensor_offset; const int bias_projection_tensor_idx = 6 + cross_tensor_offset; const int attn_mask_tensor_idx = 7 + cross_tensor_offset; const int bias_qkv_tensor_idx = 5 + cross_tensor_offset; const int weight_qkv_tensor_idx = 3; const int position_bias_tensor_idx = 6 + cross_tensor_offset; - params_.stream = stream; - params_.algo = algoId; + common_param_.algo = algoId; + common_param_.stream = stream; void *inputs_attn[num_of_inputs_]; int index = 0; inputs_attn[index++] = const_cast(inputs[0]); - if (params_.is_cross) { + if (params_.attn.is_cross) { inputs_attn[index++] = const_cast(inputs[1]); inputs_attn[index++] = const_cast(inputs[weight_qkv_tensor_idx]); inputs_attn[index++] = const_cast(inputs[weight_qkv_tensor_idx + 1]); } else { inputs_attn[index++] = const_cast(inputs[weight_qkv_tensor_idx]); } - if (params_.qkv_bias) { + if (params_.attn.qkv_bias) { inputs_attn[index++] = const_cast(inputs[bias_qkv_tensor_idx]); } - if (params_.position_bias) { + if (params_.attn.position_bias) { inputs_attn[index++] = const_cast(inputs[position_bias_tensor_idx]); inputs_attn[index++] = const_cast(inputs[attn_mask_tensor_idx - C2NUM]); } else { inputs_attn[index++] = const_cast(inputs[attn_mask_tensor_idx]); } inputs_attn[index++] = const_cast(inputs[weight_projection_tensor_idx]); - if (params_.projection_bias) { + if (params_.attn.projection_bias) { inputs_attn[index++] = const_cast(inputs[bias_projection_tensor_idx]); } void *outputs_attn[] = {outputs[0]}; @@ -204,15 +177,15 @@ void MhaPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int const nvinfer1::DynamicPluginTensorDesc *out, int nbOutputs) noexcept { int cross_tensor_offset = 0; int position_bias_tensor_offsets = 0; - if (params_.is_cross) cross_tensor_offset = 1; - if (params_.position_bias) position_bias_tensor_offsets = 1; + if (params_.attn.is_cross) cross_tensor_offset = 1; + if (params_.attn.position_bias) position_bias_tensor_offsets = 1; const int attn_mask_tensor_idx = 7 + cross_tensor_offset - position_bias_tensor_offsets; const int request_batch_size = static_cast(in[attn_mask_tensor_idx].desc.dims.d[0]); const int request_src_seq_len = static_cast(in[attn_mask_tensor_idx].desc.dims.d[1]); const int request_tgt_seq_len = static_cast(in[attn_mask_tensor_idx].desc.dims.d[2]); - params_.batch_size = request_batch_size; - params_.src_seq_len = request_src_seq_len; - params_.tgt_seq_len = request_tgt_seq_len; + common_param_.batch_size = request_batch_size; + common_param_.src_seq_len = request_src_seq_len; + common_param_.tgt_seq_len = request_tgt_seq_len; num_of_inputs_ = nbInputs; num_of_outputs_ = nbOutputs; } @@ -230,34 +203,26 @@ nvinfer1::DimsExprs MhaPlugin::getOutputDimensions(int32_t index, const nvinfer1 nvinfer1::IExprBuilder &exprBuilder) noexcept { nvinfer1::DimsExprs dims; if (index == 0) { -#ifndef TEST_ int num_dims = inputs[0].nbDims; dims.nbDims = num_dims; if (num_dims == INPUT_SIZE2) { dims.d[0] = exprBuilder.constant(inputs[nbInputDims - 1].d[0]->getConstantValue() * inputs[nbInputDims - 1].d[1]->getConstantValue()); - auto hidden_size = exprBuilder.constant(params_.head_size * params_.head_num); + auto hidden_size = exprBuilder.constant(common_param_.head_size * common_param_.head_num); dims.d[1] = hidden_size; } else if (num_dims == INPUT_SIZE3) { dims.d[0] = inputs[nbInputDims - 1].d[0]; // batch dims.d[1] = inputs[nbInputDims - 1].d[(inputs[nbInputDims - 1].nbDims) - 1]; - auto hidden_size = 
exprBuilder.constant(params_.head_size * params_.head_num); + auto hidden_size = exprBuilder.constant(common_param_.head_size * common_param_.head_num); dims.d[kTwo] = hidden_size; } } else { dims.nbDims = INPUT_SIZE4; dims.d[0] = inputs[nbInputDims - 1].d[0]; // batch - dims.d[1] = exprBuilder.constant(params_.head_num); + dims.d[1] = exprBuilder.constant(common_param_.head_num); dims.d[kTwo] = inputs[nbInputDims - 1].d[(inputs[nbInputDims - 1].nbDims) - 1]; - dims.d[kThree] = exprBuilder.constant(params_.head_size); - } -#else - dims.nbDims = C2NUM; - dims.d[0] = inputs[nbInputDims - 1].d[(inputs[nbInputDims - 1].nbDims) - 1]; - auto hidden_size = exprBuilder.constant(head_size_ * head_number_); - dims.d[1] = hidden_size; + dims.d[kThree] = exprBuilder.constant(common_param_.head_size); } -#endif return dims; } @@ -268,6 +233,7 @@ nvinfer1::IPluginV2DynamicExt *MhaPlugin::clone() const noexcept { return nullptr; } plugin->setPluginNamespace(name_space_.c_str()); + plugin->params_.common_param = &plugin->common_param_; return plugin; } @@ -276,12 +242,13 @@ int MhaPlugin::initialize() noexcept { return 0; } void MhaPlugin::terminate() noexcept {} size_t MhaPlugin::getSerializationSize() const noexcept { - return sizeof(int) + sizeof(fastertransformer::encoderParamT); + return sizeof(int) + sizeof(fastertransformer::attentionParamRun) + sizeof(fastertransformer::CommonParam); } void MhaPlugin::serialize(void *buffer) const noexcept { SerializeValue(&buffer, &compute_type_, sizeof(int)); - SerializeValue(&buffer, ¶ms_, sizeof(fastertransformer::encoderParamT)); + SerializeValue(&buffer, ¶ms_, sizeof(fastertransformer::attentionParamRun)); + SerializeValue(&buffer, &common_param_, sizeof(fastertransformer::CommonParam)); } REGISTER_TENSORRT_CREATOR(ops::kNameAttention, MhaTensorRT) } // namespace mindspore::lite diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.h index d755690927acbbc3fd8f86783e321148732df181..3370d2bc18850115e5c2b3c85d8d056a40d8f978 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.h @@ -22,7 +22,8 @@ #include "src/extendrt/delegate/tensorrt/op/tensorrt_op.h" #include "src/extendrt/delegate/tensorrt/op/tensorrt_plugin.h" #include "src/extendrt/delegate/tensorrt/cuda_impl/cudnn_utils.h" -#include "src/fastertransformer/layers/encoder_layers/encoder.h" +#include "src/fastertransformer/layers/ms_layers/attention.h" +#include "src/fastertransformer/layers/ms_layers/param.h" namespace mindspore::lite { class MhaTensorRT : public TensorRTOp { @@ -43,24 +44,28 @@ class MhaTensorRT : public TensorRTOp { constexpr auto MHA_PLUGIN_NAME{"AttentionPlugin"}; class MhaPlugin : public TensorRTPlugin { public: - MhaPlugin(const std::string name, int compute_type, fastertransformer::encoderParamT params, - cublasLtHandle_t cublaslt_handle, uint32_t device_id) - : TensorRTPlugin(name, std::string(MHA_PLUGIN_NAME), device_id), - compute_type_(compute_type), - params_(params), - cublaslt_handle_(cublaslt_handle) {} + MhaPlugin(const std::string name, int compute_type, fastertransformer::attentionParamRun params, + fastertransformer::CommonParam common_param, uint32_t device_id) + : TensorRTPlugin(name, std::string(MHA_PLUGIN_NAME), device_id), compute_type_(compute_type) { + params_ = params; + common_param_ = common_param; + } MhaPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) : 
TensorRTPlugin(std::string(name), std::string(MHA_PLUGIN_NAME)) { const nvinfer1::PluginField *fields = fc->fields; compute_type_ = static_cast(fields[0].data)[0]; - params_ = static_cast(fields[1].data)[0]; + params_ = static_cast(fields[1].data)[0]; + common_param_ = static_cast(fields[2].data)[0]; + params_.common_param = &common_param_; } MhaPlugin(const char *name, const void *serialData, size_t serialLength) : TensorRTPlugin(std::string(name), std::string(MHA_PLUGIN_NAME)) { DeserializeValue(&serialData, &serialLength, &compute_type_, sizeof(int)); - DeserializeValue(&serialData, &serialLength, ¶ms_, sizeof(fastertransformer::encoderParamT)); + DeserializeValue(&serialData, &serialLength, ¶ms_, sizeof(fastertransformer::attentionParamRun)); + DeserializeValue(&serialData, &serialLength, &common_param_, sizeof(fastertransformer::CommonParam)); + params_.common_param = &common_param_; } MhaPlugin() = delete; @@ -91,8 +96,9 @@ class MhaPlugin : public TensorRTPlugin { const std::string layer_name_; std::string name_space_; int compute_type_; - mutable fastertransformer::encoderParamT params_; - cublasLtHandle_t cublaslt_handle_; + mutable fastertransformer::attentionParamRun params_; + mutable fastertransformer::CommonParam common_param_; + cublasLtHandle_t *cublaslt_handle_; int num_of_inputs_; int num_of_outputs_; }; diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/tensorrt_utils.h b/mindspore/lite/src/extendrt/delegate/tensorrt/tensorrt_utils.h index 18baf21654fc86a87544b329058b937c58f470af..d112051b9f966f594e7b0f5ecd4c94b711f972f7 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/tensorrt_utils.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/tensorrt_utils.h @@ -216,5 +216,6 @@ void Data2Vector(std::vector *dst, const void *src) { dst->at(i) = static_cast(src_ptr[i]); } } + } // namespace mindspore::lite #endif // MINDSPORE_LITE_SRC_EXTENDRT_DELEGATE_TENSORRT_TENSORRT_UTILS_H_ diff --git a/mindspore/lite/tools/converter/anf_transform.cc b/mindspore/lite/tools/converter/anf_transform.cc index a296d6054aa320c9b2ac3f36540557446ef0f588..aa3a624b3609a9f5ae589e5cdcb8a9ae0e1052a4 100644 --- a/mindspore/lite/tools/converter/anf_transform.cc +++ b/mindspore/lite/tools/converter/anf_transform.cc @@ -52,7 +52,9 @@ #include "tools/optimizer/fusion/tensor_dot_fusion.h" #include "tools/optimizer/fusion/multi_head_attention_fusion.h" #include "tools/optimizer/fusion/encoder_layer_fusion.h" +#include "tools/optimizer/fusion/decoder_layer_fusion.h" #include "tools/optimizer/fusion/glu_fusion.h" + #include "tools/optimizer/fusion/tflite_rel_pos_multi_head_attention_fusion.h" #include "tools/optimizer/fusion/matmul_add_fusion.h" #include "tools/optimizer/fusion/matmul_mul_fusion.h" @@ -323,6 +325,7 @@ int AnfTransform::RunFusionPass(const FuncGraphPtr &old_graph, const std::shared if (param->optimize_transformer) { fusions.push_back(std::make_shared()); fusions.push_back(std::make_shared()); + fusions.push_back(std::make_shared()); } for (size_t index = 0; index < fusions.size(); index++) { auto pass_ptr = fusions.at(index); diff --git a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc new file mode 100644 index 0000000000000000000000000000000000000000..cf8276147149ca0033d59c7719716fda733b7039 --- /dev/null +++ b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc @@ -0,0 +1,518 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 
(the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define USE_DEPRECATED_API +#include "tools/optimizer/fusion/decoder_layer_fusion.h" +#include +#include +#include +#include +#include "tools/optimizer/common/gllo_utils.h" +#include "nnacl/op_base.h" +#include "ops/tuple_get_item.h" +#include "tools/common/tensor_util.h" +#include "ops/op_utils.h" + +namespace mindspore::opt { +namespace { +const auto &p1 = std::placeholders::_1; +} // namespace + +bool DecoderLayerFusion::Init() const { + hidden_stats_ = std::make_shared("input"); + MS_CHECK_TRUE_RET(hidden_stats_ != nullptr, false); + encoder_output_ = std::make_shared("input"); + MS_CHECK_TRUE_RET(encoder_output_ != nullptr, false); + beta1_ = std::make_shared("beta1"); + MS_CHECK_TRUE_RET(beta1_ != nullptr, false); + gamma1_ = std::make_shared("gamma1"); + MS_CHECK_TRUE_RET(gamma1_ != nullptr, false); + beta2_ = std::make_shared("beta2"); + MS_CHECK_TRUE_RET(beta2_ != nullptr, false); + gamma2_ = std::make_shared("gamma2"); + MS_CHECK_TRUE_RET(gamma2_ != nullptr, false); + beta3_ = std::make_shared("beta3"); + MS_CHECK_TRUE_RET(beta3_ != nullptr, false); + gamma3_ = std::make_shared("gamma3"); + MS_CHECK_TRUE_RET(gamma3_ != nullptr, false); + weight_attn_qkv_ = std::make_shared("weight_attn_qkv"); + MS_CHECK_TRUE_RET(weight_attn_qkv_ != nullptr, false); + weight_attn_q_ = std::make_shared("weight_attn_q_"); + MS_CHECK_TRUE_RET(weight_attn_q_ != nullptr, false); + weight_attn_kv_ = std::make_shared("weight_attn_kv_"); + MS_CHECK_TRUE_RET(weight_attn_kv_ != nullptr, false); + weight_attn_o_ = std::make_shared(IsParamNode, "weight_attn_o"); + MS_CHECK_TRUE_RET(weight_attn_o_ != nullptr, false); + weight_attn_cross_o_ = std::make_shared(IsParamNode, "weight_attn_cross_o_"); + MS_CHECK_TRUE_RET(weight_attn_cross_o_ != nullptr, false); + weight_m_ = std::make_shared(IsParamNode, "weight_m"); + MS_CHECK_TRUE_RET(weight_m_ != nullptr, false); + weight_p_ = std::make_shared(IsParamNode, "weight_p"); + MS_CHECK_TRUE_RET(weight_p_ != nullptr, false); + bias_attn_qkv_ = std::make_shared("bias_attn_qkv"); + MS_CHECK_TRUE_RET(bias_attn_qkv_ != nullptr, false); + bias_attn_o_ = std::make_shared(IsParamNode, "bias_attn_o"); + MS_CHECK_TRUE_RET(bias_attn_o_ != nullptr, false); + bias_attn_cross_qkv_ = std::make_shared("bias_attn_cross_qkv_"); + MS_CHECK_TRUE_RET(bias_attn_cross_qkv_ != nullptr, false); + bias_attn_cross_o_ = std::make_shared(IsParamNode, "bias_attn_cross_o_"); + MS_CHECK_TRUE_RET(bias_attn_cross_o_ != nullptr, false); + bias_m_ = std::make_shared(IsParamNode, "bias_m"); + MS_CHECK_TRUE_RET(bias_m_ != nullptr, false); + bias_p_ = std::make_shared(IsParamNode, "bias_p"); + MS_CHECK_TRUE_RET(bias_p_ != nullptr, false); + mask_ = std::make_shared("mask"); + MS_CHECK_TRUE_RET(mask_ != nullptr, false); + cross_mask_ = std::make_shared("cross_mask_"); + MS_CHECK_TRUE_RET(cross_mask_ != nullptr, false); + is_attention_ = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAttention), "is_attention"); + MS_CHECK_TRUE_RET(is_attention_ != nullptr, false); + is_attention_cross_ = 
std::make_shared(std::bind(IsOpType, p1, prim::kPrimAttention), "is_attention_cross"); + MS_CHECK_TRUE_RET(is_attention_cross_ != nullptr, false); + is_layernorm1_ = std::make_shared(std::bind(IsOpType, p1, prim::kPrimLayerNormFusion), "layer_norm1"); + MS_CHECK_TRUE_RET(is_layernorm1_ != nullptr, false); + is_layernorm2_ = std::make_shared(std::bind(IsOpType, p1, prim::kPrimLayerNormFusion), "layer_norm2"); + MS_CHECK_TRUE_RET(is_layernorm2_ != nullptr, false); + is_layernorm3_ = std::make_shared(std::bind(IsOpType, p1, prim::kPrimLayerNormFusion), "layer_norm3"); + MS_CHECK_TRUE_RET(is_layernorm3_ != nullptr, false); + position_bias_ = std::make_shared("position_bias"); + MS_CHECK_TRUE_RET(position_bias_ != nullptr, false); + position_bias_cross_ = std::make_shared("position_bias_cross_"); + MS_CHECK_TRUE_RET(position_bias_ != nullptr, false); + is_act_ = std::make_shared(std::bind(IsOpType, p1, prim::kPrimActivation), "activation"); + MS_CHECK_TRUE_RET(is_act_ != nullptr, false); + eps1_ = std::make_shared("eps1_"); + MS_CHECK_TRUE_RET(eps1_ != nullptr, false); + eps2_ = std::make_shared("eps2_"); + MS_CHECK_TRUE_RET(eps2_ != nullptr, false); + eps3_ = std::make_shared("eps3_"); + MS_CHECK_TRUE_RET(eps3_ != nullptr, false); + return true; +} + +VectorRef DecoderLayerFusion::getTuple(bool post_layernorm, bool layernorm_fusion = false, + bool is_position_bias = false) const { + auto is_reshape1 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-decoder"); + MS_CHECK_TRUE_RET(is_reshape1 != nullptr, {}); + auto var1 = std::make_shared("var1-reshape"); + MS_CHECK_TRUE_RET(var1 != nullptr, {}); + auto reshape1 = VectorRef({is_reshape1, hidden_stats_, var1}); + VectorRef layer_norm, tuple; + if (!layernorm_fusion) { + return DefineLayerNorm(reshape1, gamma1_, beta1_, eps1_); + } + layer_norm = VectorRef({is_layernorm1_, reshape1, gamma1_, beta1_}); + auto is_tuple = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_itme"); + auto var_tuple = std::make_shared("var_tuple"); + tuple = VectorRef({is_tuple, layer_norm, var_tuple}); + return tuple; +} + +VectorRef DecoderLayerFusion::DefineLayerNorm(VectorRef input, VarPtr gamma, VarPtr beta, VarPtr eps) const { + auto is_sqr = std::make_shared(std::bind(IsOpType, p1, prim::kPrimSquare), "sqr2"); + MS_CHECK_TRUE_RET(is_sqr != nullptr, {}); + auto sqr = VectorRef({is_sqr, input}); + auto var1 = std::make_shared("var1"); + MS_CHECK_TRUE_RET(var1 != nullptr, {}); + auto is_reduce = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReduceFusion), "reduce"); + MS_CHECK_TRUE_RET(is_reduce != nullptr, {}); + auto reduce = VectorRef({is_reduce, sqr, var1}); + auto is_add = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is-add"); + MS_CHECK_TRUE_RET(is_add != nullptr, {}); + auto add = VectorRef({is_add, reduce, eps}); + auto is_sqrt = std::make_shared(std::bind(IsOpType, p1, prim::kPrimSqrt), "sqr2"); + MS_CHECK_TRUE_RET(is_sqrt != nullptr, {}); + auto sqrt = VectorRef({is_sqrt, add}); + auto is_div = std::make_shared(std::bind(IsOpType, p1, prim::kPrimRealDiv), "real-div"); + MS_CHECK_TRUE_RET(is_div != nullptr, {}); + auto real_div = VectorRef({is_div, input, sqrt}); + auto is_mul = std::make_shared(std::bind(IsOpType, p1, prim::kPrimMulFusion), "mul"); + MS_CHECK_TRUE_RET(is_mul != nullptr, {}); + auto mul = VectorRef({is_mul, real_div, gamma}); + return mul; +} + +VectorRef DecoderLayerFusion::DefinePatternDecoderLayer(bool post_layernorm = true, bool layernorm_fusion = false, + bool 
is_position_bias = false, bool mask = true) const { + auto is_reshape1 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-decoder"); + MS_CHECK_TRUE_RET(is_reshape1 != nullptr, {}); + auto var1 = std::make_shared("var1-reshape"); + MS_CHECK_TRUE_RET(var1 != nullptr, {}); + auto reshape1 = VectorRef({is_reshape1, hidden_stats_, var1}); + VectorRef inputs, input_cross, tuple2, tuple3, matmul2, tuple4, tuple5; + if (is_position_bias) { + inputs = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), + getTuple(post_layernorm, layernorm_fusion, is_position_bias), + getTuple(post_layernorm, layernorm_fusion, is_position_bias), weight_attn_qkv_, weight_attn_o_, + position_bias_}); + } else { + inputs = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), + getTuple(post_layernorm, layernorm_fusion, is_position_bias), + getTuple(post_layernorm, layernorm_fusion, is_position_bias), weight_attn_qkv_, weight_attn_o_, + bias_attn_qkv_, bias_attn_o_}); + } + if (mask) inputs.push_back(mask_); + auto attention = VectorRef(inputs); + // return attention; + if (is_position_bias) { + tuple4 = attention; + } else { + auto is_tuple4 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_item4"); + auto var_tuple4 = std::make_shared("var_tuple4"); + tuple4 = VectorRef({is_tuple4, attention, var_tuple4}); + } + auto is_add2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is_add2"); + auto add2 = (post_layernorm) + ? VectorRef({is_add2, getTuple(post_layernorm, layernorm_fusion, is_position_bias), tuple4}) + : VectorRef({is_add2, reshape1, tuple4}); + if (layernorm_fusion) { + auto layer_norm2 = VectorRef({is_layernorm2_, add2, gamma2_, beta2_}); + auto is_tuple2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_item2"); + auto var_tuple2 = std::make_shared("var_tuple2"); + tuple2 = VectorRef({is_tuple2, layer_norm2, var_tuple2}); + } else { + tuple2 = DefineLayerNorm(add2, gamma2_, beta2_, eps2_); + } + auto is_reshape2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-decoder2"); + MS_CHECK_TRUE_RET(is_reshape2 != nullptr, {}); + auto var2 = std::make_shared("var2"); + MS_CHECK_TRUE_RET(var2 != nullptr, {}); + auto reshape2 = VectorRef({is_reshape2, encoder_output_, var2}); + if (is_position_bias) { + input_cross = VectorRef({is_attention_cross_, tuple2, reshape2, reshape2, weight_attn_q_, weight_attn_kv_, + weight_attn_cross_o_, position_bias_cross_}); + } else { + input_cross = VectorRef({is_attention_cross_, tuple2, reshape2, reshape2, weight_attn_q_, weight_attn_kv_, + weight_attn_cross_o_, bias_attn_cross_qkv_, bias_attn_cross_o_}); + } + if (mask) input_cross.push_back(cross_mask_); + auto attention_cross = VectorRef(input_cross); + if (is_position_bias) { + tuple5 = attention_cross; + } else { + auto is_tuple5 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_item5"); + auto var_tuple5 = std::make_shared("var_tuple5"); + tuple5 = VectorRef({is_tuple5, attention_cross, var_tuple5}); + } + auto is_add3 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is_add3"); + MS_CHECK_TRUE_RET(is_add2 != nullptr, {}); + auto add3 = (post_layernorm) ? 
VectorRef({is_add3, tuple2, tuple5}) : VectorRef({is_add3, add2, tuple5}); + if (layernorm_fusion) { + auto layer_norm3 = VectorRef({is_layernorm3_, add3, gamma3_, beta3_}); + auto is_tuple3 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_item3"); + auto var_tuple3 = std::make_shared("var_tuple3"); + tuple3 = VectorRef({is_tuple3, layer_norm3, var_tuple3}); + } else { + tuple3 = DefineLayerNorm(add3, gamma3_, beta3_, eps3_); + } + auto is_matmul1 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimMatMulFusion), "is_matmul1"); + MS_CHECK_TRUE_RET(is_matmul1 != nullptr, {}); + auto is_matmul2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimMatMulFusion), "is_matmul2"); + MS_CHECK_TRUE_RET(is_matmul2 != nullptr, {}); + if (!is_position_bias) { + auto matmul1 = VectorRef({is_matmul1, tuple3, weight_m_, bias_m_}); + auto act = VectorRef({is_act_, matmul1}); + matmul2 = VectorRef({is_matmul2, act, weight_p_, bias_p_}); + } else { + auto matmul1 = VectorRef({is_matmul1, tuple3, weight_m_}); + matmul2 = VectorRef({is_matmul2, matmul1, weight_p_}); + } + auto is_reshape3 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-decoder3"); + MS_CHECK_TRUE_RET(is_reshape3 != nullptr, {}); + auto var3 = std::make_shared("var3"); + MS_CHECK_TRUE_RET(var3 != nullptr, {}); + auto reshape3 = VectorRef({is_reshape3, matmul2, var3}); + auto is_reshape4 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-decoder4"); + MS_CHECK_TRUE_RET(is_reshape4 != nullptr, {}); + auto var4 = std::make_shared("var4"); + MS_CHECK_TRUE_RET(var4 != nullptr, {}); + auto reshape4 = (post_layernorm) ? VectorRef({is_reshape4, tuple3, var4}) : VectorRef({is_reshape4, add3, var4}); + auto is_add4 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is_add4"); + auto add4 = VectorRef({is_add4, reshape4, reshape3}); + return add4; +} + +std::unordered_map DecoderLayerFusion::DefinePatterns() const { + std::unordered_map patterns; + if (!Init()) { + MS_LOG(ERROR) << "initial member failed."; + return patterns; + } + patterns[kPatternDecoderLayerPre] = DefinePatternDecoderLayer(false, true, false, true); + patterns[kPatternDecoderLayerPost] = DefinePatternDecoderLayer(true, true, false, true); + patterns[kPatternDecoderLayerNormPre] = DefinePatternDecoderLayer(false, false, false, true); + patterns[kPatternDecoderLayerNormPost] = DefinePatternDecoderLayer(true, false, false, true); + patterns[kPatternDecoderT5Pre] = DefinePatternDecoderLayer(false, false, true, true); + patterns[kPatternDecoderT5Post] = DefinePatternDecoderLayer(true, false, true, true); + return patterns; +} + +AnfNodePtr DecoderLayerFusion::Process(const std::string &pattern_name, const mindspore::FuncGraphPtr &func_graph, + const mindspore::AnfNodePtr &node, const mindspore::EquivPtr &equiv) const { + if (func_graph == nullptr || node == nullptr || equiv == nullptr) { + return nullptr; + } + if (pattern_name == kPatternDecoderT5Pre || pattern_name == kPatternDecoderT5Post) { + is_position_bias_ = true; + } + if (pattern_name == kPatternDecoderLayerPre || pattern_name == kPatternDecoderLayerPost) { + is_layernorm_fusion_ = true; + } + bool mask = true; + bool post_layernorm = false; + if (pattern_name == kPatternDecoderLayerPost || pattern_name == kPatternDecoderT5Post || + pattern_name == kPatternDecoderLayerNormPost) { + post_layernorm = true; + } + return CreateMaskedDecoderLayerFusionNode(func_graph, equiv, node, post_layernorm, mask); +} // namespace mindspore::opt + 
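Editorial aside (not part of this patch): the Process() hunk above dispatches on the matched pattern name to decide which decoder-layer variant was found before calling CreateMaskedDecoderLayerFusionNode. Below is a minimal standalone C++ sketch of that dispatch; the struct, helper name, and flag semantics are inferred from the hunk and from the pattern-name strings declared in decoder_layer_fusion.h, and are illustrative only.

#include <iostream>
#include <string>

// Illustrative model of the flags DecoderLayerFusion::Process derives from a pattern name.
struct DecoderFusionFlags {
  bool post_layernorm = false;    // layernorm applied after the residual add
  bool layernorm_fusion = false;  // pattern matched the fused LayerNorm primitive
  bool position_bias = false;     // T5-style relative position bias (no qkv/projection biases)
};

// Hypothetical helper mirroring the if-chains in Process(); not the actual pass code.
DecoderFusionFlags FlagsForPattern(const std::string &pattern_name) {
  DecoderFusionFlags f;
  f.position_bias = (pattern_name == "PatternDecoderT5Pre" || pattern_name == "PatternDecoderT5Post");
  f.layernorm_fusion = (pattern_name == "PatternDecoderLayerPre" || pattern_name == "PatternDecoderLayerPost");
  f.post_layernorm = (pattern_name == "PatternDecoderLayerPost" || pattern_name == "PatternDecoderT5Post" ||
                      pattern_name == "kPatternDecoderLayerNormPost");
  return f;
}

int main() {
  for (const char *name : {"PatternDecoderLayerPre", "PatternDecoderT5Post", "kPatternDecoderLayerNormPost"}) {
    DecoderFusionFlags f = FlagsForPattern(name);
    std::cout << name << ": post_layernorm=" << f.post_layernorm
              << ", layernorm_fusion=" << f.layernorm_fusion
              << ", position_bias=" << f.position_bias << "\n";
  }
  return 0;
}

The T5 patterns carry a relative position bias and therefore no attention biases or beta vectors, which is why the node-building code that follows branches on is_position_bias_ when assembling the fused node inputs.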
+bool DecoderLayerFusion::IsActGELU(const FuncGraphPtr &func_graph, const EquivPtr &equiv) const { + auto act_input = GetAttribute(func_graph, equiv, is_act_); + MS_ASSERT(act_input != nullptr); + auto act_primitive = ops::GetOperator(act_input); + MS_CHECK_TRUE_RET(act_primitive != nullptr, false); + auto act_primitive_c = act_primitive->GetPrim(); + if (act_primitive_c->GetAttr(ops::kActivationType) == nullptr || + act_primitive->get_activation_type() != mindspore::GELU) { + return false; + } + return true; +} + +AnfNodePtr DecoderLayerFusion::GetAttribute(const FuncGraphPtr &func_graph, const EquivPtr &equiv, + VarPtr node_name) const { + if ((*equiv)[node_name] == nullptr || !utils::isa((*equiv)[node_name])) { + MS_LOG(ERROR) << node_name << "is not AnfNodePtr"; + return nullptr; + } + AnfNodePtr node = utils::cast((*equiv)[node_name]); + MS_ASSERT(node != nullptr); + if (node == nullptr || !utils::isa(node)) { + auto manager = func_graph->manager(); + if (manager == nullptr) { + return nullptr; + } + auto users = manager->node_users(); + auto it = users.find(node); + if (it != users.end()) { + node = it->second.front().first; + } + if (node == nullptr || !utils::isa(node)) { + return nullptr; + } + } + auto cnode = utils::cast(node); + MS_ASSERT(cnode != nullptr); + auto input = cnode->input(0); + return input; +} + +STATUS DecoderLayerFusion::GetEps(const EquivPtr &equiv, VarPtr node_name, float *eps) const { + if ((*equiv)[node_name] == nullptr || !utils::isa((*equiv)[node_name])) { + MS_LOG(ERROR) << node_name << " is not anfnodeptr"; + return RET_ERROR; + } + AnfNodePtr node = utils::cast((*equiv)[node_name]); + MS_ASSERT(node != nullptr); + if (utils::isa(node)) { + auto value_ptr_node = utils::cast(node); + auto value_node = utils::cast(value_ptr_node->value()); + if (value_node->isa()) { + auto tensor = value_node->cast(); + MS_EXCEPTION_IF_NULL(tensor); + *eps = *reinterpret_cast(tensor->data().data()); + return RET_OK; + } + } + return RET_ERROR; +} + +STATUS DecoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const EquivPtr &equiv, int *head_num, + int *head_size, float *eps1, float *eps2, float *eps3, bool *is_position_bias1, + bool *is_position_bias2, float *scale1, float *scale2) const { + auto attn_input = GetAttribute(func_graph, equiv, is_attention_); + MS_ASSERT(attn_input != nullptr); + auto attn_prim = ops::GetOperator(attn_input); + if (attn_prim->GetAttr(ops::kDecoderLayerNumHeads) != nullptr) { + *head_num = attn_prim->get_head_num(); + } + if (attn_prim->GetAttr(ops::kAttentionSizePerHead) != nullptr) { + *head_size = attn_prim->get_head_size(); + } + if (attn_prim->GetAttr(ops::kPositionBias) != nullptr) { + *is_position_bias1 = attn_prim->get_position_bias(); + } + if (attn_prim->GetAttr(ops::kScale) != nullptr) { + *scale1 = attn_prim->get_scale(); + } + if ((*equiv)[is_attention_] == nullptr || !utils::isa((*equiv)[is_attention_])) { + MS_LOG(ERROR) << "is_attention_ is not AnfNodePtr"; + return RET_ERROR; + } + auto attn_cross_input = GetAttribute(func_graph, equiv, is_attention_cross_); + MS_ASSERT(attn_cross_input != nullptr); + auto attn_cross_prim = ops::GetOperator(attn_cross_input); + if (attn_cross_prim->GetAttr(ops::kPositionBias) != nullptr) { + *is_position_bias2 = attn_cross_prim->get_position_bias(); + } + if (attn_cross_prim->GetAttr(ops::kScale) != nullptr) { + *scale2 = attn_cross_prim->get_scale(); + } + if (is_layernorm_fusion_) { + auto layrn1_input = GetAttribute(func_graph, equiv, is_layernorm1_); + auto layrn1_prim = 
ops::GetOperator(layrn1_input); + if (layrn1_prim->GetAttr(ops::kEpsilon) != nullptr) { + *eps1 = layrn1_prim->get_epsilon(); + } + auto layrn2_input = GetAttribute(func_graph, equiv, is_layernorm2_); + auto layrn2_prim = ops::GetOperator(layrn2_input); + if (layrn2_prim->GetAttr(ops::kEpsilon) != nullptr) { + *eps2 = layrn2_prim->get_epsilon(); + } + auto layrn3_input = GetAttribute(func_graph, equiv, is_layernorm3_); + auto layrn3_prim = ops::GetOperator(layrn3_input); + if (layrn3_prim->GetAttr(ops::kEpsilon) != nullptr) { + *eps3 = layrn3_prim->get_epsilon(); + } + } else { + if (GetEps(equiv, eps1_, eps1) != RET_OK) { + MS_LOG(ERROR) << "not found eps1"; + return RET_ERROR; + } + if (GetEps(equiv, eps2_, eps2) != RET_OK) { + MS_LOG(ERROR) << "not found eps2"; + return RET_ERROR; + } + if (GetEps(equiv, eps3_, eps3) != RET_OK) { + MS_LOG(ERROR) << "not found eps3"; + return RET_ERROR; + } + } + if (!is_position_bias_) { + if (!IsActGELU(func_graph, equiv)) { + return RET_ERROR; + } + act_type_ = ActType::ActType_Gelu; + } else { + act_type_ = ActType::ActType_Relu; + } + return RET_OK; +} + +std::shared_ptr DecoderLayerFusion::CreatePrim(const FuncGraphPtr &func_graph, const EquivPtr &equiv, + bool post_layernorm, int64_t ffn_hidden_size) const { + auto decoder_layer_prim = std::make_shared(); + if (decoder_layer_prim == nullptr) { + MS_LOG(ERROR) << "Build decoder layer primitive failed."; + return nullptr; + } + int head_num = 0; + int head_size = 0; + float eps1 = 1e-6; + float eps2 = 1e-6; + float eps3 = 1e-6; + bool is_position_bias1 = false; + bool is_position_bias2 = false; + float scale1 = 1.0f; + float scale2 = 1.0f; + if (CheckPattern(func_graph, equiv, &head_num, &head_size, &eps1, &eps2, &eps3, &is_position_bias1, + &is_position_bias2, &scale1, &scale2)) { + return nullptr; + } + decoder_layer_prim->Init(head_num, head_size, eps1, eps2, eps3, ffn_hidden_size, is_position_bias1, is_position_bias2, + post_layernorm, scale1, scale2, act_type_); + return decoder_layer_prim; +} + +CNodePtr DecoderLayerFusion::CreateMaskedDecoderLayerFusionNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv, + const AnfNodePtr &node, bool post_layernorm = true, + bool mask = true) const { + MS_ASSERT(func_graph != nullptr); + MS_ASSERT(equiv != nullptr); + MS_ASSERT(node != nullptr); + auto input = utils::cast((*equiv)[hidden_stats_]); + MS_ASSERT(input != nullptr); + auto encoder_output = utils::cast((*equiv)[encoder_output_]); + MS_ASSERT(encoder_output != nullptr); + AnfNodePtr position_bias, input_mask, bias_attn_o, bias_attn_qkv, beta1, beta2, bias_m, bias_p, beta3, + bias_attn_cross_qkv, bias_attn_cross_o, position_bias_cross; + auto weight_qkv = utils::cast((*equiv)[weight_attn_qkv_]); + auto weight_attn_o = utils::cast((*equiv)[weight_attn_o_]); + auto weight_attn_q = utils::cast((*equiv)[weight_attn_q_]); + auto weight_attn_kv = utils::cast((*equiv)[weight_attn_kv_]); + auto weight_attn_cross_o = utils::cast((*equiv)[weight_attn_cross_o_]); + auto weight_m = utils::cast((*equiv)[weight_m_]); + auto weight_p = utils::cast((*equiv)[weight_p_]); + if (is_position_bias_) { + position_bias = utils::cast((*equiv)[position_bias_]); + position_bias_cross = utils::cast((*equiv)[position_bias_cross_]); + } else { + bias_attn_o = utils::cast((*equiv)[bias_attn_o_]); + bias_attn_qkv = utils::cast((*equiv)[bias_attn_qkv_]); + bias_attn_cross_qkv = utils::cast((*equiv)[bias_attn_cross_qkv_]); + bias_attn_cross_o = utils::cast((*equiv)[bias_attn_cross_o_]); + bias_m = 
utils::cast((*equiv)[bias_m_]); + bias_p = utils::cast((*equiv)[bias_p_]); + beta1 = utils::cast((*equiv)[beta1_]); + beta2 = utils::cast((*equiv)[beta2_]); + beta3 = utils::cast((*equiv)[beta3_]); + } + auto gamma1 = utils::cast((*equiv)[gamma1_]); + auto gamma2 = utils::cast((*equiv)[gamma2_]); + auto gamma3 = utils::cast((*equiv)[gamma3_]); + input_mask = mask ? utils::cast((*equiv)[mask_]) : nullptr; + auto cross_mask = utils::cast((*equiv)[cross_mask_]); + auto base_shape_ptr = weight_m->Shape(); + MS_EXCEPTION_IF_NULL(base_shape_ptr); + auto input_shape_ptr = base_shape_ptr->cast(); + MS_EXCEPTION_IF_NULL(input_shape_ptr); + auto input_shape = input_shape_ptr->shape(); + MS_ASSERT(input_shape != nullptr); + int ffn_hidden_size = (int64_t)input_shape[1]; + auto decoder_layer_prim = CreatePrim(func_graph, equiv, post_layernorm, ffn_hidden_size); + MS_CHECK_TRUE_RET(decoder_layer_prim != nullptr, nullptr); + auto decoder_layer_prim_c = decoder_layer_prim->GetPrim(); + MS_CHECK_TRUE_RET(decoder_layer_prim_c != nullptr, nullptr); + auto value_node = NewValueNode(decoder_layer_prim_c); + MS_CHECK_TRUE_RET(value_node != nullptr, nullptr); + std::vector new_node_inputs = {value_node, input, gamma1}; + if (is_position_bias_) { + new_node_inputs.insert(new_node_inputs.end(), {weight_qkv}); + if (mask) new_node_inputs.push_back(input_mask); + new_node_inputs.insert(new_node_inputs.end(), + {position_bias, weight_attn_o, gamma2, encoder_output, weight_attn_q, weight_attn_kv}); + if (mask) new_node_inputs.push_back(cross_mask); + new_node_inputs.insert(new_node_inputs.end(), + {position_bias_cross, weight_attn_cross_o, gamma3, weight_m, weight_p}); + } else { + new_node_inputs.insert(new_node_inputs.end(), {beta1, weight_qkv, bias_attn_qkv}); + if (mask) new_node_inputs.push_back(input_mask); + new_node_inputs.insert(new_node_inputs.end(), {weight_attn_o, bias_attn_o, gamma2, beta2, encoder_output, + weight_attn_q, weight_attn_kv, bias_attn_cross_qkv}); + if (mask) new_node_inputs.push_back(cross_mask); + new_node_inputs.insert(new_node_inputs.end(), + {weight_attn_cross_o, bias_attn_cross_o, gamma3, beta3, weight_m, bias_m, weight_p, bias_p}); + } + auto new_node = func_graph->NewCNode(new_node_inputs); + MS_CHECK_TRUE_RET(new_node != nullptr, nullptr); + auto old_node = node->cast(); + MS_CHECK_TRUE_RET(old_node->abstract() != nullptr, nullptr); + new_node->set_abstract(old_node->abstract()->Clone()); + new_node->set_fullname_with_scope(node->fullname_with_scope() + "/decoder_layer"); + return new_node; +} +} // namespace mindspore::opt diff --git a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h new file mode 100644 index 0000000000000000000000000000000000000000..f5faec283e2fca3114817db87edb7e554b6e14e6 --- /dev/null +++ b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h @@ -0,0 +1,113 @@ +// /** +// * Copyright 2021 Huawei Technologies Co., Ltd +// * +// * Licensed under the Apache License, Version 2.0 (the "License"); +// * you may not use this file except in compliance with the License. +// * You may obtain a copy of the License at +// * +// * http://www.apache.org/licenses/LICENSE-2.0 +// * +// * Unless required by applicable law or agreed to in writing, software +// * distributed under the License is distributed on an "AS IS" BASIS, +// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// * See the License for the specific language governing permissions and +// * limitations under the License. +// */ +#ifndef MINDSPORE_LITE_TOOLS_OPTIMIZER_FUSION_DECODER_LAYER_FUSION_H_ +#define MINDSPORE_LITE_TOOLS_OPTIMIZER_FUSION_DECODER_LAYER_FUSION_H_ + +#include +#include +#include +#include +#include "tools/optimizer/common/multiple_pattern_process_pass.h" +#include "include/common/utils/utils.h" +#include "include/errorcode.h" +#include "ops/decoder_layer.h" +#include "ops/fusion/layer_norm_fusion.h" +#include "ops/fusion/activation.h" +#include "tools/optimizer/fusion/multi_head_attention_fusion.h" + +namespace mindspore { +namespace opt { +class DecoderLayerFusion : public MultiplePatternProcessPass { + public: + explicit DecoderLayerFusion(const std::string &name = "DecoderLayerFusion", bool multigraph = true) + : MultiplePatternProcessPass(name, multigraph) {} + + ~DecoderLayerFusion() override = default; + + AnfNodePtr Process(const std::string &pattern_name, const FuncGraphPtr &, const AnfNodePtr &, + const EquivPtr &) const override; + std::unordered_map DefinePatterns() const override; + + protected: + virtual bool Init() const; + + private: + VectorRef DefinePatternDecoderLayer(bool post_layernorm, bool layernorm_fusion, bool is_position_bias, + bool mask) const; + VectorRef getTuple(bool post_layernorm, bool layernorm_fusion, bool is_position_bias) const; + VectorRef DefineLayerNorm(VectorRef input, VarPtr gamma, VarPtr beta, VarPtr eps) const; + CNodePtr CreateMaskedDecoderLayerFusionNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv, + const AnfNodePtr &node, bool post_layernorm, bool mask) const; + std::shared_ptr CreatePrim(const FuncGraphPtr &func_graph, const EquivPtr &equiv, + bool post_layernorm, int64_t ffn_hidden_size) const; + lite::STATUS CheckPattern(const FuncGraphPtr &func_graph, const EquivPtr &equiv, int *head_num, int *head_size, + float *eps1, float *eps2, float *eps3, bool *is_position_bias1, bool *is_position_bias2, + float *scale1, float *scale2) const; + AnfNodePtr GetAttribute(const FuncGraphPtr &func_graph, const EquivPtr &equiv, VarPtr node_name) const; + bool IsActGELU(const FuncGraphPtr &func_graph, const EquivPtr &equiv) const; + lite::STATUS GetEps(const EquivPtr &equiv, VarPtr node_name, float *eps) const; + + protected: + const std::string kPatternDecoderLayerPre = "PatternDecoderLayerPre"; + const std::string kPatternDecoderLayerPost = "PatternDecoderLayerPost"; + const std::string kPatternDecoderLayerNormPre = "kPatternDecoderLayerNormPre"; + const std::string kPatternDecoderLayerNormPost = "kPatternDecoderLayerNormPost"; + const std::string kPatternDecoderT5Pre = "PatternDecoderT5Pre"; + const std::string kPatternDecoderT5Post = "PatternDecoderT5Post"; + mutable VarPtr hidden_stats_{nullptr}; + mutable VarPtr encoder_output_{nullptr}; + mutable VarPtr position_bias_{nullptr}; + mutable VarPtr beta1_{nullptr}; + mutable VarPtr gamma1_{nullptr}; + mutable VarPtr beta2_{nullptr}; + mutable VarPtr gamma2_{nullptr}; + mutable VarPtr gamma3_{nullptr}; + mutable VarPtr beta3_{nullptr}; + mutable VarPtr weight_attn_qkv_{nullptr}; + mutable VarPtr weight_attn_qkv_cross_{nullptr}; + mutable VarPtr weight_attn_o_{nullptr}; + mutable VarPtr weight_m_{nullptr}; + mutable VarPtr weight_p_{nullptr}; + mutable VarPtr bias_attn_qkv_{nullptr}; + mutable VarPtr bias_attn_o_{nullptr}; + mutable VarPtr bias_attn_cross_qkv_{nullptr}; + mutable VarPtr bias_attn_cross_o_{nullptr}; + mutable VarPtr bias_m_{nullptr}; + mutable VarPtr bias_p_{nullptr}; 
+ mutable VarPtr mask_{nullptr}; + mutable VarPtr is_attention_{nullptr}; + mutable VarPtr is_attention_cross_{nullptr}; + mutable VarPtr weight_attn_q_{nullptr}; + mutable VarPtr weight_attn_kv_{nullptr}; + mutable VarPtr weight_attn_cross_o_{nullptr}; + mutable VarPtr position_bias_cross_{nullptr}; + mutable VarPtr cross_mask_{nullptr}; + mutable VarPtr reshape_k_{nullptr}; + mutable VarPtr reshape_v_{nullptr}; + mutable VarPtr is_layernorm1_{nullptr}; + mutable VarPtr is_layernorm2_{nullptr}; + mutable VarPtr is_layernorm3_{nullptr}; + mutable VarPtr is_act_{nullptr}; + mutable VarPtr eps1_{nullptr}; + mutable VarPtr eps2_{nullptr}; + mutable VarPtr eps3_{nullptr}; + mutable bool is_position_bias_{false}; + mutable bool is_layernorm_fusion_{false}; + mutable ActType act_type_{ActType::ActType_No}; +}; +} // namespace opt +} // namespace mindspore +#endif // MINDSPORE_LITE_TOOLS_OPTIMIZER_FUSION_DECODER_LAYER_FUSION_H_ diff --git a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc index 233f878781bfe4992f9392ce043bcdb8c32f511c..ada4b7e856ea534fec4812e3eba94f2677231365 100644 --- a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc @@ -67,9 +67,13 @@ bool EncoderLayerFusion::Init() const { is_layernorm2_ = std::make_shared(std::bind(IsOpType, p1, prim::kPrimLayerNormFusion), "layer_norm2"); MS_CHECK_TRUE_RET(is_layernorm2_ != nullptr, false); position_bias_ = std::make_shared("position_bias"); - MS_CHECK_TRUE_RET(is_layernorm2_ != nullptr, false); + MS_CHECK_TRUE_RET(position_bias_ != nullptr, false); is_act_ = std::make_shared(std::bind(IsOpType, p1, prim::kPrimActivation), "activation"); MS_CHECK_TRUE_RET(is_act_ != nullptr, {}); + eps1_ = std::make_shared("position_bias"); + MS_CHECK_TRUE_RET(eps1_ != nullptr, false); + eps2_ = std::make_shared("position_bias"); + MS_CHECK_TRUE_RET(eps2_ != nullptr, false); return true; } @@ -80,11 +84,11 @@ VectorRef EncoderLayerFusion::getTuple(bool post_layernorm, bool layernorm_fusio auto var1 = std::make_shared("var1-reshape"); MS_CHECK_TRUE_RET(var1 != nullptr, {}); auto reshape1 = VectorRef({is_reshape1, input_, var1}); - if (post_layernorm) { + if (post_layernorm && !is_position_bias) { return reshape1; } - if (layernorm_fusion) { - return DefineLayerNorm(is_position_bias, reshape1, gamma1_, beta1_); + if (!layernorm_fusion) { + return DefineLayerNorm(is_position_bias, reshape1, gamma1_, beta1_, eps1_); } auto layer_norm = VectorRef({is_layernorm1_, reshape1, gamma1_, beta1_}); auto is_tuple = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_itme"); @@ -93,7 +97,8 @@ VectorRef EncoderLayerFusion::getTuple(bool post_layernorm, bool layernorm_fusio return tuple; } -VectorRef EncoderLayerFusion::DefineLayerNorm(bool is_position_bias, VectorRef input, VarPtr gamma, VarPtr beta) const { +VectorRef EncoderLayerFusion::DefineLayerNorm(bool is_position_bias, VectorRef input, VarPtr gamma, VarPtr beta, + VarPtr eps) const { auto var1 = std::make_shared("var1"); MS_CHECK_TRUE_RET(var1 != nullptr, {}); auto is_reduce = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReduceFusion), "reduce"); @@ -110,11 +115,9 @@ VectorRef EncoderLayerFusion::DefineLayerNorm(bool is_position_bias, VectorRef i auto is_reduce2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReduceFusion), "reduce2"); MS_CHECK_TRUE_RET(is_reduce2 != nullptr, {}); auto reduce2 = VectorRef({is_reduce2, 
sqr, var2}); - auto var3 = std::make_shared("var3"); - MS_CHECK_TRUE_RET(var3 != nullptr, {}); auto is_add = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is-add"); MS_CHECK_TRUE_RET(is_add != nullptr, {}); - auto add = VectorRef({is_add, reduce2, var3}); + auto add = VectorRef({is_add, reduce2, eps}); auto is_sqr2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimSqrt), "sqr2"); MS_CHECK_TRUE_RET(is_sqr2 != nullptr, {}); auto sqr2 = VectorRef({is_sqr2, add}); @@ -136,24 +139,27 @@ VectorRef EncoderLayerFusion::DefineLayerNorm(bool is_position_bias, VectorRef i } VectorRef EncoderLayerFusion::DefinePatternEncoderLayer(bool post_layernorm = true, bool layernorm_fusion = false, - bool is_position_bias = false) const { - VectorRef attention, tuple, tuple2, tuple3, reshape2, matmul1; + bool is_position_bias = false, bool mask = true) const { + VectorRef tuple, tuple2, tuple3, reshape2, matmul1, inputs; auto is_reshape1 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-encoder"); MS_CHECK_TRUE_RET(is_reshape1 != nullptr, {}); auto var1 = std::make_shared("var1"); MS_CHECK_TRUE_RET(var1 != nullptr, {}); auto reshape1 = VectorRef({is_reshape1, input_, var1}); if (!is_position_bias) { - attention = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), - getTuple(post_layernorm, layernorm_fusion, is_position_bias), - getTuple(post_layernorm, layernorm_fusion, is_position_bias), weight_attn_qkv_, - weight_attn_o_, bias_attn_qkv_, bias_attn_o_, mask_}); + inputs = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), + getTuple(post_layernorm, layernorm_fusion, is_position_bias), + getTuple(post_layernorm, layernorm_fusion, is_position_bias), weight_attn_qkv_, weight_attn_o_, + bias_attn_qkv_, bias_attn_o_}); } else { - attention = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), - getTuple(post_layernorm, layernorm_fusion, is_position_bias), - getTuple(post_layernorm, layernorm_fusion, is_position_bias), weight_attn_qkv_, - weight_attn_o_, position_bias_, mask_}); + inputs = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), + getTuple(post_layernorm, layernorm_fusion, is_position_bias), + getTuple(post_layernorm, layernorm_fusion, is_position_bias), weight_attn_qkv_, weight_attn_o_, + position_bias_}); } + // return attention; + if (mask) inputs.push_back(mask_); + auto attention = VectorRef(inputs); if (!is_position_bias) { auto is_tuple = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_itme"); auto var_tuple = std::make_shared("var_tuple"); @@ -162,14 +168,16 @@ VectorRef EncoderLayerFusion::DefinePatternEncoderLayer(bool post_layernorm = tr tuple = attention; } auto is_add = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is_add"); - auto add = VectorRef({is_add, reshape1, tuple}); + auto add = (is_position_bias && post_layernorm) + ? 
VectorRef({is_add, getTuple(post_layernorm, layernorm_fusion, is_position_bias), tuple}) + : VectorRef({is_add, reshape1, tuple}); if (layernorm_fusion) { - tuple2 = DefineLayerNorm(is_position_bias, add, gamma2_, beta2_); - } else { auto layer_norm2 = VectorRef({is_layernorm2_, add, gamma2_, beta2_}); auto is_tuple2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_item2"); auto var_tuple2 = std::make_shared("var_tuple2"); tuple2 = VectorRef({is_tuple2, layer_norm2, var_tuple2}); + } else { + tuple2 = DefineLayerNorm(is_position_bias, add, gamma2_, beta2_, eps2_); } auto is_reshape2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-encoder2"); MS_CHECK_TRUE_RET(is_reshape2 != nullptr, {}); @@ -178,9 +186,13 @@ VectorRef EncoderLayerFusion::DefinePatternEncoderLayer(bool post_layernorm = tr auto is_matmul1 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimMatMulFusion), "is_matmul1"); MS_CHECK_TRUE_RET(is_matmul1 != nullptr, {}); if (is_position_bias) { - reshape2 = VectorRef({is_reshape2, add, var2}); + if (post_layernorm) { + reshape2 = VectorRef({is_reshape2, tuple2, var2}); + } else { + reshape2 = VectorRef({is_reshape2, add, var2}); + } matmul1 = VectorRef({is_matmul1, tuple2, weight_m_}); - } else if (post_layernorm || layernorm_fusion) { + } else if (post_layernorm || !layernorm_fusion) { reshape2 = VectorRef({is_reshape2, tuple2, var2}); matmul1 = VectorRef({is_matmul1, tuple2, weight_m_, bias_m_}); } else { @@ -199,7 +211,7 @@ VectorRef EncoderLayerFusion::DefinePatternEncoderLayer(bool post_layernorm = tr auto reshape3 = VectorRef({is_reshape3, matmul2, var3}); auto is_add3 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is_add3"); auto add3 = VectorRef({is_add3, reshape2, reshape3}); - if (!post_layernorm || layernorm_fusion) { + if (!post_layernorm || !layernorm_fusion) { return add3; } auto is_reshape4 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-encoder"); @@ -208,12 +220,12 @@ VectorRef EncoderLayerFusion::DefinePatternEncoderLayer(bool post_layernorm = tr MS_CHECK_TRUE_RET(var4 != nullptr, {}); auto reshape4 = VectorRef({is_reshape4, add3, var4}); if (layernorm_fusion) { - tuple3 = DefineLayerNorm(is_position_bias, reshape4, gamma1_, beta1_); - } else { auto layer_norm = VectorRef({is_layernorm1_, reshape4, gamma1_, beta1_}); auto is_tuple3 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_item3"); auto var_tuple3 = std::make_shared("var_tuple3"); tuple3 = VectorRef({is_tuple3, layer_norm, var_tuple3}); + } else { + tuple3 = DefineLayerNorm(is_position_bias, reshape4, gamma1_, beta1_, eps1_); } auto is_reshape5 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-encoder"); MS_CHECK_TRUE_RET(is_reshape5 != nullptr, {}); @@ -233,7 +245,8 @@ std::unordered_map EncoderLayerFusion::DefinePatterns() patterns[kPatternEncoderLayerPost] = DefinePatternEncoderLayer(true); patterns[kPatternEncoderLayerPostNorm] = DefinePatternEncoderLayer(true, true); patterns[kPatternEncoderLayerPreNorm] = DefinePatternEncoderLayer(false, true); - patterns[kPatternEncoderLayerT5] = DefinePatternEncoderLayer(false, true, true); + patterns[kPatternEncoderLayerT5Pre] = DefinePatternEncoderLayer(false, false, true, true); + patterns[kPatternEncoderLayerT5Post] = DefinePatternEncoderLayer(true, false, true, true); return patterns; } @@ -242,15 +255,16 @@ AnfNodePtr EncoderLayerFusion::Process(const std::string &pattern_name, const mi if (func_graph == 
nullptr || node == nullptr || equiv == nullptr) { return nullptr; } - if (pattern_name == kPatternEncoderLayerPost || pattern_name == kPatternEncoderLayerPostNorm) { - return CreateMaskedEncoderLayerFusionNode(func_graph, equiv, node, true); - } else if (pattern_name == kPatternEncoderLayerPre || pattern_name == kPatternEncoderLayerPreNorm) { - return CreateMaskedEncoderLayerFusionNode(func_graph, equiv, node, false); - } else if (pattern_name == kPatternEncoderLayerT5) { - is_position_bias_ = true; - return CreateMaskedEncoderLayerFusionNode(func_graph, equiv, node, false); - } - return nullptr; + if (pattern_name == kPatternEncoderLayerPostNorm || pattern_name == kPatternEncoderLayerPreNorm) + is_layernorm_fusion_ = true; + if (pattern_name == kPatternEncoderLayerT5Pre || pattern_name == kPatternEncoderLayerT5Post) is_position_bias_ = true; + bool mask = true; + bool post_layernorm = false; + if (pattern_name == kPatternEncoderLayerPost || pattern_name == kPatternEncoderLayerPostNorm || + pattern_name == kPatternEncoderLayerT5Post) + post_layernorm = true; + + return CreateMaskedEncoderLayerFusionNode(func_graph, equiv, node, post_layernorm, mask); } bool EncoderLayerFusion::IsActGELU(const FuncGraphPtr &func_graph, const EquivPtr &equiv, @@ -267,6 +281,26 @@ bool EncoderLayerFusion::IsActGELU(const FuncGraphPtr &func_graph, const EquivPt return true; } +STATUS EncoderLayerFusion::GetEps(const EquivPtr &equiv, VarPtr node_name, float *eps) const { + if ((*equiv)[node_name] == nullptr || !utils::isa((*equiv)[node_name])) { + MS_LOG(ERROR) << node_name << " is not anfnodeptr"; + return RET_ERROR; + } + AnfNodePtr node = utils::cast((*equiv)[node_name]); + MS_ASSERT(node != nullptr); + if (utils::isa(node)) { + auto value_ptr_node = utils::cast(node); + auto value_node = utils::cast(value_ptr_node->value()); + if (value_node->isa()) { + auto tensor = value_node->cast(); + MS_EXCEPTION_IF_NULL(tensor); + *eps = *reinterpret_cast(tensor->data().data()); + return RET_OK; + } + } + return RET_ERROR; +} + AnfNodePtr EncoderLayerFusion::GetAttribute(const FuncGraphPtr &func_graph, const EquivPtr &equiv, VarPtr node_name) const { if ((*equiv)[node_name] == nullptr || !utils::isa((*equiv)[node_name])) { @@ -294,8 +328,9 @@ AnfNodePtr EncoderLayerFusion::GetAttribute(const FuncGraphPtr &func_graph, cons auto input = cnode->input(0); return input; } + STATUS EncoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const EquivPtr &equiv, int *head_num, - int *head_size, float *eps1, float *eps2) const { + int *head_size, float *eps1, float *eps2, float *scale) const { auto attn_input = GetAttribute(func_graph, equiv, is_attention_); MS_ASSERT(attn_input != nullptr); auto attn_prim = ops::GetOperator(attn_input); @@ -308,18 +343,38 @@ STATUS EncoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const Eq if (attn_prim->GetAttr(ops::kPositionBias) != nullptr) { is_position_bias_ = attn_prim->get_position_bias(); } - auto layrn1_input = GetAttribute(func_graph, equiv, is_layernorm1_); - auto layrn1_prim = ops::GetOperator(layrn1_input); - if (layrn1_prim->GetAttr(ops::kEpsilon) != nullptr) { - *eps1 = layrn1_prim->get_epsilon(); + if (attn_prim->GetAttr(ops::kScale) != nullptr) { + *scale = attn_prim->get_scale(); } - auto layrn2_input = GetAttribute(func_graph, equiv, is_layernorm2_); - auto layrn2_prim = ops::GetOperator(layrn2_input); - if (layrn2_prim->GetAttr(ops::kEpsilon) != nullptr) { - *eps2 = layrn2_prim->get_epsilon(); + if (is_layernorm_fusion_) { + auto layrn1_input = 
GetAttribute(func_graph, equiv, is_layernorm1_); + auto layrn1_prim = ops::GetOperator(layrn1_input); + if (layrn1_prim->GetAttr(ops::kEpsilon) != nullptr) { + *eps1 = layrn1_prim->get_epsilon(); + } + auto layrn2_input = GetAttribute(func_graph, equiv, is_layernorm2_); + auto layrn2_prim = ops::GetOperator(layrn2_input); + if (layrn2_prim->GetAttr(ops::kEpsilon) != nullptr) { + *eps2 = layrn2_prim->get_epsilon(); + } + } else { + if (GetEps(equiv, eps1_, eps1) != RET_OK) { + MS_LOG(ERROR) << "not found eps1"; + return RET_ERROR; + } + + if (GetEps(equiv, eps2_, eps2) != RET_OK) { + MS_LOG(ERROR) << "not found eps2"; + return RET_ERROR; + } } - if (!IsActGELU(func_graph, equiv, is_act_)) { - return false; + if (!is_position_bias_) { + if (!IsActGELU(func_graph, equiv, is_act_)) { + return RET_ERROR; + } + act_type_ = ActType::ActType_Gelu; + } else { + act_type_ = ActType::ActType_Relu; } return RET_OK; } @@ -333,18 +388,20 @@ std::shared_ptr EncoderLayerFusion::CreatePrim(const FuncGrap } int head_num = 0; int head_size = 0; - float eps1 = 1e-6; - float eps2 = 1e-6; - if (CheckPattern(func_graph, equiv, &head_num, &head_size, &eps1, &eps2)) { + float eps1 = 1e-5; + float eps2 = 1e-5; + float scale = 1.0f; + if (CheckPattern(func_graph, equiv, &head_num, &head_size, &eps1, &eps2, &scale)) { return nullptr; } - encoder_layer_prim->Init(head_num, head_size, eps1, eps2, ffn_hidden_size, is_position_bias_, post_layernorm); + encoder_layer_prim->Init(head_num, head_size, eps1, eps2, ffn_hidden_size, is_position_bias_, post_layernorm, scale, + act_type_); return encoder_layer_prim; } CNodePtr EncoderLayerFusion::CreateMaskedEncoderLayerFusionNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv, - const AnfNodePtr &node, - bool post_layernorm = true) const { + const AnfNodePtr &node, bool post_layernorm, + bool mask) const { MS_ASSERT(func_graph != nullptr); MS_ASSERT(equiv != nullptr); MS_ASSERT(node != nullptr); @@ -364,9 +421,7 @@ CNodePtr EncoderLayerFusion::CreateMaskedEncoderLayerFusionNode(const FuncGraphP } auto gamma1 = utils::cast((*equiv)[gamma1_]); auto gamma2 = utils::cast((*equiv)[gamma2_]); - if (mask_) { - input_mask = utils::cast((*equiv)[mask_]); - } + input_mask = mask ? 
utils::cast((*equiv)[mask_]) : nullptr; auto base_shape_ptr = weight_m->Shape(); MS_EXCEPTION_IF_NULL(base_shape_ptr); auto input_shape_ptr = base_shape_ptr->cast(); @@ -380,24 +435,23 @@ CNodePtr EncoderLayerFusion::CreateMaskedEncoderLayerFusionNode(const FuncGraphP MS_CHECK_TRUE_RET(encoder_layer_prim_c != nullptr, nullptr); auto value_node = NewValueNode(encoder_layer_prim_c); MS_CHECK_TRUE_RET(value_node != nullptr, nullptr); - std::vector new_node_inputs; - ParameterPtr c_bias_m_param, c_weight_p_param, c_bias_p_param, c_weight_m_param; + std::vector new_node_inputs = {value_node, input}; if (is_position_bias_) { position_bias = utils::cast((*equiv)[position_bias_]); - if (!post_layernorm) - new_node_inputs = {value_node, input, gamma1, weight_qkv, input_mask, - weight_attn_o, gamma2, weight_m, weight_p, position_bias}; - else - new_node_inputs = {value_node, input, weight_qkv, input_mask, weight_attn_o, - gamma1, weight_m, weight_p, gamma2, position_bias}; + new_node_inputs.insert(new_node_inputs.end(), {gamma1, weight_qkv}); + if (mask) new_node_inputs.push_back(input_mask); + new_node_inputs.insert(new_node_inputs.end(), {position_bias, weight_attn_o, gamma2, weight_m, weight_p}); } else { if (!post_layernorm) { - new_node_inputs = {value_node, input, gamma1, beta1, weight_qkv, bias_attn_qkv, input_mask, weight_attn_o, - bias_attn_o, gamma2, beta2, weight_m, bias_m, weight_p, bias_p}; + new_node_inputs.insert(new_node_inputs.end(), {gamma1, beta1, weight_qkv, bias_attn_qkv}); + if (mask) new_node_inputs.push_back(input_mask); + new_node_inputs.insert(new_node_inputs.end(), + {weight_attn_o, bias_attn_o, gamma2, beta2, weight_m, bias_m, weight_p, bias_p}); } else { - new_node_inputs = {value_node, input, weight_qkv, bias_attn_qkv, input_mask, - weight_attn_o, bias_attn_o, gamma1, beta1, weight_m, - bias_m, weight_p, bias_p, gamma2, beta2}; + new_node_inputs.insert(new_node_inputs.end(), {weight_qkv, bias_attn_qkv}); + if (mask) new_node_inputs.push_back(input_mask); + new_node_inputs.insert(new_node_inputs.end(), {weight_attn_o, bias_attn_o, gamma1, beta1, weight_m, bias_m, + weight_p, bias_p, gamma2, beta2}); } } auto new_node = func_graph->NewCNode(new_node_inputs); diff --git a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h index 56945b850cc67579517cdc1e1bd2cf767eaf22ff..9f93f60396226caa25d3b40d5b73b63a104f3df1 100644 --- a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h +++ b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h @@ -49,16 +49,19 @@ class EncoderLayerFusion : public MultiplePatternProcessPass { const std::string kPatternEncoderLayerPre = "PatternTEncoderLayerPre"; const std::string kPatternEncoderLayerPostNorm = "PatternTEncoderLayerPostNorm"; const std::string kPatternEncoderLayerPreNorm = "PatternTEncoderLayerPreNorm"; - const std::string kPatternEncoderLayerT5 = "PatternEncoderLayerT5"; - VectorRef DefinePatternEncoderLayer(bool post_layernorm, bool layernorm_fusion, bool is_position_bias_) const; - VectorRef getTuple(bool post_layernorm, bool layernorm_fusion, bool is_position_bias) const; - VectorRef DefineLayerNorm(bool is_position_bias, VectorRef input, VarPtr gamma, VarPtr beta) const; + const std::string kPatternEncoderLayerT5Post = "kPatternEncoderLayerT5Post"; + const std::string kPatternEncoderLayerT5Pre = "kPatternEncoderLayerT5Pre"; + VectorRef DefinePatternEncoderLayer(bool post_layernorm, bool layernorm_fusion, bool is_position_bias_, + bool mask) 
const; VectorRef getTuple(bool post_layernorm, bool layernorm_fusion, bool is_position_bias) const; + VectorRef DefineLayerNorm(bool is_position_bias, VectorRef input, VarPtr gamma, VarPtr beta, VarPtr eps) const; CNodePtr CreateMaskedEncoderLayerFusionNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv, - const AnfNodePtr &node, bool post_layernorm) const; + const AnfNodePtr &node, bool post_layernorm = true, + bool mask = true) const; AnfNodePtr GetAttribute(const FuncGraphPtr &func_graph, const EquivPtr &equiv, VarPtr node_name) const; bool IsActGELU(const FuncGraphPtr &func_graph, const EquivPtr &equiv, const VarPtr &input_prim) const; + lite::STATUS GetEps(const EquivPtr &equiv, VarPtr node_name, float *eps) const; lite::STATUS CheckPattern(const FuncGraphPtr &func_graph, const EquivPtr &equiv, int *head_num, int *head_size, - float *eps1, float *eps2) const; + float *eps1, float *eps2, float *scale) const; std::shared_ptr CreatePrim(const FuncGraphPtr &func_graph, const EquivPtr &equiv, bool post_layernorm, int64_t ffn_hidden_size) const; @@ -83,7 +86,11 @@ class EncoderLayerFusion : public MultiplePatternProcessPass { mutable VarPtr is_layernorm1_{nullptr}; mutable VarPtr is_layernorm2_{nullptr}; mutable bool is_position_bias_{false}; + mutable bool is_layernorm_fusion_{false}; + mutable ActType act_type_{ActType::ActType_No}; mutable VarPtr is_act_{nullptr}; + mutable VarPtr eps1_{nullptr}; + mutable VarPtr eps2_{nullptr}; }; } // namespace opt } // namespace mindspore diff --git a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc index 5da662e2abed86a4db5b657fd48da0430071882e..d9b2ed45a1cf7b9602d6a7e3e9a520e66cf2c3ff 100644 --- a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc @@ -386,7 +386,7 @@ VectorRef MultiHeadAttentionFusion::DefineMPWithMaskPatternT5New(bool transpose, return matmul3; } -VectorRef MultiHeadAttentionFusion::DefineMPWithMaskPatternPA() const { +VectorRef MultiHeadAttentionFusion::DefineMPWithMaskPatternPA(bool mask) const { VectorRef k_embedding, v_embedding; auto q_transpose = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTranspose)); MS_CHECK_TRUE_RET(q_transpose != nullptr, {}); @@ -399,14 +399,21 @@ VectorRef MultiHeadAttentionFusion::DefineMPWithMaskPatternPA() const { auto is_matmul1 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimMatMulFusion)); MS_CHECK_TRUE_RET(is_matmul1 != nullptr, {}); auto matmul1 = VectorRef({is_matmul1, q_embedding, k_embedding}); - auto is_add = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion)); - MS_CHECK_TRUE_RET(is_add != nullptr, {}); - auto mask = DefineMask(mask_); - MS_CHECK_TRUE_RET(!mask.empty(), {}); - auto add = VectorRef({is_add, mask, matmul1}); - auto is_softmax = std::make_shared(std::bind(IsOpType, p1, prim::kPrimSoftmax)); - MS_CHECK_TRUE_RET(is_softmax != nullptr, {}); - auto softmax = VectorRef({is_softmax, add}); + VectorRef softmax; + if (mask) { + auto is_add = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion)); + MS_CHECK_TRUE_RET(is_add != nullptr, {}); + auto mask = DefineMask(mask_); + MS_CHECK_TRUE_RET(!mask.empty(), {}); + auto add = VectorRef({is_add, mask, matmul1}); + auto is_softmax = std::make_shared(std::bind(IsOpType, p1, prim::kPrimSoftmax)); + MS_CHECK_TRUE_RET(is_softmax != nullptr, {}); + softmax = VectorRef({is_softmax, add}); + } else { + auto is_softmax = 
std::make_shared(std::bind(IsOpType, p1, prim::kPrimSoftmax)); + MS_CHECK_TRUE_RET(is_softmax != nullptr, {}); + softmax = VectorRef({is_softmax, matmul1}); + } auto is_matmul2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimMatMulFusion)); MS_CHECK_TRUE_RET(is_matmul2 != nullptr, {}); auto matmul2 = VectorRef({is_matmul2, softmax, v_embedding}); @@ -574,6 +581,7 @@ std::unordered_map MultiHeadAttentionFusion::DefinePatte patterns[kMPAWithMaskPatternName] = DefineMPWithMaskPattern(); patterns[kMPAPatternName] = DefineMPWithMaskPattern(false); patterns[kMPAWithMaskPatternNamePA] = DefineMPWithMaskPatternPA(); + patterns[kMPAPatternNamePA] = DefineMPWithMaskPatternPA(false); patterns[kMPAWithMaskPatternNameT5] = DefineMPWithMaskPatternT5(); patterns[kMPAWithMaskPatternNameT5New] = DefineMPWithMaskPatternT5New(false); patterns[kMPAWithMaskPatternNameT5New2] = DefineMPWithMaskPatternT5New(true, true); @@ -605,6 +613,7 @@ bool MultiHeadAttentionFusion::CheckPattern(const EquivPtr &equiv, int *head_num } *head_num = out.at(0); *head_size = out.at(1); + scale_ = 1.0f / sqrtf(*head_size * 1.0f); return true; } @@ -620,10 +629,12 @@ AnfNodePtr MultiHeadAttentionFusion::Process(const std::string &pattern_name, co if (pattern_name == kMPAWithMaskPatternNameT5New || pattern_name == kMPAWithMaskTransposePatternNameT5New || pattern_name == kMPAWithMaskPatternNameT5New2) { t5_x_ = true; + scale_ = (pattern_name == kMPAWithMaskPatternNameT5New2) ? 1.0f : scale_; } return CreateMaskedMultiHeadAttentionNode(func_graph, equiv, node->fullname_with_scope(), true); } - if (pattern_name == kMPAPatternName || pattern_name == kMPAPatternNameSwin1 || pattern_name == kMPAPatternNameSwin2) + if (pattern_name == kMPAPatternName || pattern_name == kMPAPatternNameSwin1 || pattern_name == kMPAPatternNameSwin2 || + pattern_name == kMPAPatternNamePA) return CreateMaskedMultiHeadAttentionNode(func_graph, equiv, node->fullname_with_scope(), false); return nullptr; } @@ -758,7 +769,7 @@ std::shared_ptr MultiHeadAttentionFusion::CreatePrim(const Equiv if (!CheckPattern(equiv, &head_num, &head_size)) { return nullptr; } - attention_prim->Init(head_num, head_size, t5_x_, cross); + attention_prim->Init(head_num, head_size, t5_x_, cross, scale_); return attention_prim; } diff --git a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.h b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.h index ebe365273de2b24443ee432b400879b6f2b98f48..345616ed4aee9b1859074d49ca5bb55d1b446a5a 100644 --- a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.h +++ b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.h @@ -48,7 +48,7 @@ class MultiHeadAttentionFusion : public MultiplePatternProcessPass { private: // define patterns VectorRef DefineMPWithMaskPattern(bool mask = true) const; - VectorRef DefineMPWithMaskPatternPA() const; + VectorRef DefineMPWithMaskPatternPA(bool mask = true) const; VectorRef DefineMPWithMaskPatternT5() const; VectorRef DefineMPWithMaskPatternT5New(bool transpose = true, bool no_div_flag = false) const; VectorRef DefineMPPatternSwin(bool flag = true) const; @@ -91,7 +91,7 @@ class MultiHeadAttentionFusion : public MultiplePatternProcessPass { const std::string kMPAWithMaskTransposePatternNameT5New = "MPAWithMaskTransposePatternT5New"; const std::string kMPAPatternNameSwin1 = "MPAPatternNameSwin1"; const std::string kMPAPatternNameSwin2 = "MPAPatternNameSwin2"; - + const std::string kMPAPatternNamePA = "kMPAPatternNamePA"; mutable VarPtr 
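Aside: the scale recorded by CheckPattern above, scale_ = 1.0f / sqrtf(head_size), is the usual scaled-dot-product factor applied to the QK^T logits before softmax (the T5New2 pattern resets it to 1.0f in Process). A minimal standalone numeric sketch of that formula follows; it is not MindSpore code, and the head_size and logit values are hypothetical.

// Standalone sketch: attention scale 1/sqrt(head_size) applied to one QK^T row before softmax.
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const int head_size = 64;                                              // example value
  const float scale = 1.0f / std::sqrt(static_cast<float>(head_size));   // same formula as CheckPattern

  std::vector<float> logits = {8.0f, 16.0f, 24.0f};                      // hypothetical QK^T row
  float denom = 0.0f;
  for (float& x : logits) {
    x *= scale;                                                          // scaled logits
    denom += std::exp(x);
  }
  for (float x : logits) {
    std::printf("%.4f ", std::exp(x) / denom);                           // softmax over the scaled row
  }
  std::printf("\n");
  return 0;
}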
input_q_{nullptr}; mutable VarPtr input_k_{nullptr}; mutable VarPtr input_v_{nullptr}; @@ -120,6 +120,7 @@ class MultiHeadAttentionFusion : public MultiplePatternProcessPass { mutable VarPtr k_transpose_{nullptr}; mutable bool t5_x_{false}; + mutable float scale_{true}; }; } // namespace opt } // namespace mindspore diff --git a/third_party/patch/fast_transformer/001-fast_transformer.patch b/third_party/patch/fast_transformer/001-fast_transformer.patch index 8816cc4e9bcd658e9d55dd3e8b6bae5d8541c15c..355db7c1bb0317f4a3571d12b45e48969cce501b 100644 --- a/third_party/patch/fast_transformer/001-fast_transformer.patch +++ b/third_party/patch/fast_transformer/001-fast_transformer.patch @@ -132,7 +132,7 @@ index 8707220..c9369e0 100644 target_link_libraries(trt_fused_multi_head_attention PUBLIC -lcublas -lcudart) set_property(TARGET trt_fused_multi_head_attention PROPERTY POSITION_INDEPENDENT_CODE ON) diff --git a/CMakeLists.txt b/CMakeLists.txt -index ea21014..f9e08b8 100644 +index ea21014..e3d61e7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,7 +14,9 @@ @@ -199,7 +199,7 @@ index ea21014..f9e08b8 100644 $ $ $ -+ $ ++ $ $ $ $ @@ -221,7 +221,7 @@ index ea21014..f9e08b8 100644 - $ - $ - $ -+ $ ++ $ $ - $ $ @@ -351,10 +351,10 @@ index a60983c..45b5374 100644 diff --git a/deploy.sh b/deploy.sh new file mode 100755 -index 0000000..ac54401 +index 0000000..0e60c1a --- /dev/null +++ b/deploy.sh -@@ -0,0 +1,32 @@ +@@ -0,0 +1,27 @@ +#copy cuda folder (once) +base=`git rev-parse --show-toplevel` +server=10.10.10.174 @@ -371,16 +371,11 @@ index 0000000..ac54401 +shift +rsync -v ${file} ${server}:${file} +echo "file=${file}" -+rsync -v ${base}/../mindspore/trc/transformer/*.fp32 ${server}:${base}/build/bin ++rsync -v ${base}/../mindspore/trc/transformer/*.fp* ${server}:${base}/build/bin +rsync -v ${base}/build/lib/*.so ${server}:${base}/build/lib +# echo "cd ${base}/build/bin/" +command=$(cat <<-ENDM -+<<<<<<< HEAD -+ CUDA_VISIBLE_DEVICES=0 \ -+ NVIDIA_TF32_OVERRIDE=0 \ -+======= + CUDA_VISIBLE_DEVICES=3 \ -+>>>>>>> origin/bert1 + LD_LIBRARY_PATH=${base}/../FasterTransformer:/usr/local/cuda-11.7/lib64 \ + ${file} $@ +ENDM @@ -427,7 +422,7 @@ index cacb09e..5fec0c9 100644 else if (std::is_same::value) { diff --git a/examples/cpp/ms/CMakeLists.txt b/examples/cpp/ms/CMakeLists.txt new file mode 100644 -index 0000000..eb47b5c +index 0000000..33e562b --- /dev/null +++ b/examples/cpp/ms/CMakeLists.txt @@ -0,0 +1,22 @@ @@ -448,23 +443,23 @@ index 0000000..eb47b5c +add_executable(ms_benchmark ms.cc) +if (SPARSITY_SUPPORT) +# target_link_libraries(ms_benchmark PUBLIC -lcublas -lcublasLt -lcudart -lcusparse -lcusparseLt transformer-shared) -+target_link_libraries(ms_benchmark PUBLIC -lcublas -lcublasLt -lcudart -lcusparse -lcusparseLt GptContextAttentionLayer EncoderLayer) ++target_link_libraries(ms_benchmark PUBLIC -lcublas -lcublasLt -lcudart -lcusparse -lcusparseLt GptContextAttentionLayer MSLayer) +else() +# target_link_libraries(ms_benchmark PUBLIC -lcublas -lcublasLt -lcudart transformer-shared) -+target_link_libraries(ms_benchmark PUBLIC -lcublas -lcublasLt -lcudart GptContextAttentionLayer EncoderLayer) ++target_link_libraries(ms_benchmark PUBLIC -lcublas -lcublasLt -lcudart GptContextAttentionLayer MSLayer) +endif() diff --git a/examples/cpp/ms/initialize.h b/examples/cpp/ms/initialize.h new file mode 100644 -index 0000000..9bcf4eb +index 0000000..8ee1c95 --- /dev/null +++ b/examples/cpp/ms/initialize.h -@@ -0,0 +1,643 @@ +@@ -0,0 +1,969 @@ +#pragma once + -+#include 
"src/fastertransformer/layers/attention_layers/AttentionWeight.h" -+#include "src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h" -+#include "src/fastertransformer/layers/encoder_layers/EncoderLayerWeight.h" -+#include "src/fastertransformer/layers/encoder_layers/MSEncoderLayer.h" ++#include "src/fastertransformer/layers/ms_layers/MSLayerWeight.h" ++#include "src/fastertransformer/layers/ms_layers/MSAttentionLayer.h" ++#include "src/fastertransformer/layers/ms_layers/MSDecoderLayer.h" ++#include "src/fastertransformer/layers/ms_layers/MSEncoderLayer.h" +using namespace fastertransformer; +struct opt_arg { + size_t batch_size; @@ -476,7 +471,11 @@ index 0000000..9bcf4eb + size_t size_per_head; + float eps1; + float eps2; ++ float eps3; ++ bool position_bias1; ++ bool position_bias2; + bool post_layernorm_residual; ++ bool is_ffn_fp16; + bool is_remove_padding; + std::string model_name; + std::string compute_type; @@ -491,7 +490,7 @@ index 0000000..9bcf4eb + std::vector output_tensors; // GPU + std::vector output_python_tensors; // CPU + std::vector w_tensors; -+ BaseAttentionLayer* Attn; ++ MSBaseLayer* Attn; + // +}; +template @@ -501,17 +500,29 @@ index 0000000..9bcf4eb + std::vector output_tensors; // GPU + std::vector output_python_tensors; // CPU + std::vector w_tensors; -+ BaseEncoderLayer* Encoder; ++ MSBaseLayer* Encoder; ++ // ++}; ++template ++struct DecriptorDecoderLayer { ++ std::vector input_tensors; // GPU ++ std::vector input_python_tensors; // CPU ++ std::vector output_tensors; // GPU ++ std::vector output_python_tensors; // CPU ++ std::vector w_tensors; ++ MSBaseLayer* Decoder; + // +}; -+ +typedef enum { -+ MHA_X1 = 1, // AttnIn + AttnMask -+ MHA_X2, // AttnIn + EncOut -- same seq size + AttnMask -+ MHA_CROSS, // AttnIn + EncOut + AttnMAsk -+ MHA_T5, // AttnIn + EncOut + AttnMAsk + position_bias -+ MHA_T5_CROSS, // AttnIn + EncOut + AttnMAsk + position_bias -+ TEL, // transformer encoder layer ++ MHA_X1 = 1, // AttnIn + AttnMask ++ MHA_X2, // AttnIn + EncOut -- same seq size + AttnMask ++ MHA_CROSS, // AttnIn + EncOut + AttnMAsk ++ MHA_T5, // AttnIn + EncOut + AttnMAsk + position_bias ++ MHA_T5_CROSS, // AttnIn + EncOut + AttnMAsk + position_bias ++ TEL, // transformer encoder layer ++ TEL_T5, // transformer encoder layer ++ TDL, ++ TDL_T5, +} MODEL_TEST_ID_E; + +int ModelNum(std::string model_name) @@ -527,11 +538,23 @@ index 0000000..9bcf4eb + } + else if (model_name == "mha_T5") { + return MHA_T5; -+ } else if (model_name == "mha_T5_cross") { ++ } ++ else if (model_name == "mha_T5_cross") { + return MHA_T5_CROSS; -+ } else if (model_name == "transformer_encoder_layer") { ++ } ++ else if (model_name == "transformer_encoder_layer") { + return TEL; -+ } else { ++ } ++ else if (model_name == "transformer_encoder_layer_t5") { ++ return TEL_T5; ++ } ++ else if (model_name == "transformer_decoder_layer") { ++ return TDL; ++ } ++ else if (model_name == "transformer_decoder_layer_t5") { ++ return TDL_T5; ++ } ++ else { + return -1; + } +} @@ -547,37 +570,29 @@ index 0000000..9bcf4eb + + // TODO Nizzan - check if need to be + desc.Attn = new MSMHALayer(opt_a->batch_size, -+ opt_a->seq_len, -+ opt_a->tgt_seq_len, -+ opt_a->head_num, -+ opt_a->size_per_head, -+ stream, -+ cublas_wrapper, -+ allocator, -+ false, // free buffer after fwd -+ true, // is_qk_buf_float_ -+ false, //is_cross -+ false, // sparse -+ false); // is_position_bias ++ opt_a->seq_len, ++ opt_a->tgt_seq_len, ++ opt_a->head_num, ++ opt_a->size_per_head, ++ stream, ++ cublas_wrapper, ++ 
allocator, ++ false, // free buffer after fwd ++ true, // is_qk_buf_float_ ++ false, // is_cross ++ false, // sparse ++ false); // is_position_bias + -+ desc.input_tensors.push_back(Tensor{MEMORY_GPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size * opt_a->seq_len,hidden_units}, -+ 0}); -+ desc.input_tensors.push_back(Tensor{MEMORY_GPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->seq_len}, -+ 0}); -+ -+ desc.input_python_tensors.push_back(Tensor{MEMORY_CPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size * opt_a->seq_len,hidden_units}, -+ 0}); -+ -+ desc.input_python_tensors.push_back(Tensor{MEMORY_CPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->seq_len}, -+ 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->seq_len, hidden_units}, 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->seq_len}, 0}); ++ ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->seq_len, hidden_units}, 0}); ++ ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->seq_len}, 0}); + // desc.input_python_tensors.push_back(Tensor{MEMORY_CPU, + // getTensorType(), + // std::vector{opt_a->batch_size * opt_a->seq_len,hidden_units}, @@ -630,23 +645,21 @@ index 0000000..9bcf4eb + const size_t hidden_units = opt_a->head_num * opt_a->size_per_head; + + desc.Attn = new MSMHALayer(opt_a->batch_size, -+ opt_a->seq_len, -+ opt_a->tgt_seq_len, -+ opt_a->head_num, -+ opt_a->size_per_head, -+ stream, -+ cublas_wrapper, -+ allocator, -+ false, // free buffer after fwd -+ true, // is_qk_buf_float_ -+ false, //is_cross -+ false, // sparse -+ false); // is_position_bias ++ opt_a->seq_len, ++ opt_a->tgt_seq_len, ++ opt_a->head_num, ++ opt_a->size_per_head, ++ stream, ++ cublas_wrapper, ++ allocator, ++ false, // free buffer after fwd ++ true, // is_qk_buf_float_ ++ false, // is_cross ++ false, // sparse ++ false); // is_position_bias + -+ desc.input_tensors.push_back(Tensor{MEMORY_GPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size * opt_a->seq_len, hidden_units}, -+ 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->seq_len, hidden_units}, 0}); + + // GPU RESULTS + desc.output_tensors.push_back(Tensor{ @@ -695,55 +708,53 @@ index 0000000..9bcf4eb + const size_t hidden_units = opt_a->head_num * opt_a->size_per_head; + + desc.Attn = new MSMHALayer(opt_a->batch_size, -+ opt_a->seq_len, -+ opt_a->tgt_seq_len, -+ opt_a->head_num, -+ opt_a->size_per_head, -+ stream, -+ cublas_wrapper, -+ allocator, -+ false, // free buffer after fwd -+ true, // is_qk_buf_float_ -+ true, //is_cross -+ false, // sparse -+ false); // is_position_bias ++ opt_a->seq_len, ++ opt_a->tgt_seq_len, ++ opt_a->head_num, ++ opt_a->size_per_head, ++ stream, ++ cublas_wrapper, ++ allocator, ++ false, // free buffer after fwd ++ true, // is_qk_buf_float_ ++ true, // is_cross ++ false, // sparse ++ false); // is_position_bias + -+ desc.input_tensors.push_back(Tensor{MEMORY_GPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size*opt_a->seq_len, hidden_units}, -+ 0}); + desc.input_tensors.push_back(Tensor{ -+ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size* opt_a->tgt_seq_len, hidden_units}, 0}); ++ MEMORY_GPU, 
getTensorType(), std::vector{opt_a->batch_size * opt_a->seq_len, hidden_units}, 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->tgt_seq_len, hidden_units}, 0}); + + desc.input_tensors.push_back(Tensor{ + MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->tgt_seq_len}, 0}); -+ desc.input_python_tensors.push_back(Tensor{MEMORY_CPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size*opt_a->seq_len, hidden_units}, -+ 0}); + desc.input_python_tensors.push_back(Tensor{ -+ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size* opt_a->tgt_seq_len, hidden_units}, 0}); ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->seq_len, hidden_units}, 0}); ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->tgt_seq_len, hidden_units}, 0}); + + desc.input_python_tensors.push_back(Tensor{ + MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->tgt_seq_len}, 0}); + -+ + // GPU RESULTS + + desc.output_tensors.push_back(Tensor{ + MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, hidden_units}, 0}); + // desc.output_tensors.push_back(Tensor{ -+ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + // desc.output_tensors.push_back(Tensor{ -+ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + + desc.output_python_tensors.push_back(Tensor{ + MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, hidden_units}, 0}); + // desc.output_python_tensors.push_back(Tensor{ -+ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + // desc.output_python_tensors.push_back(Tensor{ -+ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); -+ ++ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + + desc.w_tensors.push_back( + Tensor{MEMORY_GPU, getTensorType(), std::vector{hidden_units, hidden_units}, 0}); @@ -764,68 +775,67 @@ index 0000000..9bcf4eb + const size_t hidden_units = opt_a->head_num * opt_a->size_per_head; + + desc.Attn = new MSMHALayer(opt_a->batch_size, -+ opt_a->seq_len, -+ opt_a->tgt_seq_len, -+ opt_a->head_num, -+ opt_a->size_per_head, -+ stream, -+ cublas_wrapper, -+ allocator, -+ false, // free buffer after fwd -+ true, // is_qk_buf_float_ -+ false, //is_cross -+ false, // sparse -+ true); // is_position_bias ++ opt_a->seq_len, ++ opt_a->tgt_seq_len, ++ opt_a->head_num, ++ opt_a->size_per_head, ++ stream, ++ cublas_wrapper, ++ allocator, ++ false, // free buffer after fwd ++ true, // is_qk_buf_float_ ++ false, // is_cross ++ false, // sparse ++ true); // is_position_bias + -+ desc.input_tensors.push_back(Tensor{MEMORY_GPU, -+ getTensorType(), -+ 
std::vector{opt_a->batch_size * opt_a->seq_len,hidden_units}, -+ 0}); -+ desc.input_tensors.push_back(Tensor{MEMORY_GPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->seq_len}, -+ 0}); -+ -+ desc.input_tensors.push_back(Tensor{MEMORY_GPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, -+ 0}); -+ -+ desc.input_python_tensors.push_back(Tensor{MEMORY_CPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size * opt_a->seq_len,hidden_units}, -+ 0}); -+ -+ desc.input_python_tensors.push_back(Tensor{MEMORY_CPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->seq_len}, -+ 0}); -+ -+ desc.input_python_tensors.push_back(Tensor{MEMORY_CPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, -+ 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->seq_len, hidden_units}, 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->seq_len}, 0}); ++ ++ desc.input_tensors.push_back( ++ Tensor{MEMORY_GPU, ++ getTensorType(), ++ std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, ++ 0}); ++ ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->seq_len, hidden_units}, 0}); ++ ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->seq_len}, 0}); + ++ desc.input_python_tensors.push_back( ++ Tensor{MEMORY_CPU, ++ getTensorType(), ++ std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, ++ 0}); + + // GPU RESULTS + + desc.output_tensors.push_back(Tensor{ + MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, hidden_units}, 0}); + // desc.output_tensors.push_back(Tensor{ -+ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + // desc.output_tensors.push_back(Tensor{ -+ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + // desc.output_tensors.push_back(Tensor{ -+ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len},0}); ++ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, ++ // opt_a->tgt_seq_len},0}); + + desc.output_python_tensors.push_back(Tensor{ + MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, hidden_units}, 0}); + // desc.output_python_tensors.push_back(Tensor{ -+ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + // desc.output_python_tensors.push_back(Tensor{ -+ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ // 
MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + // desc.output_python_tensors.push_back(Tensor{ -+ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, 0}); ++ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, ++ // opt_a->tgt_seq_len}, 0}); + + desc.w_tensors.push_back( + Tensor{MEMORY_GPU, getTensorType(), std::vector{hidden_units, 3 * hidden_units}, 0}); @@ -833,88 +843,89 @@ index 0000000..9bcf4eb + Tensor{MEMORY_GPU, getTensorType(), std::vector{hidden_units, hidden_units}, 0}); +} + -+template ++template +void InitializeAttnT5Cross(opt_arg* opt_a, -+ DecriptorTest &desc, -+ cudaStream_t stream, -+ cublasMMWrapper* cublas_wrapper, -+ Allocator* allocator) { ++ DecriptorTest& desc, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ Allocator* allocator) ++{ + const size_t hidden_units = opt_a->head_num * opt_a->size_per_head; + + desc.Attn = new MSMHALayer(opt_a->batch_size, -+ opt_a->seq_len, -+ opt_a->tgt_seq_len, -+ opt_a->head_num, -+ opt_a->size_per_head, -+ stream, -+ cublas_wrapper, -+ allocator, -+ false, // free buffer after fwd -+ true, // is_qk_buf_float_ -+ true, //is_cross -+ false, // sparse -+ true); // is_position_bias ++ opt_a->seq_len, ++ opt_a->tgt_seq_len, ++ opt_a->head_num, ++ opt_a->size_per_head, ++ stream, ++ cublas_wrapper, ++ allocator, ++ false, // free buffer after fwd ++ true, // is_qk_buf_float_ ++ true, // is_cross ++ false, // sparse ++ true); // is_position_bias + -+ desc.input_tensors.push_back(Tensor{MEMORY_GPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size * opt_a->seq_len,hidden_units}, -+ 0}); -+ -+ desc.input_tensors.push_back(Tensor{MEMORY_GPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size * opt_a->tgt_seq_len, hidden_units}, -+ 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->seq_len, hidden_units}, 0}); ++ ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->tgt_seq_len, hidden_units}, 0}); + + desc.input_tensors.push_back(Tensor{MEMORY_GPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->tgt_seq_len}, -+ 0}); -+ -+ desc.input_tensors.push_back(Tensor{MEMORY_GPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, -+ 0}); -+ -+ desc.input_python_tensors.push_back(Tensor{MEMORY_CPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size * opt_a->seq_len,hidden_units}, -+ 0}); -+ -+ desc.input_python_tensors.push_back(Tensor{MEMORY_CPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size * opt_a->tgt_seq_len, hidden_units}, -+ 0}); -+ -+ desc.input_python_tensors.push_back(Tensor{MEMORY_CPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->tgt_seq_len}, -+ 0}); -+ -+ desc.input_python_tensors.push_back(Tensor{MEMORY_CPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, -+ 0}); ++ getTensorType(), ++ std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->tgt_seq_len}, ++ 0}); ++ ++ desc.input_tensors.push_back( ++ Tensor{MEMORY_GPU, ++ getTensorType(), ++ std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, ++ 0}); ++ ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), 
std::vector{opt_a->batch_size * opt_a->seq_len, hidden_units}, 0}); ++ ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->tgt_seq_len, hidden_units}, 0}); ++ ++ desc.input_python_tensors.push_back( ++ Tensor{MEMORY_CPU, ++ getTensorType(), ++ std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->tgt_seq_len}, ++ 0}); + ++ desc.input_python_tensors.push_back( ++ Tensor{MEMORY_CPU, ++ getTensorType(), ++ std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, ++ 0}); + + // GPU RESULTS + + desc.output_tensors.push_back(Tensor{ + MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, hidden_units}, 0}); + // desc.output_tensors.push_back(Tensor{ -+ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + // desc.output_tensors.push_back(Tensor{ -+ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + // desc.output_tensors.push_back(Tensor{ -+ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len},0}); ++ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, ++ // opt_a->tgt_seq_len},0}); + + desc.output_python_tensors.push_back(Tensor{ + MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, hidden_units}, 0}); + // desc.output_python_tensors.push_back(Tensor{ -+ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + // desc.output_python_tensors.push_back(Tensor{ -+ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + // desc.output_python_tensors.push_back(Tensor{ -+ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, 0}); ++ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, ++ // opt_a->tgt_seq_len}, 0}); + + desc.w_tensors.push_back( + Tensor{MEMORY_GPU, getTensorType(), std::vector{hidden_units, hidden_units}, 0}); @@ -944,6 +955,8 @@ index 0000000..9bcf4eb + opt_a->eps1, + opt_a->eps2, + opt_a->post_layernorm_residual, ++ false, ++ opt_a->is_ffn_fp16, + stream, + cublas_wrapper, + cublas_handle, @@ -985,6 +998,256 @@ index 0000000..9bcf4eb + Tensor{MEMORY_GPU, getTensorType(), std::vector{opt_a->ffn_hidden_size, opt_a->hidden_size}, 0}); + desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{opt_a->hidden_size}, 0}); +} ++template ++void InitializeEncoderT5(opt_arg* opt_a, ++ DecriptorEncoderLayer& desc, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ cublasHandle_t* cublas_handle, ++ Allocator* allocator) ++{ ++ // const size_t hidden_units = opt_a->head_num * 
opt_a->size_per_head; ++ const size_t hidden_units = opt_a->hidden_size; ++ // TODO Nizzan - check if need to be ++ desc.Encoder = new MSELayer(opt_a->batch_size, ++ opt_a->seq_len, ++ opt_a->tgt_seq_len, ++ opt_a->head_num, ++ opt_a->size_per_head, ++ opt_a->ffn_hidden_size, ++ opt_a->eps1, ++ opt_a->eps2, ++ opt_a->post_layernorm_residual, ++ true, ++ opt_a->is_ffn_fp16, ++ stream, ++ cublas_wrapper, ++ cublas_handle, ++ allocator, ++ false, // free buffer after fwd ++ true, // is_qk_buf_float_ ++ false); // sparse ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->seq_len}, 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, 0}); ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->seq_len}, 0}); ++desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, 0}); ++ desc.output_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ ++ desc.output_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{opt_a->hidden_size}, 0}); //g1 ++ desc.w_tensors.push_back( ++ Tensor{MEMORY_GPU, getTensorType(), std::vector{hidden_units, 3 * hidden_units}, 0}); //wt ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{hidden_units, hidden_units}, 0});//wp ++ desc.w_tensors.push_back( ++ Tensor{MEMORY_GPU, getTensorType(), std::vector{opt_a->hidden_size}, 0});//g2 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{opt_a->hidden_size, opt_a->ffn_hidden_size}, 0}); ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{opt_a->ffn_hidden_size, opt_a->hidden_size}, 0}); ++ ++ ++} ++ ++template ++void InitializeDecoder(opt_arg* opt_a, ++ DecriptorDecoderLayer& desc, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ cublasHandle_t* cublas_handle, ++ Allocator* allocator) ++{ ++ const size_t hidden_units = opt_a->head_num * opt_a->size_per_head; ++ std::cout<<"hidden_units: "< ++ desc.Decoder = new MSDLayer(opt_a->batch_size, ++ opt_a->seq_len, ++ opt_a->tgt_seq_len, ++ opt_a->head_num, ++ opt_a->size_per_head, ++ opt_a->ffn_hidden_size, ++ opt_a->eps1, ++ opt_a->eps2, ++ opt_a->eps3, ++ opt_a->post_layernorm_residual, ++ opt_a->position_bias1, ++ opt_a->position_bias2, ++ opt_a->is_ffn_fp16, ++ stream, ++ cublas_wrapper, ++ cublas_handle, ++ allocator, ++ false, // free buffer after fwd ++ true, // is_qk_buf_float_ ++ false); // sparse ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU,getTensorType(),std::vector{opt_a->batch_size, opt_a->tgt_seq_len, opt_a->hidden_size},0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->seq_len}, 0}); ++ 
desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->tgt_seq_len, opt_a->seq_len}, 0}); ++ ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU,getTensorType(),std::vector{opt_a->batch_size, opt_a->tgt_seq_len, opt_a->hidden_size},0}); ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->seq_len}, 0}); ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->tgt_seq_len, opt_a->seq_len}, 0}); ++ ++ // desc.output_tensors.push_back(Tensor{ ++ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ ++ // desc.output_python_tensors.push_back(Tensor{ ++ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ ++ desc.output_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{640/4}, 0}); ++ ++ desc.output_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{640/4}, 0}); ++ ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size}, 0}); //G1 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size}, 0}); //B1 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units, 3 * hidden_units}, 0});//wt ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ 3 * hidden_units}, 0});//bt ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units, hidden_units}, 0});//wp ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units}, 0});//bp ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size}, 0});//g1 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size}, 0});//b2 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units, hidden_units}, 0}); ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units , hidden_units * 2}, 0});//bt2 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units * 3}, 0}); ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units, hidden_units}, 0});//wp2 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units}, 0});//bp2 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size}, 0});//g3 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size}, 0});//b3 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size, opt_a->ffn_hidden_size}, 0});//wm ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->ffn_hidden_size}, 0});//bm ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size, opt_a->ffn_hidden_size}, 0});;//wp ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), 
std::vector{ ++ opt_a->hidden_size}, 0});//bp ++} ++template ++void InitializeDecoderT5(opt_arg* opt_a, ++ DecriptorDecoderLayer& desc, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ cublasHandle_t* cublas_handle, ++ Allocator* allocator) ++{ ++ const size_t hidden_units = opt_a->head_num * opt_a->size_per_head; ++ std::cout<<"hidden_units: "< ++ desc.Decoder = new MSDLayer(opt_a->batch_size, ++ opt_a->seq_len, ++ opt_a->tgt_seq_len, ++ opt_a->head_num, ++ opt_a->size_per_head, ++ opt_a->ffn_hidden_size, ++ opt_a->eps1, ++ opt_a->eps2, ++ opt_a->eps3, ++ opt_a->post_layernorm_residual, ++ opt_a->position_bias1, ++ opt_a->position_bias2, ++ opt_a->is_ffn_fp16, ++ stream, ++ cublas_wrapper, ++ cublas_handle, ++ allocator, ++ false, // free buffer after fwd ++ true, // is_qk_buf_float_ ++ false); // sparse ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU,getTensorType(),std::vector{opt_a->batch_size, opt_a->tgt_seq_len, opt_a->hidden_size},0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->seq_len}, 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->tgt_seq_len, opt_a->seq_len}, 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, 0}); ++ ++ ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU,getTensorType(),std::vector{opt_a->batch_size, opt_a->tgt_seq_len, opt_a->hidden_size},0}); ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->seq_len}, 0}); ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->tgt_seq_len, opt_a->seq_len}, 0}); ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, 0}); ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, 0}); ++ ++ // desc.output_tensors.push_back(Tensor{ ++ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ ++ // desc.output_python_tensors.push_back(Tensor{ ++ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ ++ desc.output_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{640/4}, 0}); ++ ++ desc.output_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{640/4}, 0}); ++ ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size}, 0}); //G1 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units, 3 * hidden_units}, 0});//wt ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units, hidden_units}, 
0});//wp ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size}, 0});//g1 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units, hidden_units}, 0}); ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units , hidden_units * 2}, 0}); ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units, hidden_units}, 0});//wp2 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size}, 0});//g3 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size, opt_a->ffn_hidden_size}, 0});//wm ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size, opt_a->ffn_hidden_size}, 0});;//wp ++} + +template +void Init(opt_arg* opt_a, @@ -999,32 +1262,16 @@ index 0000000..9bcf4eb + InitializeAttn(opt_a, desc, stream, cublas_wrapper, allocator); + break; + case MHA_X2: -+ InitializeAttnX2(opt_a, -+ desc, -+ stream, -+ cublas_wrapper, -+ allocator); ++ InitializeAttnX2(opt_a, desc, stream, cublas_wrapper, allocator); + break; + case MHA_CROSS: -+ InitializeAttnCross(opt_a, -+ desc, -+ stream, -+ cublas_wrapper, -+ allocator); ++ InitializeAttnCross(opt_a, desc, stream, cublas_wrapper, allocator); + break; + case MHA_T5: -+ InitializeAttnT5(opt_a, -+ desc, -+ stream, -+ cublas_wrapper, -+ allocator); ++ InitializeAttnT5(opt_a, desc, stream, cublas_wrapper, allocator); + break; + case MHA_T5_CROSS: -+ InitializeAttnT5Cross(opt_a, -+ desc, -+ stream, -+ cublas_wrapper, -+ allocator); ++ InitializeAttnT5Cross(opt_a, desc, stream, cublas_wrapper, allocator); + break; + default: + break; @@ -1043,13 +1290,37 @@ index 0000000..9bcf4eb + case TEL: + InitializeEncoder(opt_a, desc, stream, cublas_wrapper, cublas_handle, allocator); + break; ++ case TEL_T5: ++ InitializeEncoderT5(opt_a, desc, stream, cublas_wrapper, cublas_handle, allocator); ++ break; ++ default: ++ break; ++ } ++} ++ ++template ++void InitD(opt_arg* opt_a, ++ DecriptorDecoderLayer& desc, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ cublasHandle_t* cublas_handle, ++ Allocator* allocator) ++{ ++ int model_num = ModelNum(opt_a->model_name); ++ switch (model_num) { ++ case TDL: ++ InitializeDecoder(opt_a, desc, stream, cublas_wrapper, cublas_handle, allocator); ++ break; ++ case TDL_T5: ++ InitializeDecoderT5(opt_a, desc, stream, cublas_wrapper, cublas_handle, allocator); ++ break; + default: + break; + } +} + +template -+void InitWeight(opt_arg* opt_a, AttentionWeight& attn_weights, std::vector w_tensors) ++void InitWeight(opt_arg* opt_a, AttentionLayerWeight& attn_weights, std::vector w_tensors) +{ + int modelId = ModelNum(opt_a->model_name); + if (modelId == MHA_X1) { @@ -1064,18 +1335,21 @@ index 0000000..9bcf4eb + attn_weights.key_weight.kernel = (const T*)w_tensors[2].data; + attn_weights.attention_output_weight.kernel = (const T*)w_tensors[3].data; + attn_weights.attention_output_weight.bias = (const T*)w_tensors[4].data; -+ } else if (modelId==MHA_T5) { ++ } ++ else if (modelId == MHA_T5) { + attn_weights.query_weight.kernel = (const T*)w_tensors[0].data; + attn_weights.query_weight.bias = nullptr; + attn_weights.attention_output_weight.kernel = (const T*)w_tensors[1].data; + attn_weights.attention_output_weight.bias = nullptr; -+ } else if (modelId==MHA_T5_CROSS) { ++ } ++ else if (modelId == MHA_T5_CROSS) { + attn_weights.query_weight.kernel = (const 
T*)w_tensors[0].data; + attn_weights.query_weight.bias = nullptr; + attn_weights.key_weight.kernel = (const T*)w_tensors[1].data; + attn_weights.attention_output_weight.kernel = (const T*)w_tensors[2].data; + attn_weights.attention_output_weight.bias = nullptr; -+ } else { ++ } ++ else { + // return ERROR illegal model ! + } +} @@ -1085,10 +1359,10 @@ index 0000000..9bcf4eb +{ + int modelId = ModelNum(opt_a->model_name); + if (modelId == TEL) { -+ encoder_weights.qkv_weight.kernel = (const T*)w_tensors[2].data; -+ encoder_weights.qkv_weight.bias = (const T*)w_tensors[3].data; -+ encoder_weights.attention_layer_output_weight.kernel = (const T*)w_tensors[4].data; -+ encoder_weights.attention_layer_output_weight.bias = (const T*)w_tensors[5].data; ++ encoder_weights.attention.query_weight.kernel = (const T*)w_tensors[2].data; ++ encoder_weights.attention.query_weight.bias = (const T*)w_tensors[3].data; ++ encoder_weights.attention.attention_output_weight.kernel = (const T*)w_tensors[4].data; ++ encoder_weights.attention.attention_output_weight.bias = (const T*)w_tensors[5].data; + encoder_weights.layernorm1.gamma = (const T*)w_tensors[0].data; + encoder_weights.layernorm1.beta = (const T*)w_tensors[1].data; + encoder_weights.layernorm2.gamma = (const T*)w_tensors[6].data; @@ -1098,16 +1372,63 @@ index 0000000..9bcf4eb + encoder_weights.encoder_output_mapping.bias = (const T*)w_tensors[9].data; + encoder_weights.encoder_output_projection.bias = (const T*)w_tensors[11].data; + } ++ else if (modelId == TEL_T5){ ++ encoder_weights.attention.query_weight.kernel = (const T*)w_tensors[2].data; ++ encoder_weights.attention.attention_output_weight.kernel = (const T*)w_tensors[3].data; ++ encoder_weights.layernorm1.gamma = (const T*)w_tensors[0].data; ++ encoder_weights.layernorm2.gamma = (const T*)w_tensors[4].data; ++ encoder_weights.encoder_output_mapping.kernel = (const T*)w_tensors[5].data; ++ encoder_weights.encoder_output_projection.kernel = (const T*)w_tensors[6].data; ++ } ++} ++template ++void InitWeightDecoder(opt_arg* opt_a, DecoderLayerWeight& decoder_weights, std::vector w_tensors) ++{ ++ int modelId = ModelNum(opt_a->model_name); ++ if (modelId == TDL) { ++ decoder_weights.layernorm1.gamma = (const T*)w_tensors[0].data; ++ decoder_weights.layernorm1.beta = (const T*)w_tensors[1].data; ++ decoder_weights.attention.query_weight.kernel = (const T*)w_tensors[2].data; ++ decoder_weights.attention.query_weight.bias = (const T*)w_tensors[3].data; ++ decoder_weights.attention.attention_output_weight.kernel = (const T*)w_tensors[4].data; ++ decoder_weights.attention.attention_output_weight.bias = (const T*)w_tensors[5].data; ++ decoder_weights.layernorm2.gamma = (const T*)w_tensors[6].data; ++ decoder_weights.layernorm2.beta = (const T*)w_tensors[7].data; ++ decoder_weights.cross_attention.query_weight.kernel = (const T*)w_tensors[8].data; ++ decoder_weights.cross_attention.key_weight.kernel = (const T*)w_tensors[9].data; ++ decoder_weights.cross_attention.query_weight.bias = (const T*)w_tensors[10].data; ++ decoder_weights.cross_attention.key_weight.bias = (const T*)w_tensors[10].data; ++ decoder_weights.cross_attention.attention_output_weight.kernel = (const T*)w_tensors[11].data; ++ decoder_weights.cross_attention.attention_output_weight.bias = (const T*)w_tensors[12].data; ++ decoder_weights.layernorm3.gamma = (const T*)w_tensors[13].data; ++ decoder_weights.layernorm3.beta = (const T*)w_tensors[14].data; ++ decoder_weights.decoder_output_mapping.kernel = (const T*)w_tensors[15].data; ++ 
decoder_weights.decoder_output_mapping.bias = (const T*)w_tensors[16].data; ++ decoder_weights.decoder_output_projection.kernel = (const T*)w_tensors[17].data; ++ decoder_weights.decoder_output_projection.bias = (const T*)w_tensors[18].data; ++ } ++ else if (modelId == TDL_T5) { ++ decoder_weights.layernorm1.gamma = (const T*)w_tensors[0].data; ++ decoder_weights.attention.query_weight.kernel = (const T*)w_tensors[1].data; ++ decoder_weights.attention.attention_output_weight.kernel = (const T*)w_tensors[2].data; ++ decoder_weights.layernorm2.gamma = (const T*)w_tensors[3].data; ++ decoder_weights.cross_attention.query_weight.kernel = (const T*)w_tensors[4].data; ++ decoder_weights.cross_attention.key_weight.kernel = (const T*)w_tensors[5].data; ++ decoder_weights.cross_attention.attention_output_weight.kernel = (const T*)w_tensors[6].data; ++ decoder_weights.layernorm3.gamma = (const T*)w_tensors[7].data; ++ decoder_weights.decoder_output_mapping.kernel = (const T*)w_tensors[8].data; ++ decoder_weights.decoder_output_projection.kernel = (const T*)w_tensors[9].data; ++ } + else { + // return ERROR illegal model ! + } +} diff --git a/examples/cpp/ms/ms.cc b/examples/cpp/ms/ms.cc new file mode 100644 -index 0000000..2b12bd5 +index 0000000..4ad059a --- /dev/null +++ b/examples/cpp/ms/ms.cc -@@ -0,0 +1,591 @@ +@@ -0,0 +1,671 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. + * @@ -1124,10 +1445,10 @@ index 0000000..2b12bd5 + * limitations under the License. + */ +#include "examples/cpp/ms/initialize.h" -+#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h" -+#include "src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h" -+#include "src/fastertransformer/layers/encoder_layers/EncoderLayerWeight.h" -+#include "src/fastertransformer/layers/encoder_layers/MSEncoderLayer.h" ++// #include "src/fastertransformer/layers/attention_layers/MSLayerWeight.h" ++// #include "src/fastertransformer/layers/ms_layers/MSAttentionLayer.h" ++// #include "src/fastertransformer/layers/ms_layers/MSEncoderLayer.h" ++// #include "src/fastertransformer/layers/ms_layers/MSDecoderLayer.h" +#include "src/fastertransformer/utils/logger.h" +#include +#include @@ -1147,7 +1468,7 @@ index 0000000..2b12bd5 +bool read_args(int argc, char* argv[], opt_arg* opt_a) +{ + int opt; -+ while ((opt = getopt(argc, argv, "b:l:s:t:H:S:p:m:T:W:F:i:w:f:P:e1:e2")) != -1) { ++ while ((opt = getopt(argc, argv, "b:l:s:t:H:S:p:m:T:W:F:i:w:f:P:x:1:2:3")) != -1) { + switch (opt) { + case 'b': + opt_a->batch_size = atoi(optarg); @@ -1188,6 +1509,9 @@ index 0000000..2b12bd5 + case '2': + opt_a->eps2 = atoi(optarg); + break; ++ case '3': ++ opt_a->eps3 = atoi(optarg); ++ break; + case 'P': + if (atoi(optarg) == 1) + opt_a->post_layernorm_residual=true; @@ -1197,8 +1521,14 @@ index 0000000..2b12bd5 + case 'p': + opt_a->is_remove_padding = bool(optarg); + break; -+ case 'i': -+ case 'w': ++ case 'x': ++ if (atoi(optarg) == 1) ++ opt_a->is_ffn_fp16=true; ++ else if (atoi(optarg) == 0) ++ opt_a->is_ffn_fp16=false; ++ break; ++ case 'i': ++ case 'w': + break; + case 'h': + default: @@ -1227,13 +1557,14 @@ index 0000000..2b12bd5 + opt_a.ffn_hidden_size = -1; + opt_a.eps1 = 1e-6f; + opt_a.eps2 = 1e-6f; ++ opt_a.eps3 = 1e-6f; + opt_a.post_layernorm_residual = true; + opt_a.is_remove_padding = false; + opt_a.model_name = ""; + opt_a.compute_type = "fp32"; + opt_a.w_compute_type = "fp32"; + opt_a.s_compute_type = "fp32"; -+ ++ opt_a.is_ffn_fp16 = false; + + if (read_args(argc, argv, &opt_a)) { 
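Aside on the switches parsed by read_args above: the visible cases add -x (is_ffn_fp16), -P (post-layernorm residual) and numeric switches for the layernorm eps values, but the eps arguments are read with atoi, so fractional values such as 1e-6 truncate to zero, and the optstring appears to end in "3" without the trailing colon an argument-taking switch needs. The self-contained sketch below shows the same getopt pattern with strtof and colons on all three eps switches; the struct name, flag letters and the assumption that '1' maps to eps1 are illustrative only.

// Sketch of the option-handling pattern, with hypothetical flag letters.
#include <cstdio>
#include <cstdlib>
#include <unistd.h>

struct Args {
  float eps1 = 1e-6f, eps2 = 1e-6f, eps3 = 1e-6f;
  bool post_layernorm_residual = true;
  bool is_ffn_fp16 = false;
};

int main(int argc, char* argv[]) {
  Args a;
  int opt;
  while ((opt = getopt(argc, argv, "1:2:3:P:x:")) != -1) {
    switch (opt) {
      case '1': a.eps1 = strtof(optarg, nullptr); break;  // strtof keeps fractional eps values
      case '2': a.eps2 = strtof(optarg, nullptr); break;
      case '3': a.eps3 = strtof(optarg, nullptr); break;
      case 'P': a.post_layernorm_residual = atoi(optarg) == 1; break;
      case 'x': a.is_ffn_fp16 = atoi(optarg) == 1; break;
      default: std::fprintf(stderr, "unknown option\n"); return 1;
    }
  }
  std::printf("eps1=%g eps2=%g eps3=%g post_ln=%d ffn_fp16=%d\n",
              a.eps1, a.eps2, a.eps3, a.post_layernorm_residual, a.is_ffn_fp16);
  return 0;
}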
+ bool c_type_fp32 = (opt_a.compute_type.compare("fp32") == 0); @@ -1545,75 +1876,64 @@ index 0000000..2b12bd5 + cublas_wrapper.setFP32GemmConfig(); + } + } -+ -+ if (opt_a->model_name != "transformer_encoder_layer") { -+ DecriptorTest desc; -+ Init(opt_a, desc, stream, &cublas_wrapper, &allocator); ++ if(opt_a->model_name == "transformer_decoder_layer" || opt_a->model_name == "transformer_decoder_layer_t5") { ++ DecriptorDecoderLayer desc; ++ InitD(opt_a, desc, stream, &cublas_wrapper, &cublas_handle, &allocator); + int res = ReadTensors(desc.input_tensors, std::string("input"), opt_a); + FT_CHECK(!res); + res = ReadTensors(desc.input_python_tensors, std::string("input"), opt_a); + FT_CHECK(!res); -+ + res = ReadTensors(desc.output_tensors, std::string("output"), opt_a, false); + FT_CHECK(!res); -+ + res = ReadTensors(desc.output_python_tensors, std::string("output"), opt_a); + FT_CHECK(!res); -+ + res = ReadTensors(desc.w_tensors, std::string("weight"), opt_a); + FT_CHECK(!res); -+ -+ std::cout << "inputs size not encoder: " << CalcTensorsSize(desc.input_tensors) << std::endl; -+ std::cout << "weights size not encoder: " << CalcTensorsSize(desc.w_tensors) << std::endl; -+ std::cout << "ouputs size not encoder: " << CalcTensorsSize(desc.output_tensors) << std::endl; -+ -+ AttentionWeight attn_weights; -+ InitWeight(opt_a, attn_weights, desc.w_tensors); -+ -+ // test for BE !! -+ desc.Attn->forward(&desc.output_tensors, &desc.input_tensors, &attn_weights); -+ ++ DecoderLayerWeight decoder_weights; ++ InitWeightDecoder(opt_a, decoder_weights, desc.w_tensors); ++ // // test for BE !! ++ desc.Decoder->forward(&desc.output_tensors, &desc.input_tensors, &decoder_weights); + + CompareOutput(desc.output_python_tensors, desc.output_tensors); -+ -+// #define DO_TIME -+// #ifdef DO_TIME -+// // warmup -+// for (int i = 0; i < 10; i++) { -+// desc.Attn->forward(&desc.output_tensors, &desc.input_tensors, &attn_weights); -+// } -+// // profile time -+// const int ite = 1000; -+// CudaTimer cuda_timer(stream); -+// cuda_timer.start(); ++#define DO_TIME ++#ifdef DO_TIME ++ // warmup ++ for (int i = 0; i < 10; i++) { ++ // desc.Decoder->forward(&desc.output_tensors, &desc.input_tensors, &decoder_weights); ++ } ++ // profile time ++ const int ite = 1000; ++ CudaTimer cuda_timer(stream); ++ cuda_timer.start(); + -+// for (int i = 0; i < ite; i++) { -+// for (int i = 0; i < desc.input_tensors.size(); i++) { -+// int size = desc.input_tensors[i].size(); -+// cudaH2Dcpy(const_cast(reinterpret_cast(desc.input_tensors[i].data)), -+// const_cast(reinterpret_cast(desc.input_python_tensors[i].data)), -+// size); -+// } ++ for (int i = 0; i < ite; i++) { ++ // for (int i = 0; i < desc.input_tensors.size(); i++) { ++ // int size = desc.input_tensors[i].size(); ++ // cudaH2Dcpy(const_cast(reinterpret_cast(desc.input_tensors[i].data)), ++ // const_cast(reinterpret_cast(desc.input_python_tensors[i].data)), ++ // size); ++ // } + -+// desc.Attn->forward(&desc.output_tensors, &desc.input_tensors, &attn_weights); -+// for (int i = 0; i < desc.output_tensors.size(); i++) { -+// int size = desc.output_tensors[i].size(); -+// cudaD2Hcpy(const_cast(reinterpret_cast(desc.output_python_tensors[i].data)), -+// const_cast(reinterpret_cast(desc.output_tensors[i].data)), -+// size); -+// } -+// } -+// float total_time = cuda_timer.stop(); -+// printf("batch_size %ld seq_len %ld layer %ld " -+// "AVG FT-CPP-time %.2f ms (%d iterations) " -+// "Total Time %.2f ms\n", -+// opt_a->batch_size, -+// opt_a->seq_len, -+// opt_a->num_layers, 
-+// total_time / ite, -+// ite, -+// total_time); -+// #endif ++ // desc.Decoder->forward(&desc.output_tensors, &desc.input_tensors, &decoder_weights); ++ // for (int i = 0; i < desc.output_tensors.size(); i++) { ++ // int size = desc.output_tensors[i].size(); ++ // cudaD2Hcpy(const_cast(reinterpret_cast(desc.output_python_tensors[i].data)), ++ // const_cast(reinterpret_cast(desc.output_tensors[i].data)), ++ // size); ++ // } ++ } ++ float total_time = cuda_timer.stop(); ++ ++ printf("batch_size %ld seq_len %ld layer %ld " ++ "AVG FT-CPP-time %.2f ms (%d iterations) " ++ "Total Time %.2f ms\n", ++ opt_a->batch_size, ++ opt_a->seq_len, ++ opt_a->num_layers, ++ total_time / ite, ++ ite, ++ total_time); ++#endif + +#ifdef SPARSITY_ENABLED + cusparseLtDestroy(&cusparselt_handle); @@ -1624,8 +1944,10 @@ index 0000000..2b12bd5 + FreeDesc(desc.input_tensors); + FreeDesc(desc.output_python_tensors); + FreeDesc(desc.w_tensors); ++ return 0; + } -+ else { ++ else if (opt_a->model_name == "transformer_encoder_layer"|| opt_a->model_name == "transformer_encoder_layer_t5") { ++ + DecriptorEncoderLayer desc; + InitE(opt_a, desc, stream, &cublas_wrapper, &cublas_handle, &allocator); + int res = ReadTensors(desc.input_tensors, std::string("input"), opt_a); @@ -1697,6 +2019,85 @@ index 0000000..2b12bd5 + FreeDesc(desc.output_python_tensors); + FreeDesc(desc.w_tensors); + } ++ else { ++ DecriptorTest desc; ++ Init(opt_a, desc, stream, &cublas_wrapper, &allocator); ++ int res = ReadTensors(desc.input_tensors, std::string("input"), opt_a); ++ FT_CHECK(!res); ++ res = ReadTensors(desc.input_python_tensors, std::string("input"), opt_a); ++ FT_CHECK(!res); ++ ++ res = ReadTensors(desc.output_tensors, std::string("output"), opt_a, false); ++ FT_CHECK(!res); ++ ++ res = ReadTensors(desc.output_python_tensors, std::string("output"), opt_a); ++ FT_CHECK(!res); ++ ++ res = ReadTensors(desc.w_tensors, std::string("weight"), opt_a); ++ FT_CHECK(!res); ++ ++ std::cout << "inputs size not encoder: " << CalcTensorsSize(desc.input_tensors) << std::endl; ++ std::cout << "weights size not encoder: " << CalcTensorsSize(desc.w_tensors) << std::endl; ++ std::cout << "ouputs size not encoder: " << CalcTensorsSize(desc.output_tensors) << std::endl; ++ ++ AttentionLayerWeight attn_weights; ++ InitWeight(opt_a, attn_weights, desc.w_tensors); ++ ++ // test for BE !! 
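// [editor's note] Illustrative sketch, not part of the patch: the DO_TIME block above follows the
// usual "warm up, then time N iterations" pattern through the repo's CudaTimer helper. The same
// measurement can be written with plain CUDA events; TimeLayerMs and RunLayer below are
// hypothetical stand-ins for desc.Decoder->forward(...).
#include <cuda_runtime.h>
#include <cstdio>
#include <functional>

inline float TimeLayerMs(cudaStream_t stream, const std::function<void()>& RunLayer,
                         int warmup = 10, int iters = 1000) {
  for (int i = 0; i < warmup; ++i) RunLayer();   // warm-up iterations, not measured
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start, stream);
  for (int i = 0; i < iters; ++i) RunLayer();    // measured region
  cudaEventRecord(stop, stream);
  cudaEventSynchronize(stop);
  float total_ms = 0.f;
  cudaEventElapsedTime(&total_ms, start, stop);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  printf("AVG time %.2f ms (%d iterations), total %.2f ms\n", total_ms / iters, iters, total_ms);
  return total_ms;
}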
++ desc.Attn->forward(&desc.output_tensors, &desc.input_tensors, &attn_weights); ++ ++ ++ CompareOutput(desc.output_python_tensors, desc.output_tensors); ++ ++// #define DO_TIME ++// #ifdef DO_TIME ++// // warmup ++// for (int i = 0; i < 10; i++) { ++// desc.Attn->forward(&desc.output_tensors, &desc.input_tensors, &attn_weights); ++// } ++// // profile time ++// const int ite = 1000; ++// CudaTimer cuda_timer(stream); ++// cuda_timer.start(); ++ ++// for (int i = 0; i < ite; i++) { ++// for (int i = 0; i < desc.input_tensors.size(); i++) { ++// int size = desc.input_tensors[i].size(); ++// cudaH2Dcpy(const_cast(reinterpret_cast(desc.input_tensors[i].data)), ++// const_cast(reinterpret_cast(desc.input_python_tensors[i].data)), ++// size); ++// } ++ ++// desc.Attn->forward(&desc.output_tensors, &desc.input_tensors, &attn_weights); ++// for (int i = 0; i < desc.output_tensors.size(); i++) { ++// int size = desc.output_tensors[i].size(); ++// cudaD2Hcpy(const_cast(reinterpret_cast(desc.output_python_tensors[i].data)), ++// const_cast(reinterpret_cast(desc.output_tensors[i].data)), ++// size); ++// } ++// } ++// float total_time = cuda_timer.stop(); ++// printf("batch_size %ld seq_len %ld layer %ld " ++// "AVG FT-CPP-time %.2f ms (%d iterations) " ++// "Total Time %.2f ms\n", ++// opt_a->batch_size, ++// opt_a->seq_len, ++// opt_a->num_layers, ++// total_time / ite, ++// ite, ++// total_time); ++// #endif ++ ++#ifdef SPARSITY_ENABLED ++ cusparseLtDestroy(&cusparselt_handle); ++#endif ++ delete cublas_algo_map; ++ delete cublas_wrapper_mutex; ++ FreeDesc(desc.output_tensors); ++ FreeDesc(desc.input_tensors); ++ FreeDesc(desc.output_python_tensors); ++ FreeDesc(desc.w_tensors); ++ } + return 0; +} diff --git a/examples/pytorch/swin/Swin-Transformer-Quantization/SwinTransformer b/examples/pytorch/swin/Swin-Transformer-Quantization/SwinTransformer @@ -1806,10 +2207,10 @@ index 7ff8e0f..e1be64c 100644 template void invokeAddBias(float* out, const float* bias, const int m, const int n, cudaStream_t stream); diff --git a/src/fastertransformer/kernels/add_residual_kernels.cu b/src/fastertransformer/kernels/add_residual_kernels.cu -index 4cd9f0f..1bf2be3 100644 +index 4cd9f0f..42c9216 100644 --- a/src/fastertransformer/kernels/add_residual_kernels.cu +++ b/src/fastertransformer/kernels/add_residual_kernels.cu -@@ -29,6 +29,18 @@ __global__ void addBiasResidual(T* output, const T* input, const T* bias, const +@@ -29,6 +29,30 @@ __global__ void addBiasResidual(T* output, const T* input, const T* bias, const } } @@ -1824,11 +2225,23 @@ index 4cd9f0f..1bf2be3 100644 + (S)((T)output[blockIdx.x * n + col_index] + (T)input[blockIdx.x * n + col_index] + bias_val); + } +} ++ ++template ++__global__ void addBiasResidualSameTypeCast(U* output, const U* input, T* out, const T* bias, const int m, const int n) ++{ ++ S *out_cast = (S*)out; ++ const int col_index = blockIdx.y * blockDim.x + threadIdx.x; ++ if (col_index < n) { ++ T bias_val = (bias == nullptr) ? 
(T)(0.0f) : bias[col_index]; ++ out_cast[blockIdx.x * n + col_index] = ++ (S)((T)output[blockIdx.x * n + col_index] + (T)input[blockIdx.x * n + col_index] + bias_val); ++ } ++} + template void invokeAddBiasResidual(T* output, const T* input, const T* bias, const int m, const int n, cudaStream_t stream) { -@@ -38,6 +50,20 @@ void invokeAddBiasResidual(T* output, const T* input, const T* bias, const int m +@@ -38,6 +62,31 @@ void invokeAddBiasResidual(T* output, const T* input, const T* bias, const int m addBiasResidual<<>>(output, input, bias, m, n); } @@ -1841,15 +2254,26 @@ index 4cd9f0f..1bf2be3 100644 + addBiasResidualCast<<>>(output, input, out, bias, m, n); +} + ++template ++void invokeAddBiasResidualSameTypeCast(U* output, const U* input, T* out, const T* bias, const int m, const int n, cudaStream_t stream) ++{ ++ int blocks_per_row = ceil(float(n) / 1024); ++ dim3 grid(m, blocks_per_row); ++ dim3 block(min(n, 1024)); ++ addBiasResidualSameTypeCast<<>>(output, input, out, bias, m, n); ++} ++ +template void invokeAddBiasResidualCast(half* output, const float* input, float* out, const float* bias, const int m, const int n, cudaStream_t stream); +template void invokeAddBiasResidualCast(float* output, const float* input, float* out, const float* bias, const int m, const int n, cudaStream_t stream); +template void invokeAddBiasResidualCast(float* output, const float* input, float* out, const float* bias, const int m, const int n, cudaStream_t stream); +template void invokeAddBiasResidualCast(half* output, const float* input, float* out, const float* bias, const int m, const int n, cudaStream_t stream); ++ ++template void invokeAddBiasResidualSameTypeCast(half* output, const half* input, float* out, const float* bias, const int m, const int n, cudaStream_t stream); + template __global__ void addBiasAttentionFfnResidual(T* block_output, const T* ffn_output, -@@ -88,11 +114,9 @@ void invokeAddBiasAttentionFfnResidual(T* block_output, +@@ -88,11 +137,9 @@ void invokeAddBiasAttentionFfnResidual(T* block_output, } } @@ -1864,7 +2288,7 @@ index 4cd9f0f..1bf2be3 100644 #ifdef ENABLE_BF16 template void invokeAddBiasResidual(__nv_bfloat16* output, diff --git a/src/fastertransformer/kernels/add_residual_kernels.h b/src/fastertransformer/kernels/add_residual_kernels.h -index edd8179..7ab8eb4 100644 +index edd8179..afa5a77 100644 --- a/src/fastertransformer/kernels/add_residual_kernels.h +++ b/src/fastertransformer/kernels/add_residual_kernels.h @@ -27,6 +27,9 @@ namespace fastertransformer { @@ -1877,12 +2301,15 @@ index edd8179..7ab8eb4 100644 template void invokeT5AddResidual(T* output, const T* input, const int m, const int n, cudaStream_t stream); -@@ -65,4 +68,8 @@ void invokeAddBiasResidualCol32(T* output, +@@ -65,4 +68,11 @@ void invokeAddBiasResidualCol32(T* output, const float* input1_amax_ptr, const int scale_is_vector = 0); +template +void invokeAddBiasResidualCast(U* output, const T* input, T* out, const T* bias, const int m, const int n, cudaStream_t stream); ++ ++template ++void invokeAddBiasResidualSameTypeCast(U* output, const U* input, T* out, const T* bias, const int m, const int n, cudaStream_t stream); + } // namespace fastertransformer + @@ -5489,14 +5916,14 @@ index be8b178..e9b4310 100644 + } // namespace fastertransformer diff --git a/src/fastertransformer/layers/CMakeLists.txt b/src/fastertransformer/layers/CMakeLists.txt -index cbaf4fa..00a46d4 100644 +index cbaf4fa..49779bf 100644 --- a/src/fastertransformer/layers/CMakeLists.txt +++ 
b/src/fastertransformer/layers/CMakeLists.txt @@ -14,6 +14,7 @@ cmake_minimum_required(VERSION 3.8) -+add_subdirectory(encoder_layers) ++add_subdirectory(ms_layers) add_subdirectory(attention_layers) add_subdirectory(attention_layers_int8) add_subdirectory(xlnet_attention_layers) @@ -5551,7 +5978,7 @@ index b21e3a7..746cb71 100644 cublasMMWrapper* cublas_wrapper, IAllocator* allocator, diff --git a/src/fastertransformer/layers/attention_layers/CMakeLists.txt b/src/fastertransformer/layers/attention_layers/CMakeLists.txt -index 9cef315..f9c9cde 100644 +index 9cef315..7170af4 100644 --- a/src/fastertransformer/layers/attention_layers/CMakeLists.txt +++ b/src/fastertransformer/layers/attention_layers/CMakeLists.txt @@ -42,8 +42,8 @@ target_link_libraries(DecoderSelfAttentionLayer PUBLIC -lcublas -lcudart cublasM @@ -5560,7 +5987,7 @@ index 9cef315..f9c9cde 100644 set_property(TARGET GptContextAttentionLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) -target_link_libraries(GptContextAttentionLayer PUBLIC -lcublas -lcudart cublasMMWrapper memory_utils unfused_attention_kernels) - -+target_link_libraries(GptContextAttentionLayer PUBLIC -lcublas -lcudart cublasMMWrapper memory_utils unfused_attention_kernels activation_kernels EncoderLayer) ++target_link_libraries(GptContextAttentionLayer PUBLIC -lcublas -lcudart cublasMMWrapper memory_utils unfused_attention_kernels activation_kernels) +if(EXAMPLES) add_library(TensorParallelDecoderSelfAttentionLayer STATIC TensorParallelDecoderSelfAttentionLayer.cc) set_property(TARGET TensorParallelDecoderSelfAttentionLayer PROPERTY POSITION_INDEPENDENT_CODE ON) @@ -5576,7 +6003,7 @@ index 9cef315..f9c9cde 100644 diff --git a/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc old mode 100644 new mode 100755 -index bada640..3dca224 +index bada640..2415ac2 --- a/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc @@ -16,10 +16,39 @@ @@ -5645,159 +6072,157 @@ index bada640..3dca224 sync_check_cuda_error(); T scalar = 1 / sqrtf(size_per_head_ * 1.0f); invokeMaskedSoftMax(qk_buf_, -@@ -428,4 +456,148 @@ template class GptContextAttentionLayer; +@@ -428,4 +456,146 @@ template class GptContextAttentionLayer; template class GptContextAttentionLayer<__nv_bfloat16>; #endif +// HAIM Playground MS-MHA + -+template -+MSMHALayer::MSMHALayer(size_t max_batch_size, -+ size_t max_src_seq_len, -+ size_t max_tgt_seq_len, -+ size_t head_num, -+ size_t size_per_head, -+ cudaStream_t stream, -+ cublasMMWrapper* cublas_wrapper, -+ IAllocator* allocator, -+ bool is_free_buffer_after_forward, -+ bool is_qk_buf_float, -+ bool is_cross, -+ bool sparse, -+ bool is_position_bias): -+ BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse) -+{ -+cublasHandle_t cublas_handle; -+ cublasCreate(&cublas_handle); -+ cublasSetStream(cublas_handle, stream); -+ -+ params_.batch_size = max_batch_size; -+ params_.src_seq_len = max_src_seq_len; -+ params_.tgt_seq_len = max_tgt_seq_len; -+ params_.head_num = head_num; -+ params_.head_size = size_per_head; -+ params_.hidden_size = head_num * size_per_head; -+ params_.cublas_handle = cublas_handle; -+ params_.stream = stream; -+ // ctrls -+ params_.in_idx = 0; -+ params_.qkv_bias = !is_position_bias; -+ params_.projection_bias = !is_position_bias; -+ params_.is_cross = is_cross; -+ params_.position_bias = is_position_bias; 
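// [editor's note] Illustrative sketch, not part of the patch: the MSMHALayer constructor touched by
// this hunk mostly copies its arguments into a parameter struct; the only derived quantities are
// hidden_size = head_num * head_size and the 1/sqrt(head_size) softmax scale used by the masked
// softmax above. The struct and helper below are hypothetical stand-ins for the repo's types.
#include <cmath>
#include <cstddef>

struct AttnShapeParam {
  size_t batch_size, src_seq_len, tgt_seq_len;
  size_t head_num, head_size, hidden_size;
  float softmax_scale;
  bool qkv_bias, projection_bias, is_cross, position_bias;
};

inline AttnShapeParam MakeAttnShapeParam(size_t batch, size_t src_len, size_t tgt_len,
                                         size_t head_num, size_t head_size,
                                         bool is_cross, bool is_position_bias) {
  AttnShapeParam p{};
  p.batch_size = batch;
  p.src_seq_len = src_len;
  p.tgt_seq_len = tgt_len;
  p.head_num = head_num;
  p.head_size = head_size;
  p.hidden_size = head_num * head_size;  // model width
  p.softmax_scale = 1.0f / std::sqrt(static_cast<float>(head_size));
  p.qkv_bias = !is_position_bias;        // T5-style relative position bias drops the linear biases
  p.projection_bias = !is_position_bias;
  p.is_cross = is_cross;
  p.position_bias = is_position_bias;
  return p;
}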
-+ params_.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; -+} -+ -+template -+void MSMHALayer::allocateBuffer() -+{ -+ if (buf_ == nullptr) { -+ size_t buff_size = GetAttnWorkspaceSize(¶ms_); -+ buf_ = reinterpret_cast(allocator_->reMalloc(buf_, buff_size, true)); -+ } -+} -+ -+template -+void MSMHALayer::forward(std::vector* output_tensors, -+ const std::vector* input_tensors, -+ const AttentionWeight* attention_weights) -+{ -+ // input_tensors: use 1 gemm -- multi head attention -+ // input_query [batch_size * seq_len, hidden_dimension] -+ // attention_mask [batch_size, 1, seq_len, seq_len] -+ -+ // input_tensors: use 2 gemm -- cross attention -+ // input_query [batch_size * seq_len, hidden_dimension] -+ // enc_output [batch_size * tgt_len, hidden_dimension] -+ // attention_mask [batch_size, 1, seq_len, seq_len] ++// template ++// MSMHALayer::MSMHALayer(size_t max_batch_size, ++// size_t max_src_seq_len, ++// size_t max_tgt_seq_len, ++// size_t head_num, ++// size_t size_per_head, ++// cudaStream_t stream, ++// cublasMMWrapper* cublas_wrapper, ++// IAllocator* allocator, ++// bool is_free_buffer_after_forward, ++// bool is_qk_buf_float, ++// bool is_cross, ++// bool sparse, ++// bool is_position_bias): ++// BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse) ++// { ++// cublasHandle_t cublas_handle; ++// cublasCreate(&cublas_handle); ++// cublasSetStream(cublas_handle, stream); ++ ++// // params_.batch_size = max_batch_size; ++// // params_.src_seq_len = max_src_seq_len; ++// // params_.tgt_seq_len = max_tgt_seq_len; ++// // params_.head_num = head_num; ++// // params_.head_size = size_per_head; ++// // params_.hidden_size = head_num * size_per_head; ++// // params_.cublas_handle = cublas_handle; ++// // params_.stream = stream; ++// // // ctrls ++// // params_.in_idx = 0; ++// // params_.qkv_bias = !is_position_bias; ++// // params_.projection_bias = !is_position_bias; ++// // params_.is_cross = is_cross; ++// // params_.position_bias = is_position_bias; ++// // params_.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; ++// } ++// template ++// void MSMHALayer::allocateBuffer() ++// { ++// if (buf_ == nullptr) { ++// // size_t buff_size = GetAttnWorkspaceSize(¶ms_); ++// // buf_ = reinterpret_cast(allocator_->reMalloc(buf_, buff_size, true)); ++// } ++// } ++// template ++// void MSMHALayer::forward(std::vector* output_tensors, ++// const std::vector* input_tensors, ++// const AttentionWeight* attention_weights) ++// { ++// // input_tensors: use 1 gemm -- multi head attention ++// // input_query [batch_size * seq_len, hidden_dimension] ++// // attention_mask [batch_size, 1, seq_len, seq_len] ++ ++// // input_tensors: use 2 gemm -- cross attention ++// // input_query [batch_size * seq_len, hidden_dimension] ++// // enc_output [batch_size * tgt_len, hidden_dimension] ++// // attention_mask [batch_size, 1, seq_len, seq_len] ++ ++// // output_tensors: ++// // attention_out [batch_size * seq_len, hidden_dimension] ++// // key_cache [batch, local_head_num, size_per_head // x, max_seq_len, x] ++// // value_cache [batch, local_head_num, max_seq_len, size_per_head] ++ ++// int in_tensor_number = input_tensors->size(); ++// allocateBuffer(); // only once ++// // if (params_.position_bias) ++// // if (params_.is_cross) { ++// // void* outputs[] = {(void*)output_tensors->at(0).data}; ++// // void* inputs[] = {(void*)input_tensors->at(0).data, ++// // (void*)input_tensors->at(1).data, ++// // (void*)attention_weights->query_weight.kernel, ++// // 
(void*)attention_weights->key_weight.kernel, ++// // (void*)input_tensors->at(2).data, ++// // (void*)input_tensors->at(3).data, ++// // (void*)attention_weights->attention_output_weight.kernel}; ++// // forward_attn((T**)inputs, 7, (T**)outputs, 1, ¶ms_, (void*)buf_); ++// // } ++// // else { ++// // void* outputs[] = {(void*)output_tensors->at(0).data}; ++// // void* inputs[] = { ++// // (void*)input_tensors->at(0).data, ++// // (void*)attention_weights->query_weight.kernel, ++// // (void*)input_tensors->at(1).data, ++// // (void*)input_tensors->at(2).data, ++// // (void*)attention_weights->attention_output_weight.kernel ++// // }; ++// // forward_attn((T**)inputs, 5, (T**)outputs, 1, ¶ms_, (void*)buf_); ++// // } ++// // else { ++// // if (params_.is_cross) { ++// // void* outputs[] = {(void*)output_tensors->at(0).data}; ++// // void* inputs[] = {(void*)input_tensors->at(0).data, ++// // (void*)input_tensors->at(1).data, ++// // (void*)attention_weights->query_weight.kernel, ++// // (void*)attention_weights->key_weight.kernel, ++// // (void*)attention_weights->query_weight.bias, ++// // (void*)input_tensors->at(2).data, ++// // (void*)attention_weights->attention_output_weight.kernel, ++// // (void*)attention_weights->attention_output_weight.bias ++// // }; ++// // forward_attn((T**)inputs, 8, (T**)outputs, 1, ¶ms_, (void*)buf_); ++// // } ++// // else { ++// // void* outputs[] = {(void*)output_tensors->at(0).data}; ++// // void* inputs[] = {(void*)input_tensors->at(0).data, ++// // (void*)attention_weights->query_weight.kernel, ++// // (void*)attention_weights->query_weight.bias, ++// // (void*)input_tensors->at(1).data, ++// // (void*)attention_weights->attention_output_weight.kernel, ++// // (void*)attention_weights->attention_output_weight.bias}; ++// // forward_attn((T**)inputs, 6, (T**)outputs, 1, ¶ms_, (void*)buf_); ++// // } ++// } ++ ++ ++// // template ++// // MSMHALayer::~MSMHALayer() ++// // { ++// // // cublas_wrapper_ = nullptr; ++// // freeBuffer(); ++// // } ++ ++// template ++// void MSMHALayer::freeBuffer() ++// { ++// if (buf_ != nullptr) { ++// allocator_->free(buf_); ++// buf_ = nullptr; ++// } ++// } + -+ // output_tensors: -+ // attention_out [batch_size * seq_len, hidden_dimension] -+ // key_cache [batch, local_head_num, size_per_head // x, max_seq_len, x] -+ // value_cache [batch, local_head_num, max_seq_len, size_per_head] -+ -+ int in_tensor_number = input_tensors->size(); -+ allocateBuffer(); // only once -+ if (params_.position_bias) -+ if (params_.is_cross) { -+ void* outputs[] = {(void*)output_tensors->at(0).data}; -+ void* inputs[] = {(void*)input_tensors->at(0).data, -+ (void*)input_tensors->at(1).data, -+ (void*)attention_weights->query_weight.kernel, -+ (void*)attention_weights->key_weight.kernel, -+ (void*)input_tensors->at(2).data, -+ (void*)input_tensors->at(3).data, -+ (void*)attention_weights->attention_output_weight.kernel}; -+ -+ forward_attn((T**)inputs, 7, (T**)outputs, 1, ¶ms_, (void*)buf_); -+ } -+ else { -+ void* outputs[] = {(void*)output_tensors->at(0).data}; -+ void* inputs[] = { -+ (void*)input_tensors->at(0).data, -+ (void*)attention_weights->query_weight.kernel, -+ (void*)input_tensors->at(1).data, -+ (void*)input_tensors->at(2).data, -+ (void*)attention_weights->attention_output_weight.kernel -+ }; -+ forward_attn((T**)inputs, 5, (T**)outputs, 1, ¶ms_, (void*)buf_); -+ } -+ else { -+ if (params_.is_cross) { -+ void* outputs[] = {(void*)output_tensors->at(0).data}; -+ void* inputs[] = {(void*)input_tensors->at(0).data, -+ 
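// [editor's note] Illustrative sketch, not part of the patch: the forward() body kept here in
// commented-out form flattens activations and weights into a void* array whose length and order
// depend on is_cross / position_bias, and the fused path then consumes the pointers positionally
// via params->in_idx. A hypothetical helper that makes the 6-pointer self-attention-with-bias
// ordering explicit:
#include <vector>

// Order must match what the fused attention entry point reads through in_idx.
inline std::vector<void*> PackSelfAttnInputs(void* hidden_states, void* qkv_weight, void* qkv_bias,
                                             void* attention_mask, void* out_proj_weight,
                                             void* out_proj_bias) {
  return {hidden_states, qkv_weight, qkv_bias, attention_mask, out_proj_weight, out_proj_bias};
}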
(void*)input_tensors->at(1).data, -+ (void*)attention_weights->query_weight.kernel, -+ (void*)attention_weights->key_weight.kernel, -+ (void*)attention_weights->query_weight.bias, -+ (void*)input_tensors->at(2).data, -+ (void*)attention_weights->attention_output_weight.kernel, -+ (void*)attention_weights->attention_output_weight.bias -+ }; -+ forward_attn((T**)inputs, 8, (T**)outputs, 1, ¶ms_, (void*)buf_); -+ } else { -+ void* outputs[] = {(void*)output_tensors->at(0).data}; -+ void* inputs[] = {(void*)input_tensors->at(0).data, -+ (void*)attention_weights->query_weight.kernel, -+ (void*)attention_weights->query_weight.bias, -+ (void*)input_tensors->at(1).data, -+ (void*)attention_weights->attention_output_weight.kernel, -+ (void*)attention_weights->attention_output_weight.bias}; -+ forward_attn((T**)inputs, 6, (T**)outputs, 1, ¶ms_, (void*)buf_); -+ } -+ } -+} -+ -+template -+MSMHALayer::~MSMHALayer() -+{ -+ cublas_wrapper_ = nullptr; -+ freeBuffer(); -+} -+ -+template -+void MSMHALayer::freeBuffer() -+{ -+ if (buf_ != nullptr) { -+ allocator_->free(buf_); -+ buf_ = nullptr; -+ } -+} -+ -+template class MSMHALayer; -+template class MSMHALayer; -+template class MSMHALayer; -+template class MSMHALayer; -+template class MSMHALayer; -+template class MSMHALayer; -+template class MSMHALayer; -+template class MSMHALayer; ++ // template class MSMHALayer; ++ // template class MSMHALayer; ++ // template class MSMHALayer; ++ // template class MSMHALayer; ++ // template class MSMHALayer; ++ // template class MSMHALayer; ++ // template class MSMHALayer; ++ // template class MSMHALayer; + } // namespace fastertransformer diff --git a/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h b/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h old mode 100644 new mode 100755 -index 92e2175..f7fa5ca +index 92e2175..39c49c0 --- a/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h +++ b/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h @@ -18,7 +18,7 @@ @@ -5805,7 +6230,7 @@ index 92e2175..f7fa5ca #include "src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h" - -+#include "src/fastertransformer/layers/encoder_layers/encoder.h" ++// #include "src/fastertransformer/layers/encoder_layers/encoder.h" namespace fastertransformer { template @@ -5815,132 +6240,132 @@ index 92e2175..f7fa5ca + +// TODO(haim): Add template according to "mix" compute type (fp32, fp16) -+template -+class MSMHALayer: public BaseAttentionLayer { -+private: -+ void allocateBuffer() override; -+ void freeBuffer() override; -+ -+ using BaseAttentionLayer::is_free_buffer_after_forward_; -+ using BaseAttentionLayer::is_allocate_buffer_; -+ using BaseAttentionLayer::cublas_wrapper_; -+ using BaseAttentionLayer::allocator_; -+ -+protected: -+ using BaseAttentionLayer::stream_; -+ using BaseAttentionLayer::sparse_; -+ T* buf_ = nullptr; -+ encoderParamT params_; -+ -+public: -+ MSMHALayer(size_t batch_size, -+ size_t src_seq_len, -+ size_t tgt_seq_len, -+ size_t head_num, -+ size_t size_per_head, -+ cudaStream_t stream, -+ cublasMMWrapper* cublas_wrapper, -+ IAllocator* allocator, -+ bool is_free_buffer_after_forward, -+ bool is_qk_buf_float, -+ bool is_cross, -+ bool sparse = false, -+ bool is_position_bias=false); -+ MSMHALayer(MSMHALayer const& attention_layer); -+ virtual ~MSMHALayer(); -+ void forward(std::vector* output_tensors, -+ const std::vector* input_tensors, -+ const AttentionWeight* attention_weights) override; -+}; ++// template ++// 
class MSMHALayer: public BaseAttentionLayer { ++// private: ++// void allocateBuffer() override; ++// void freeBuffer() override; ++ ++// using BaseAttentionLayer::is_free_buffer_after_forward_; ++// using BaseAttentionLayer::is_allocate_buffer_; ++// using BaseAttentionLayer::cublas_wrapper_; ++// using BaseAttentionLayer::allocator_; ++ ++// protected: ++// using BaseAttentionLayer::stream_; ++// using BaseAttentionLayer::sparse_; ++// T* buf_ = nullptr; ++// // encoderParamT params_; ++ ++// public: ++// MSMHALayer(size_t batch_size, ++// size_t src_seq_len, ++// size_t tgt_seq_len, ++// size_t head_num, ++// size_t size_per_head, ++// cudaStream_t stream, ++// cublasMMWrapper* cublas_wrapper, ++// IAllocator* allocator, ++// bool is_free_buffer_after_forward, ++// bool is_qk_buf_float, ++// bool is_cross, ++// bool sparse = false, ++// bool is_position_bias=false); ++// MSMHALayer(MSMHALayer const& attention_layer); ++// virtual ~MSMHALayer(); ++// void forward(std::vector* output_tensors, ++// const std::vector* input_tensors, ++// const AttentionWeight* attention_weights) override; ++// }; + } // namespace fastertransformer -diff --git a/src/fastertransformer/layers/encoder_layers/BaseEncoderLayer.h b/src/fastertransformer/layers/encoder_layers/BaseEncoderLayer.h +diff --git a/src/fastertransformer/layers/decoder_layers/BaseDecoderLayer.h b/src/fastertransformer/layers/decoder_layers/BaseDecoderLayer.h new file mode 100644 -index 0000000..3b43391 +index 0000000..0a60835 --- /dev/null -+++ b/src/fastertransformer/layers/encoder_layers/BaseEncoderLayer.h ++++ b/src/fastertransformer/layers/decoder_layers/BaseDecoderLayer.h @@ -0,0 +1,76 @@ -+/* -+ * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. -+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. -+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+ * See the License for the specific language governing permissions and -+ * limitations under the License. -+ */ -+ -+#pragma once -+ -+#include -+#include -+ -+#include "3rdparty/trt_fused_multihead_attention/fused_multihead_attention_common.h" -+#include "src/fastertransformer/layers/BaseLayer.h" -+#include "src/fastertransformer/layers/encoder_layers/EncoderLayerWeight.h" -+#include "src/fastertransformer/utils/Tensor.h" -+#include "src/fastertransformer/utils/allocator.h" -+#include "src/fastertransformer/utils/cublasMMWrapper.h" -+#include "src/fastertransformer/utils/memory_utils.h" -+ -+namespace fastertransformer { -+ -+enum class EncoderLayerType { -+ UNFUSED_ENCODER_LAYER, -+ FUSED_ENCODER_LAYER -+}; -+ -+template -+EncoderLayerType getEncoderLayerType(size_t size_per_head, const int sm, const bool remove_padding, -+ const int max_seq_len, const bool is_fuse = true) { -+ if (std::is_same::value && (sm == kSM_70 || sm == kSM_86 || sm == kSM_80 || sm == kSM_75 || sm == kSM_72) -+ && size_per_head == 64 && max_seq_len <= 384 && is_fuse == true) { -+ return remove_padding ? EncoderLayerType::FUSED_ENCODER_LAYER : EncoderLayerType::FUSED_ENCODER_LAYER; -+ } else { -+ return remove_padding ? 
EncoderLayerType::FUSED_ENCODER_LAYER : EncoderLayerType::FUSED_ENCODER_LAYER; -+ } -+} -+ -+template -+EncoderLayerType getEncoderLayerTypeINT8(size_t size_per_head, const int sm, const bool remove_padding, -+ const int max_seq_len, const int int8_mode) { -+ if ((int8_mode == 1 || int8_mode == 2) && (sm == kSM_86 || sm == kSM_80 || sm == kSM_75) && size_per_head == 64 -+ && max_seq_len <= 384) { -+ return remove_padding ? EncoderLayerType::FUSED_ENCODER_LAYER : EncoderLayerType::FUSED_ENCODER_LAYER; -+ } else { -+ return remove_padding ? EncoderLayerType::FUSED_ENCODER_LAYER : EncoderLayerType::FUSED_ENCODER_LAYER; -+ } -+} ++// /* ++// * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. ++// * ++// * Licensed under the Apache License, Version 2.0 (the "License"); ++// * you may not use this file except in compliance with the License. ++// * You may obtain a copy of the License at ++// * ++// * http://www.apache.org/licenses/LICENSE-2.0 ++// * ++// * Unless required by applicable law or agreed to in writing, software ++// * distributed under the License is distributed on an "AS IS" BASIS, ++// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++// * See the License for the specific language governing permissions and ++// * limitations under the License. ++// */ ++ ++// #pragma once ++ ++// #include ++// #include ++ ++// #include "3rdparty/trt_fused_multihead_attention/fused_multihead_attention_common.h" ++// #include "src/fastertransformer/layers/BaseLayer.h" ++// #include "src/fastertransformer/layers/decoder_layers/DecoderLayerWeight.h" ++// #include "src/fastertransformer/utils/Tensor.h" ++// #include "src/fastertransformer/utils/allocator.h" ++// #include "src/fastertransformer/utils/cublasMMWrapper.h" ++// #include "src/fastertransformer/utils/memory_utils.h" ++ ++// namespace fastertransformer { ++ ++// enum class DecoderLayerType { ++// UNFUSED_DECODER_LAYER, ++// FUSED_DECODER_LAYER ++// }; ++ ++// template ++// DecoderLayerType getDecoderLayerType(size_t size_per_head, const int sm, const bool remove_padding, ++// const int max_seq_len, const bool is_fuse = true) { ++// if (std::is_same::value && (sm == kSM_70 || sm == kSM_86 || sm == kSM_80 || sm == kSM_75 || sm == kSM_72) ++// && size_per_head == 64 && max_seq_len <= 384 && is_fuse == true) { ++// return remove_padding ? DecoderLayerType::FUSED_DECODER_LAYER : DecoderLayerType::FUSED_DECODER_LAYER; ++// } else { ++// return remove_padding ? DecoderLayerType::FUSED_DECODER_LAYER : DecoderLayerType::FUSED_DECODER_LAYER; ++// } ++// } + -+template -+class BaseEncoderLayer: public BaseLayer { ++// template ++// DecoderLayerType getDecoderLayerTypeINT8(size_t size_per_head, const int sm, const bool remove_padding, ++// const int max_seq_len, const int int8_mode) { ++// if ((int8_mode == 1 || int8_mode == 2) && (sm == kSM_86 || sm == kSM_80 || sm == kSM_75) && size_per_head == 64 ++// && max_seq_len <= 384) { ++// return remove_padding ? DecoderLayerType::FUSED_DECODER_LAYER : DecoderLayerType::FUSED_DECODER_LAYER; ++// } else { ++// return remove_padding ? 
DecoderLayerType::FUSED_DECODER_LAYER : DecoderLayerType::FUSED_DECODER_LAYER; ++// } ++// } + -+public: -+ virtual void forward(std::vector* output_tensors, -+ const std::vector* input_tensors, -+ const EncoderLayerWeight* encoder_layer_weights) = 0; -+ BaseEncoderLayer(cudaStream_t stream, -+ cublasMMWrapper* cublas_wrapper, -+ IAllocator* allocator, -+ bool is_free_buffer_after_forward, -+ bool sparse = false): -+ BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, nullptr, sparse) -+ { -+ } -+ virtual ~BaseEncoderLayer() = default; -+}; -+} // namespace fastertransformer -diff --git a/src/fastertransformer/layers/encoder_layers/CMakeLists.txt b/src/fastertransformer/layers/encoder_layers/CMakeLists.txt ++// template ++// class BaseDecoderLayer: public BaseLayer { ++ ++// public: ++// virtual void forward(std::vector* output_tensors, ++// const std::vector* input_tensors, ++// const DecoderLayerWeight* decoder_layer_weights) = 0; ++// BaseDecoderLayer(cudaStream_t stream, ++// cublasMMWrapper* cublas_wrapper, ++// IAllocator* allocator, ++// bool is_free_buffer_after_forward, ++// bool sparse = false): ++// BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, nullptr, sparse) ++// { ++// } ++// virtual ~BaseDecoderLayer() = default; ++// }; ++// } // namespace fastertransformer +diff --git a/src/fastertransformer/layers/decoder_layers/CMakeLists.txt b/src/fastertransformer/layers/decoder_layers/CMakeLists.txt new file mode 100644 -index 0000000..1a3af85 +index 0000000..e343db9 --- /dev/null -+++ b/src/fastertransformer/layers/encoder_layers/CMakeLists.txt ++++ b/src/fastertransformer/layers/decoder_layers/CMakeLists.txt @@ -0,0 +1,21 @@ +# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. +# @@ -5958,17 +6383,17 @@ index 0000000..1a3af85 + +cmake_minimum_required(VERSION 3.8) + -+add_library(EncoderLayer STATIC encoder.cc MSEncoderLayer.cc) -+set_property(TARGET EncoderLayer PROPERTY POSITION_INDEPENDENT_CODE ON) -+set_property(TARGET EncoderLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) -+target_link_libraries(EncoderLayer PUBLIC -lcublas -lcudart unfused_attention_kernels activation_kernels ++add_library(DecoderLayer STATIC decoder.cc MSDecoderLayer.cc) ++set_property(TARGET DecoderLayer PROPERTY POSITION_INDEPENDENT_CODE ON) ++set_property(TARGET DecoderLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) ++target_link_libraries(DecoderLayer PUBLIC -lcublas -lcudart unfused_attention_kernels activation_kernels + layernorm_kernels add_residual_kernels bert_preprocess_kernels) -diff --git a/src/fastertransformer/layers/encoder_layers/EncoderLayerWeight.h b/src/fastertransformer/layers/encoder_layers/EncoderLayerWeight.h +diff --git a/src/fastertransformer/layers/decoder_layers/DecoderLayerWeight.h b/src/fastertransformer/layers/decoder_layers/DecoderLayerWeight.h new file mode 100644 -index 0000000..c441b23 +index 0000000..bd31438 --- /dev/null -+++ b/src/fastertransformer/layers/encoder_layers/EncoderLayerWeight.h -@@ -0,0 +1,33 @@ ++++ b/src/fastertransformer/layers/decoder_layers/DecoderLayerWeight.h +@@ -0,0 +1,37 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 
+ * @@ -5991,23 +6416,27 @@ index 0000000..c441b23 +#include "src/fastertransformer/kernels/layernorm_kernels.h" +namespace fastertransformer { + -+template -+struct EncoderLayerWeight { -+ DenseWeight qkv_weight; -+ DenseWeight attention_layer_output_weight; -+ DenseWeight encoder_output_mapping; -+ DenseWeight encoder_output_projection; -+ LayerNormWeight layernorm1; -+ LayerNormWeight layernorm2; -+}; ++// template ++// struct DecoderLayerWeight { ++// DenseWeight attention_qkv_weight; ++// DenseWeight attention_layer_output_weight; ++// DenseWeight attention_cross_q_weight; ++// DenseWeight attention_cross_kv_weight; ++// DenseWeight attention_cross_layer_output_weight; ++// DenseWeight decoder_output_mapping; ++// DenseWeight decoder_output_projection; ++// LayerNormWeight layernorm1; ++// LayerNormWeight layernorm2; ++// LayerNormWeight layernorm3; ++// }; + +} // namespace fastertransformer -diff --git a/src/fastertransformer/layers/encoder_layers/MSEncoderLayer.cc b/src/fastertransformer/layers/encoder_layers/MSEncoderLayer.cc +diff --git a/src/fastertransformer/layers/decoder_layers/MSDecoderLayer.cc b/src/fastertransformer/layers/decoder_layers/MSDecoderLayer.cc new file mode 100644 -index 0000000..a3442da +index 0000000..ae8875d --- /dev/null -+++ b/src/fastertransformer/layers/encoder_layers/MSEncoderLayer.cc -@@ -0,0 +1,164 @@ ++++ b/src/fastertransformer/layers/decoder_layers/MSDecoderLayer.cc +@@ -0,0 +1,208 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. @@ -6025,7 +6454,7 @@ index 0000000..a3442da + * limitations under the License. + */ + -+#include "src/fastertransformer/layers/encoder_layers/MSEncoderLayer.h" ++#include "src/fastertransformer/layers/decoder_layers/MSDecoderLayer.h" +#include "src/fastertransformer/kernels/activation_kernels.h" + +namespace fastertransformer { @@ -6050,7 +6479,7 @@ index 0000000..a3442da + free(input_host); +} +template -+MSELayer::MSELayer(size_t max_batch_size, ++MSDLayer::MSDLayer(size_t max_batch_size, + size_t max_src_seq_len, + size_t max_tgt_seq_len, + size_t head_num, @@ -6058,7 +6487,11 @@ index 0000000..a3442da + size_t ffn_hidden_size, + float eps1, + float eps2, ++ float eps3, + bool post_layernorm, ++ bool position_bias1, ++ bool position_bias2, ++ bool is_ffn_fp16, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + cublasHandle_t* cublas_handle, @@ -6067,7 +6500,7 @@ index 0000000..a3442da + bool is_qk_buf_float, + bool sparse): + -+ BaseEncoderLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse), buf_(nullptr) ++ BaseDecoderLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse), buf_(nullptr) +{ + params_.batch_size = max_batch_size; + params_.src_seq_len = max_src_seq_len; @@ -6078,31 +6511,60 @@ index 0000000..a3442da + params_.ffn_hidden_size = ffn_hidden_size; + params_.eps1 = eps1; + params_.eps2 = eps2; ++ params_.eps3 = eps3; + params_.layernorm_post = post_layernorm; + // handle + params_.cublas_handle = *cublas_handle; + params_.stream = stream; -+ params_.ffn_fp16 = true; ++ params_.ffn_fp16 = is_ffn_fp16; + // ctrls + params_.in_idx = 0; -+ params_.qkv_bias = true; -+ params_.projection_bias = true; -+ params_.is_cross = false; -+ params_.position_bias = false; + params_.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; ++ params_.projection_bias = true; ++ ++ params_.attn1.in_idx = 0; ++ params_.attn1.batch_size = max_batch_size; ++ params_.attn1.src_seq_len = max_src_seq_len; ++ 
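// [editor's note] Illustrative sketch, not part of the patch: the MSDLayer constructor below
// configures two attention descriptors, attn1 for the decoder's masked self-attention
// (is_cross = false, queries/keys/values all come from the decoder hidden states) and attn2 for
// encoder-decoder cross-attention (is_cross = true, keys/values come from the encoder output).
// A hypothetical helper expressing that split:
#include <cstddef>

struct AttnBlockConfig {
  bool is_cross;
  size_t query_len;   // decoder sequence length in both blocks
  size_t memory_len;  // decoder length for self-attention, encoder length for cross-attention
};

inline AttnBlockConfig MakeSelfAttnConfig(size_t dec_len) { return {false, dec_len, dec_len}; }
inline AttnBlockConfig MakeCrossAttnConfig(size_t dec_len, size_t enc_len) { return {true, dec_len, enc_len}; }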
params_.attn1.tgt_seq_len = max_tgt_seq_len; ++ params_.attn1.head_num = head_num; ++ params_.attn1.head_size = size_per_head; ++ params_.attn1.hidden_size = head_num * size_per_head; ++ params_.attn1.qkv_bias = true; ++ params_.attn1.projection_bias = false; ++ params_.attn1.is_cross = false; ++ params_.attn1.position_bias = false; ++ params_.attn1.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; ++ params_.attn1.cublas_handle = *cublas_handle; ++ params_.attn1.stream = stream; ++ ++ params_.attn2.in_idx = 0; ++ params_.attn2.batch_size = max_batch_size; ++ params_.attn2.src_seq_len = max_src_seq_len; ++ params_.attn2.tgt_seq_len = max_tgt_seq_len; ++ params_.attn2.head_num = head_num; ++ params_.attn2.head_size = size_per_head; ++ params_.attn2.hidden_size = head_num * size_per_head; ++ params_.attn2.qkv_bias = true; ++ params_.attn2.projection_bias = false; ++ params_.attn2.is_cross = true; ++ params_.attn2.position_bias = false; ++ params_.attn2.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; ++ params_.attn2.cublas_handle = *cublas_handle; ++ params_.attn2.stream = stream; +} + +template -+void MSELayer::allocateBuffer() ++void MSDLayer::allocateBuffer() +{ + if (buf_ == nullptr) { -+ size_t buff_size = GetEncoderLayerWorkspaceSize(¶ms_); -+ buf_ = reinterpret_cast(allocator_->reMalloc(buf_, sizeof(T) * buff_size, true)); ++ size_t buff_size = GetDecoderLayerWorkspaceSize(¶ms_); ++ std::cout<<"buff_size: "<(allocator_->reMalloc(buf_, buff_size, true)); + } +} + +template -+void MSELayer::freeBuffer() ++void MSDLayer::freeBuffer() +{ + if (buf_ != nullptr) { + allocator_->free(buf_); @@ -6111,181 +6573,200 @@ index 0000000..a3442da +} + +template -+MSELayer::~MSELayer() ++MSDLayer::~MSDLayer() +{ + cublas_wrapper_ = nullptr; + freeBuffer(); +} + +template -+void MSELayer::forward(std::vector* output_tensors, ++void MSDLayer::forward(std::vector* output_tensors, + const std::vector* input_tensors, -+ const EncoderLayerWeight* encoder_weights) -+{ ++ const DecoderLayerWeight* decoder_weights) ++{ ++ std::cout<<"forward\n"; + allocateBuffer(); // only once + void* outputs[] = {(void*)output_tensors->at(0).data}; -+ if (!params_.layernorm_post) { -+ void* inputs[] = {(void*)input_tensors->at(0).data, -+ (void*)encoder_weights->layernorm1.gamma, -+ (void*)encoder_weights->layernorm1.beta, -+ (void*)encoder_weights->qkv_weight.kernel, -+ (void*)encoder_weights->qkv_weight.bias, -+ (void*)input_tensors->at(1).data, -+ (void*)encoder_weights->attention_layer_output_weight.kernel, -+ (void*)encoder_weights->attention_layer_output_weight.bias, -+ (void*)encoder_weights->layernorm2.gamma, -+ (void*)encoder_weights->layernorm2.beta, -+ (void*)encoder_weights->encoder_output_mapping.kernel, -+ (void*)encoder_weights->encoder_output_mapping.bias, -+ (void*)encoder_weights->encoder_output_projection.kernel, -+ (void*)encoder_weights->encoder_output_projection.bias}; -+ forwardEncoder(inputs, 14, outputs, 1, ¶ms_, buf_); -+ } -+ else { ++ // std::cout<qkv_bias<< params_.attn2->qkv_bias<< !params_.attn1->position_bias<< !params_.attn2->position_bias<at(0).data, -+ (void*)encoder_weights->qkv_weight.kernel, -+ (void*)encoder_weights->qkv_weight.bias, ++ (void*)decoder_weights->layernorm1.gamma, ++ (void*)decoder_weights->layernorm1.beta, ++ (void*)decoder_weights->attention_qkv_weight.kernel, ++ (void*)decoder_weights->attention_qkv_weight.bias, + (void*)input_tensors->at(1).data, -+ (void*)encoder_weights->attention_layer_output_weight.kernel, -+ (void*)encoder_weights->attention_layer_output_weight.bias, -+ 
(void*)encoder_weights->layernorm1.gamma, -+ (void*)encoder_weights->layernorm1.beta, -+ (void*)encoder_weights->encoder_output_mapping.kernel, -+ (void*)encoder_weights->encoder_output_mapping.bias, -+ (void*)encoder_weights->encoder_output_projection.kernel, -+ (void*)encoder_weights->encoder_output_projection.bias, -+ (void*)encoder_weights->layernorm2.gamma, -+ (void*)encoder_weights->layernorm2.beta}; -+ forwardEncoder(inputs, 3, outputs, 1, ¶ms_, buf_); ++ (void*)decoder_weights->attention_layer_output_weight.kernel, ++ (void*)decoder_weights->attention_layer_output_weight.bias, ++ (void*)decoder_weights->layernorm2.gamma, ++ (void*)decoder_weights->layernorm2.beta, ++ (void*)input_tensors->at(2).data, ++ (void*)decoder_weights->attention_cross_q_weight.kernel, ++ (void*)decoder_weights->attention_cross_kv_weight.kernel, ++ (void*)decoder_weights->attention_cross_q_weight.bias, ++ (void*)input_tensors->at(3).data, ++ (void*)decoder_weights->attention_cross_layer_output_weight.kernel, ++ (void*)decoder_weights->attention_cross_layer_output_weight.bias, ++ (void*)decoder_weights->layernorm3.gamma, ++ (void*)decoder_weights->layernorm3.beta, ++ (void*)decoder_weights->decoder_output_mapping.kernel, ++ (void*)decoder_weights->decoder_output_mapping.bias, ++ (void*)decoder_weights->decoder_output_projection.kernel, ++ (void*)decoder_weights->decoder_output_projection.bias}; ++ forwardDecoder(inputs, 23, outputs, 1, ¶ms_, buf_); ++ // } ++ // else { ++ // void* inputs[] = {(void*)input_tensors->at(0).data, ++ // (void*)decoder_weights->qkv_weight.kernel, ++ // (void*)decoder_weights->qkv_weight.bias, ++ // (void*)input_tensors->at(1).data, ++ // (void*)decoder_weights->attention_layer_output_weight.kernel, ++ // (void*)decoder_weights->attention_layer_output_weight.bias, ++ // (void*)decoder_weights->layernorm1.gamma, ++ // (void*)decoder_weights->layernorm1.beta, ++ // (void*)decoder_weights->decoder_output_mapping.kernel, ++ // (void*)decoder_weights->decoder_output_mapping.bias, ++ // (void*)decoder_weights->decoder_output_projection.kernel, ++ // (void*)decoder_weights->decoder_output_projection.bias, ++ // (void*)decoder_weights->layernorm2.gamma, ++ // (void*)decoder_weights->layernorm2.beta}; ++ // forwardDecoder(inputs, 3, outputs, 1, ¶ms_, buf_); ++ // } + } -+ + return; +} + -+template class MSELayer; -+template class MSELayer; -+template class MSELayer; -+template class MSELayer; -+template class MSELayer; -+template class MSELayer; -+template class MSELayer; -+template class MSELayer; ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; + +} // namespace fastertransformer -diff --git a/src/fastertransformer/layers/encoder_layers/MSEncoderLayer.h b/src/fastertransformer/layers/encoder_layers/MSEncoderLayer.h +diff --git a/src/fastertransformer/layers/decoder_layers/MSDecoderLayer.h b/src/fastertransformer/layers/decoder_layers/MSDecoderLayer.h new file mode 100644 -index 0000000..afc6a5a +index 0000000..8908141 --- /dev/null -+++ b/src/fastertransformer/layers/encoder_layers/MSEncoderLayer.h -@@ -0,0 +1,69 @@ -+/* -+ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -+ * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. -+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. 
-+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+ * See the License for the specific language governing permissions and -+ * limitations under the License. -+ */ ++++ b/src/fastertransformer/layers/decoder_layers/MSDecoderLayer.h +@@ -0,0 +1,74 @@ ++// /* ++// * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. ++// * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. ++// * ++// * Licensed under the Apache License, Version 2.0 (the "License"); ++// * you may not use this file except in compliance with the License. ++// * You may obtain a copy of the License at ++// * ++// * http://www.apache.org/licenses/LICENSE-2.0 ++// * ++// * Unless required by applicable law or agreed to in writing, software ++// * distributed under the License is distributed on an "AS IS" BASIS, ++// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++// * See the License for the specific language governing permissions and ++// * limitations under the License. ++// */ ++ ++// #pragma once ++ ++// #include "src/fastertransformer/layers/decoder_layers/BaseDecoderLayer.h" ++// #include "src/fastertransformer/layers/decoder_layers/decoder.h" ++ ++// namespace fastertransformer { ++ ++// // TODO(haim): Add template according to "mix" compute type (fp32, fp16) ++// template ++// class MSDLayer: public BaseDecoderLayer { ++// private: ++// mutable decoderParamT params_; ++ ++// void allocateBuffer() override; ++// void freeBuffer() override; ++// void* buf_; ++// using BaseDecoderLayer::is_free_buffer_after_forward_; ++// using BaseDecoderLayer::is_allocate_buffer_; ++// using BaseDecoderLayer::cublas_wrapper_; ++// using BaseDecoderLayer::allocator_; ++ ++// protected: ++// using BaseDecoderLayer::stream_; ++// using BaseDecoderLayer::sparse_; ++ ++// public: ++// MSDLayer(size_t max_batch_size, ++// size_t max_src_seq_len, ++// size_t max_tgt_seq_len, ++// size_t head_num, ++// size_t size_per_head, ++// size_t ffn_hidden_size, ++// float eps1, ++// float eps2, ++// float eps3, ++// bool post_layernorm, ++// bool position_bias1, ++// bool position_bias2, ++// bool is_ffn_fp16, ++// cudaStream_t stream, ++// cublasMMWrapper* cublas_wrapper, ++// cublasHandle_t* cublas_handle, ++// IAllocator* allocator, ++// bool is_free_buffer_after_forward, ++// bool is_qk_buf_float, ++// bool sparse); ++ ++// MSDLayer(MSDLayer const& decoder_layer); ++ ++// virtual ~MSDLayer(); ++ ++// void forward(std::vector* output_tensors, ++// const std::vector* input_tensors, ++// const DecoderLayerWeight* decoder_weights) override; ++// }; ++ ++// } // namespace fastertransformer +diff --git a/src/fastertransformer/layers/decoder_layers/decoder.cc b/src/fastertransformer/layers/decoder_layers/decoder.cc +new file mode 100644 +index 0000000..bb5c615 +--- /dev/null ++++ b/src/fastertransformer/layers/decoder_layers/decoder.cc +@@ -0,0 +1,421 @@ + -+#pragma once ++#include "src/fastertransformer/layers/decoder_layers/decoder.h" ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/kernels/add_residual_kernels.h" ++#include "src/fastertransformer/kernels/layernorm_kernels.h" ++#include "src/fastertransformer/kernels/unfused_attention_kernels.h" ++#include 
"src/fastertransformer/layers/encoder_layers/encoder.h" + -+#include "src/fastertransformer/layers/encoder_layers/BaseEncoderLayer.h" -+#include "src/fastertransformer/layers/encoder_layers/encoder.h" -+ -+namespace fastertransformer { -+ -+// TODO(haim): Add template according to "mix" compute type (fp32, fp16) -+template -+class MSELayer: public BaseEncoderLayer { -+private: -+ encoderParamT params_; -+ void allocateBuffer() override; -+ void freeBuffer() override; -+ void* buf_; -+ using BaseEncoderLayer::is_free_buffer_after_forward_; -+ using BaseEncoderLayer::is_allocate_buffer_; -+ using BaseEncoderLayer::cublas_wrapper_; -+ using BaseEncoderLayer::allocator_; -+ -+protected: -+ using BaseEncoderLayer::stream_; -+ using BaseEncoderLayer::sparse_; -+ -+public: -+ MSELayer(size_t max_batch_size, -+ size_t max_src_seq_len, -+ size_t max_tgt_seq_len, -+ size_t head_num, -+ size_t size_per_head, -+ size_t ffn_hidden_size, -+ float eps1, -+ float eps2, -+ bool post_layernorm, -+ cudaStream_t stream, -+ cublasMMWrapper* cublas_wrapper, -+ cublasHandle_t* cublas_handle, -+ IAllocator* allocator, -+ bool is_free_buffer_after_forward, -+ bool is_qk_buf_float, -+ bool sparse); -+ -+ MSELayer(MSELayer const& encoder_layer); -+ -+ virtual ~MSELayer(); -+ -+ void forward(std::vector* output_tensors, -+ const std::vector* input_tensors, -+ const EncoderLayerWeight* encoder_weights) override; -+}; -+ -+} // namespace fastertransformer -diff --git a/src/fastertransformer/layers/encoder_layers/encoder.cc b/src/fastertransformer/layers/encoder_layers/encoder.cc -new file mode 100644 -index 0000000..004718e ---- /dev/null -+++ b/src/fastertransformer/layers/encoder_layers/encoder.cc -@@ -0,0 +1,814 @@ -+ -+#include "src/fastertransformer/layers/encoder_layers/encoder.h" -+#include "src/fastertransformer/kernels/activation_kernels.h" -+#include "src/fastertransformer/kernels/add_residual_kernels.h" -+#include "src/fastertransformer/kernels/bert_preprocess_kernels.h" -+#include "src/fastertransformer/kernels/layernorm_kernels.h" -+#include "src/fastertransformer/kernels/unfused_attention_kernels.h" +#include -+ +namespace fastertransformer { + +#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) -+#define ALIGN(x, y) (UP_DIV(x, y) * (y)) ++// #define UP_DIV(x, y) (x) +#define ALIGN_SIZE 16 + +template -+void printTensor(const std::string& str, T* input, int size) ++void printTensor(char* str, T* input, int size) +{ -+ std::cout << str; ++ printf("%s ", str); + T* input_device = input; -+ auto input_host = std::make_unique(size); -+ cudaD2Hcpy(input_host.get(), input_device, size); -+ for (int k = 0, index = 0; k < size; k++) { -+ if (index != 0) -+ std::cout << ','; -+ std::cout << input_host[k]; -+ index++; -+ if (index == 10) { ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ ++ std::cout << input_host[k] << ","; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ if (k % 10 == 0) + std::cout << std::endl; -+ index = 0; -+ } + } ++ + std::cout << std::endl; ++ ++ free(input_host); +} + +template @@ -6295,107 +6776,1078 @@ index 0000000..004718e + << " size is " << size; + T* input_device = input; + T* input_host = (T*)malloc(size * sizeof(T)); -+ cudaD2Hcpy(input_host, input_device, size); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ + for (int k = 0; k < (int)size; k++) { + if (std::isnan((float)input_host[k]) || std ::isinf((float)input_host[k])) { + std::cout << 
"found NAN or INF"; + break; + } + } ++ + std::cout << std::endl; + free(input_host); +} ++ ++template ++size_t GetAttnWorkspaceSize(decoderParamT* param) ++{ ++ size_t size_q = UP_DIV((param->batch_size * param->src_seq_len * param->hidden_size), ALIGN_SIZE) * ALIGN_SIZE; ++ size_t size_k = UP_DIV((param->batch_size * param->tgt_seq_len * param->hidden_size), ALIGN_SIZE) * ALIGN_SIZE; ++ size_t size_v = size_k; ++ size_t qkv_len = size_q + size_k + size_v; ++ size_t q_buf_2_len = size_q; ++ size_t qk_buf_len = ++ UP_DIV(param->batch_size * param->head_num * param->src_seq_len * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t qkv_buf_2_len = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t qkv_buf_3_len = qkv_buf_2_len; ++ size_t attn_out_size = ++ UP_DIV(param->batch_size * param->head_num * param->head_size * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; ++ return (qkv_len + q_buf_2_len + qk_buf_len + qkv_buf_2_len + qkv_buf_3_len + 2 * attn_out_size) * sizeof(T); ++} ++ ++template size_t GetAttnWorkspaceSize(decoderParamT* param); ++template size_t GetAttnWorkspaceSize(decoderParamT* param); +template -+T checksum(const T* tensor, int size) ++size_t GetDecoderLayerWorkspaceSize(decoderParamT* param) +{ -+ if constexpr (std::is_floating_point()) { -+ auto tensor_host = std::make_unique(size); -+ double sum = 0.; -+ T* ptr = tensor_host.get(); -+ cudaD2Hcpy(ptr, tensor, size); -+ for (int i = 0; i < size; i++) { -+ // sum += (double)ptr[i]*i; -+ sum += ptr[i]; -+ } -+ return static_cast(sum); ++ size_t attn_out = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ ; ++ size_t attn2_out = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ ; ++ ++ size_t ffn = UP_DIV(param->batch_size * param->src_seq_len * param->ffn_hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t ffn_size = (param->layernorm_post) ? ffn : (attn_out + ffn); ++ size_t out_size = (param->layernorm_post) ? 
attn_out + attn2_out : attn_out * 2 + attn2_out * 2; ++ return (std::max(GetAttnWorkspaceSize(param) * 2, ffn_size * sizeof(T)) + out_size * sizeof(T) ++ + GetAttnWorkspaceSize(param)*2); ++} ++ ++template size_t GetDecoderLayerWorkspaceSize(decoderParamT* param); ++template size_t GetDecoderLayerWorkspaceSize(decoderParamT* param); ++ ++template ++void forward_ffn(T* inputs[], int in_len, T* output[], int out_len, ParamT* param, void* ws) ++{ ++ size_t inter_size = param->ffn_hidden_size; ++ size_t h_token_num = param->batch_size * param->src_seq_len; ++ cublasOperation_t gemm_ops[] = {CUBLAS_OP_N, CUBLAS_OP_N}; ++ cudaDataType gemm_data_types[] = {CUDA_R_32F, CUDA_R_32F, CUDA_R_32F}; ++ if ((std::is_same::value) || (std::is_same::value)) { ++ gemm_data_types[0] = CUDA_R_16F; ++ gemm_data_types[1] = CUDA_R_16F; ++ gemm_data_types[2] = CUDA_R_16F; + } -+ else -+ return static_cast(0.f); ++ S alpha = 1.0f; ++ S beta = 0.0f; ++ ++ int gemm_dims[] = {(int)inter_size, (int)h_token_num, (int)param->hidden_size}; ++ int gemm_lds[] = {(int)inter_size, (int)param->hidden_size, (int)inter_size}; ++ T* normed_attn_out = reinterpret_cast(inputs[param->in_idx++]); ++ fastertransformer::CublasGemmWrapper(inputs[param->in_idx++], ++ normed_attn_out, ++ ws, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_data_types, ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++ invokeAddBiasGelu(reinterpret_cast(ws), ++ reinterpret_cast(inputs[param->in_idx++]), ++ h_token_num, ++ inter_size, ++ param->stream); ++ gemm_dims[0] = param->hidden_size; ++ gemm_dims[1] = h_token_num; ++ gemm_dims[2] = inter_size; ++ gemm_lds[0] = param->hidden_size; ++ gemm_lds[1] = inter_size; ++ gemm_lds[2] = param->hidden_size; ++ fastertransformer::CublasGemmWrapper(inputs[param->in_idx++], ++ ws, ++ output[0], ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_data_types, ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); +} + +template -+T checksumGrid(const T* tensor, const encoderParamT* param, bool zp = false, bool cross = false, bool ffn = false) ++void forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, decoderParamT* param, void* ws) +{ -+ if constexpr (std::is_floating_point()) { -+ int hidden_size; -+ if (ffn) { -+ hidden_size = param->ffn_hidden_size; ++ param->in_idx = 0; ++ size_t h_token_num = param->batch_size * param->src_seq_len; ++ T* from_tensor = reinterpret_cast(inputs[param->in_idx++]); ++ T* attn_out = reinterpret_cast(ws); ++ T* normed_from_tensor = reinterpret_cast(ws) + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ T* attn_ws = reinterpret_cast(normed_from_tensor) + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ T* normed_attn_out = normed_from_tensor; ++ T* attn2_out = reinterpret_cast(attn_ws) + GetAttnWorkspaceSize(param); ++ T* normed_from_tensor2 = reinterpret_cast(attn2_out) + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ T* attn2_ws = reinterpret_cast(normed_from_tensor2) + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ T* normed_attn2_out = normed_from_tensor2; ++ T* ffn_ws = normed_attn2_out + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ T* tmp_out = reinterpret_cast(output[0]); ++ if (std::is_same::value && param->ffn_fp16 == true) { ++ tmp_out = ffn_ws + UP_DIV(param->batch_size * param->src_seq_len * 
param->ffn_hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ } ++ T* gamma1 = reinterpret_cast(inputs[param->in_idx++]); ++ T* beta1 = reinterpret_cast(inputs[param->in_idx++]); ++ invokeGeneralLayerNorm(normed_from_tensor, ++ reinterpret_cast(from_tensor), // from tensor ++ gamma1, // Gamma ++ beta1, // Beta ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps1); ++ inputs[--param->in_idx] = normed_from_tensor; ++ // if attention is embedded inside an decoder - fuse the bias to next layer normalization ++ int in_idx = param->in_idx; ++ forward_attn(reinterpret_cast(&inputs[param->in_idx]), in_len, &attn_out, 1, &(param->attn1), attn_ws); ++ param->in_idx = param->attn1.in_idx + in_idx; ++ if (param->projection_bias) { ++ T* projection_bias = reinterpret_cast(inputs[param->in_idx++]); ++ T* gamma2 = reinterpret_cast(inputs[param->in_idx++]); ++ T* beta2 = reinterpret_cast(inputs[param->in_idx++]); ++ if (param->layernorm_post == false) { ++ invokeGeneralAddBiasResidualPreLayerNorm(attn_out, ++ normed_attn_out, ++ from_tensor, ++ gamma2, // gamma ++ beta2, // beta ++ projection_bias, ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps2); + } + else { -+ hidden_size = param->hidden_size; -+ } -+ const int size = param->batch_size * param->src_seq_len * hidden_size; -+ int head_size = hidden_size / param->head_num; -+ auto tensor_host = std::make_unique(size); -+ double sum = 0.; -+ T* ptr = tensor_host.get(); -+ try { -+ cudaD2Hcpy(ptr, tensor, size); -+ } -+ catch (...) { -+ std::cout << "copy tensor failed" << std::endl; -+ return static_cast(0.f); -+ } -+ bool compressed = param->eft && zp; -+ if (!compressed) { -+ if (cross) { -+ std::cout << "cross sum:" << std::endl; -+ for (int i = 0; i < param->batch_size; i++) { -+ for (int j = 0; j < param->head_num; j++) { -+ for (int k = 0; k < param->src_seq_len / 2; k++) { -+ for (int l = 0; l < head_size; l++) { -+ sum += ptr[(((i * param->head_num) + j) * param->src_seq_len + k) * head_size + l]; -+ } -+ } -+ } -+ } -+ } -+ else { -+ std::cout << "grid sum:" << std::endl; -+ for (int i = 0; i < param->batch_size; i++) { -+ for (int j = 0; j < param->src_seq_len / 2; j++) { -+ for (int k = 0; k < hidden_size; k++) { -+ sum += ptr[((i * param->src_seq_len) + j) * hidden_size + k]; -+ } -+ } -+ } -+ } ++ } ++ inputs[--param->in_idx] = normed_attn_out; ++ in_idx = param->in_idx; ++ forward_attn(reinterpret_cast(&inputs[param->in_idx]), in_len, &attn2_out, 1, &(param->attn2), attn2_ws); ++ param->in_idx = param->attn2.in_idx + in_idx; ++ if (param->projection_bias) { ++ T* projection_bias = reinterpret_cast(inputs[param->in_idx++]); ++ T* gamma3 = reinterpret_cast(inputs[param->in_idx++]); ++ T* beta3 = reinterpret_cast(inputs[param->in_idx++]); ++ if (std::is_same::value || param->ffn_fp16==false) { ++ invokeGeneralAddBiasResidualPreLayerNorm(attn2_out, ++ normed_attn2_out, ++ attn_out, ++ gamma3, // gamma ++ beta3, // beta ++ projection_bias, ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps3); ++ ++ } else { ++ invokeGeneralAddBiasResidualPreLayerNormCast(attn2_out, ++ reinterpret_cast(normed_attn2_out), ++ attn_out, ++ gamma3, // gamma ++ beta3, // beta ++ projection_bias, ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps3); + } -+ else { -+ std::cout << "compress sum:" << std::endl; -+ for (int i = 0; i < param->h_token_num * hidden_size; i++) { -+ sum += ptr[i]; -+ } ++ } else { ++ // without projection bias ++ } ++ inputs[--param->in_idx] = normed_attn2_out; ++ if 
(param->ffn_fp16 == false) { ++ forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, param, ffn_ws); ++ } else { ++ forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, param, ffn_ws); ++ } ++ attn2_out = param->layernorm_post ? normed_attn2_out : attn2_out; ++ if (std::is_same::value || param->ffn_fp16==false) { ++ invokeAddBiasResidual(reinterpret_cast(tmp_out), ++ attn2_out, ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias ++ h_token_num, ++ param->hidden_size, ++ param->stream); ++ } else { ++ if(param->layernorm_post){ ++ invokeAddBiasResidualSameTypeCast(reinterpret_cast(tmp_out), ++ reinterpret_cast(attn2_out), ++ reinterpret_cast(output[0]), ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias ++ h_token_num, ++ param->hidden_size, ++ param->stream); ++ } else{ ++ invokeAddBiasResidualCast(reinterpret_cast(tmp_out), ++ reinterpret_cast(attn2_out), ++ reinterpret_cast(output[0]), ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias ++ h_token_num, ++ param->hidden_size, ++ param->stream); + } -+ return static_cast(sum); ++ qkv_buf, ++ bias_qkv, ++ param->batch_size, ++ param->src_seq_len, ++ param->tgt_seq_len, ++ param->head_num, ++ param->head_size, ++ param->stream); + } + else { -+ return static_cast(0.f); ++ T* weight_qkv = reinterpret_cast(inputs[param->in_idx++]); ++ fastertransformer::CublasGemmWrapper(weight_qkv, ++ from_tensor, ++ qkv_buf, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ const_cast(gemm_data_types), ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++ T* bias_qkv = (param->qkv_bias) ? reinterpret_cast(inputs[param->in_idx++]) : nullptr; ++ fastertransformer::invokeAddFusedQKVBiasTranspose(static_cast(q_buf_2), ++ static_cast(output1), ++ static_cast(output2), ++ static_cast(qkv_buf), ++ bias_qkv, ++ param->src_seq_len, ++ param->head_num, ++ param->head_size, ++ 0, ++ param->stream); + } -+} -+ -+template -+void saveTensor(const std::string& name, T* tensor, int size) -+{ -+ auto tensor_host = std::make_unique(size); -+ T* ptr = tensor_host.get(); -+ cudaD2Hcpy(ptr, tensor, size); -+ std::ofstream wf(name + ".bin", std::ofstream::out | std::ofstream::binary); -+ wf.write(reinterpret_cast(ptr), size * sizeof(T)); -+ wf.close(); -+} ++ gemm_ops[0] = CUBLAS_OP_T; + -+void CublasGemmWrapper(const void* a_addr, ++ gemm_lds[0] = param->head_size; ++ gemm_lds[1] = param->head_size; ++ gemm_lds[2] = param->tgt_seq_len; ++ ++ int gemm_strides[] = {(int)(param->tgt_seq_len * param->head_size), ++ (int)(param->src_seq_len * param->head_size), ++ (int)(param->src_seq_len * param->tgt_seq_len)}; ++ ++ gemm_dims[0] = param->tgt_seq_len; ++ gemm_dims[1] = param->src_seq_len; ++ gemm_dims[2] = param->head_size; ++ ++ fastertransformer::CublasGemmStridedBatchedWrapper(output1, ++ q_buf_2, ++ qk_buf, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_strides, ++ const_cast(gemm_data_types), ++ &alpha, ++ &beta, ++ param->batch_size * param->head_num, ++ param->cublas_handle, ++ param->algo); ++ ++ T* attention_mask = reinterpret_cast(inputs[param->in_idx++]); ++ T* position_bias = nullptr; ++ if (param->position_bias) { ++ position_bias = reinterpret_cast(inputs[param->in_idx++]); ++ } ++ T scalar = static_cast(1.0f / sqrtf(param->head_size * 1.0f)); ++ fastertransformer::invokeMixMaskedSoftMax(static_cast(qk_buf), ++ attention_mask, ++ position_bias, ++ param->batch_size, ++ param->src_seq_len, ++ param->tgt_seq_len, ++ param->head_num, ++ scalar, ++ param->stream); ++ ++ gemm_ops[0] = CUBLAS_OP_N; ++ gemm_ops[1] = CUBLAS_OP_N; ++ 
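// The strided-batched GEMM configured next multiplies the softmaxed scores by V:
// per (batch, head), context[src_seq_len, head_size] = probs[src_seq_len, tgt_seq_len] * V[tgt_seq_len, head_size].
// Because cuBLAS is column-major, a row-major [rows, cols] buffer is passed as a cols x rows
// operand, which is why the dims below come out as m = head_size, n = src_seq_len, k = tgt_seq_len.
// Minimal sketch of that mapping, assuming the same convention as CublasGemmStridedBatchedWrapper
// (local helper for illustration only, not used by the surrounding code):
struct CtxGemmShape { int m, n, k, lda, ldb, ldc; };
auto ctx_gemm_shape = [](int head_size, int src_len, int tgt_len) {
    return CtxGemmShape{head_size, src_len, tgt_len,    // m, n, k
                        head_size, tgt_len, head_size}; // lda (V), ldb (probs), ldc (context)
};
(void)ctx_gemm_shape;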
gemm_dims[0] = param->head_size; ++ gemm_dims[1] = param->src_seq_len; ++ gemm_dims[2] = param->tgt_seq_len; ++ ++ gemm_lds[0] = param->head_size; ++ gemm_lds[1] = param->tgt_seq_len; ++ gemm_lds[2] = param->head_size; ++ ++ gemm_strides[0] = param->tgt_seq_len * param->head_size; ++ gemm_strides[1] = param->src_seq_len * param->tgt_seq_len; ++ gemm_strides[2] = param->src_seq_len * param->head_size; ++ fastertransformer::CublasGemmStridedBatchedWrapper(output2, ++ qk_buf, ++ qkv_buf_2, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_strides, ++ const_cast(gemm_data_types), ++ &alpha, ++ &beta, ++ param->batch_size * param->head_num, ++ param->cublas_handle, ++ param->algo); ++ invokeTransposeQKV(static_cast(qkv_buf_3), ++ static_cast(qkv_buf_2), ++ param->batch_size, ++ param->src_seq_len, ++ param->head_num, ++ param->head_size, ++ param->stream); ++ ++ gemm_ops[0] = CUBLAS_OP_N; ++ gemm_ops[1] = CUBLAS_OP_N; ++ gemm_dims[0] = param->hidden_size; ++ gemm_dims[1] = param->batch_size * param->src_seq_len; ++ gemm_dims[2] = param->hidden_size; ++ ++ gemm_lds[0] = param->hidden_size; ++ gemm_lds[1] = param->hidden_size; ++ gemm_lds[2] = param->hidden_size; ++ ++ fastertransformer::CublasGemmWrapper(reinterpret_cast(inputs[param->in_idx++]), ++ qkv_buf_3, ++ static_cast(output[0]), ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ const_cast(gemm_data_types), ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++ if (param->projection_bias) { ++ int len = param->batch_size * param->src_seq_len; ++ invokeAddBias( ++ static_cast(output[0]), (const T*)(inputs[param->in_idx++]), len, param->hidden_size, param->stream); ++ } ++ return; ++} ++ ++template void ++forward_attn(float* inputs[], int in_len, float* output[], int out_len, attentionParamT* param, void* ws); ++template void ++forward_attn(half* inputs[], int in_len, half* output[], int out_len, attentionParamT* param, void* ws); ++ ++template void ++forward_ffn(float* inputs[], int in_len, float* output[], int out_len, ParamT* param, void* ws); ++template void ++forward_ffn(half* inputs[], int in_len, half* output[], int out_len, ParamT* param, void* ws); ++template void ++forward_ffn(float* inputs[], int in_len, float* output[], int out_len, ParamT* param, void* ws); ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/decoder_layers/decoder.h b/src/fastertransformer/layers/decoder_layers/decoder.h +new file mode 100644 +index 0000000..c302ea8 +--- /dev/null ++++ b/src/fastertransformer/layers/decoder_layers/decoder.h +@@ -0,0 +1,112 @@ ++#pragma once ++ ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/layers/decoder_layers/BaseDecoderLayer.h" ++#include ++#include ++ ++namespace fastertransformer { ++ ++// typedef struct { ++// size_t batch_size; ++// size_t src_seq_len; ++// size_t tgt_seq_len; ++// size_t head_num; ++// size_t head_size; ++// size_t hidden_size; ++// size_t h_token_num; ++// // handle ++// cublasHandle_t cublas_handle; ++// cudaStream_t stream; ++// cublasGemmAlgo_t algo; ++// // ctrls ++// int in_idx; ++// bool qkv_bias; // ture ++// bool projection_bias; // ture ++// bool is_cross; // false ++// bool position_bias; ++// int *padding_offset; ++// } attentionParamT; ++ ++// typedef struct { ++// size_t batch_size; ++// size_t src_seq_len; ++// size_t tgt_seq_len; ++// size_t head_num; ++// size_t head_size; ++// size_t hidden_size; ++// size_t h_token_num; ++// size_t ffn_hidden_size; // 4 * param->hidden_size; ++// bool ffn_fp16; ++// float 
eps1; ++// float eps2; ++// float eps3; ++// // handle ++// cublasHandle_t cublas_handle; ++// cudaStream_t stream; ++// cublasGemmAlgo_t algo; ++// // ctrls ++// bool projection_bias; // ture ++ ++// int in_idx; ++// mutable attentionParamT attn1; ++// mutable attentionParamT attn2; ++// bool layernorm_post; ++// int *padding_offset; ++// } decoderParamT; ++// typedef struct{ ++// public: ++// size_t batch_size; ++// size_t src_seq_len; ++// size_t tgt_seq_len; ++// size_t head_num; ++// size_t head_size; ++// size_t hidden_size; ++// size_t h_token_num; ++// size_t ffn_hidden_size; ++// // handle ++// cublasHandle_t cublas_handle; ++// cudaStream_t stream; ++// cublasGemmAlgo_t algo; ++// // ctrls ++// int *padding_offset; ++// int in_idx; ++ ++// } ParamT; ++ ++// typedef struct : ParamT{ ++ ++// // ctrls ++// bool qkv_bias; // ture ++// bool projection_bias; // ture ++// bool is_cross; // false ++// bool position_bias; ++// int *padding_offset; ++// } attentionParamT; ++ ++// typedef struct : ParamT{ ++ ++// bool ffn_fp16; ++// float eps1; ++// float eps2; ++// float eps3; ++ ++// bool projection_bias; // ture ++ ++// mutable attentionParamT attn1; ++// mutable attentionParamT attn2; ++// bool layernorm_post; ++// int *padding_offset; ++// } decoderParamT; ++// template ++// size_t GetDecoderLayerWorkspaceSize(decoderParamT* param); ++ ++// template ++// size_t GetAttnWorkspaceSize(decoderParamT* param); ++// template ++// void forward_attn(T* inputs[], int in_len, T* output[], int out_len, attentionParamT* param, void* ws); ++// template ++// void forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, decoderParamT* param, void* ws); ++// void forwardDecoder(std::vector > const* ++// inputs); ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/encoder_layers/BaseEncoderLayer.h b/src/fastertransformer/layers/encoder_layers/BaseEncoderLayer.h +new file mode 100644 +index 0000000..3b43391 +--- /dev/null ++++ b/src/fastertransformer/layers/encoder_layers/BaseEncoderLayer.h +@@ -0,0 +1,76 @@ ++/* ++ * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#pragma once ++ ++#include ++#include ++ ++#include "3rdparty/trt_fused_multihead_attention/fused_multihead_attention_common.h" ++#include "src/fastertransformer/layers/BaseLayer.h" ++#include "src/fastertransformer/layers/encoder_layers/EncoderLayerWeight.h" ++#include "src/fastertransformer/utils/Tensor.h" ++#include "src/fastertransformer/utils/allocator.h" ++#include "src/fastertransformer/utils/cublasMMWrapper.h" ++#include "src/fastertransformer/utils/memory_utils.h" ++ ++namespace fastertransformer { ++ ++enum class EncoderLayerType { ++ UNFUSED_ENCODER_LAYER, ++ FUSED_ENCODER_LAYER ++}; ++ ++template ++EncoderLayerType getEncoderLayerType(size_t size_per_head, const int sm, const bool remove_padding, ++ const int max_seq_len, const bool is_fuse = true) { ++ if (std::is_same::value && (sm == kSM_70 || sm == kSM_86 || sm == kSM_80 || sm == kSM_75 || sm == kSM_72) ++ && size_per_head == 64 && max_seq_len <= 384 && is_fuse == true) { ++ return remove_padding ? EncoderLayerType::FUSED_ENCODER_LAYER : EncoderLayerType::FUSED_ENCODER_LAYER; ++ } else { ++ return remove_padding ? EncoderLayerType::FUSED_ENCODER_LAYER : EncoderLayerType::FUSED_ENCODER_LAYER; ++ } ++} ++ ++template ++EncoderLayerType getEncoderLayerTypeINT8(size_t size_per_head, const int sm, const bool remove_padding, ++ const int max_seq_len, const int int8_mode) { ++ if ((int8_mode == 1 || int8_mode == 2) && (sm == kSM_86 || sm == kSM_80 || sm == kSM_75) && size_per_head == 64 ++ && max_seq_len <= 384) { ++ return remove_padding ? EncoderLayerType::FUSED_ENCODER_LAYER : EncoderLayerType::FUSED_ENCODER_LAYER; ++ } else { ++ return remove_padding ? EncoderLayerType::FUSED_ENCODER_LAYER : EncoderLayerType::FUSED_ENCODER_LAYER; ++ } ++} ++ ++template ++class BaseEncoderLayer: public BaseLayer { ++ ++public: ++ virtual void forward(std::vector* output_tensors, ++ const std::vector* input_tensors, ++ const EncoderLayerWeight* encoder_layer_weights) = 0; ++ BaseEncoderLayer(cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ IAllocator* allocator, ++ bool is_free_buffer_after_forward, ++ bool sparse = false): ++ BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, nullptr, sparse) ++ { ++ } ++ virtual ~BaseEncoderLayer() = default; ++}; ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/encoder_layers/CMakeLists.txt b/src/fastertransformer/layers/encoder_layers/CMakeLists.txt +new file mode 100644 +index 0000000..1a3af85 +--- /dev/null ++++ b/src/fastertransformer/layers/encoder_layers/CMakeLists.txt +@@ -0,0 +1,21 @@ ++# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. 
++ ++cmake_minimum_required(VERSION 3.8) ++ ++add_library(EncoderLayer STATIC encoder.cc MSEncoderLayer.cc) ++set_property(TARGET EncoderLayer PROPERTY POSITION_INDEPENDENT_CODE ON) ++set_property(TARGET EncoderLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) ++target_link_libraries(EncoderLayer PUBLIC -lcublas -lcudart unfused_attention_kernels activation_kernels ++ layernorm_kernels add_residual_kernels bert_preprocess_kernels) +diff --git a/src/fastertransformer/layers/encoder_layers/EncoderLayerWeight.h b/src/fastertransformer/layers/encoder_layers/EncoderLayerWeight.h +new file mode 100644 +index 0000000..c441b23 +--- /dev/null ++++ b/src/fastertransformer/layers/encoder_layers/EncoderLayerWeight.h +@@ -0,0 +1,33 @@ ++/* ++ * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#pragma once ++ ++#include "src/fastertransformer/layers/DenseWeight.h" ++#include "src/fastertransformer/kernels/layernorm_kernels.h" ++namespace fastertransformer { ++ ++template ++struct EncoderLayerWeight { ++ DenseWeight qkv_weight; ++ DenseWeight attention_layer_output_weight; ++ DenseWeight encoder_output_mapping; ++ DenseWeight encoder_output_projection; ++ LayerNormWeight layernorm1; ++ LayerNormWeight layernorm2; ++}; ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/encoder_layers/MSEncoderLayer.cc b/src/fastertransformer/layers/encoder_layers/MSEncoderLayer.cc +new file mode 100644 +index 0000000..4075695 +--- /dev/null ++++ b/src/fastertransformer/layers/encoder_layers/MSEncoderLayer.cc +@@ -0,0 +1,198 @@ ++/* ++ * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. ++ * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#include "src/fastertransformer/layers/encoder_layers/MSEncoderLayer.h" ++#include "src/fastertransformer/kernels/activation_kernels.h" ++ ++namespace fastertransformer { ++template ++void printTensor(char* str, T* input, int size) ++{ ++ printf("%s ", str); ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ ++ std::cout << input_host[k] << ","; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ } ++ ++ std::cout << std::endl; ++ ++ free(input_host); ++} ++template ++MSELayer::MSELayer(size_t max_batch_size, ++ size_t max_src_seq_len, ++ size_t max_tgt_seq_len, ++ size_t head_num, ++ size_t size_per_head, ++ size_t ffn_hidden_size, ++ float eps1, ++ float eps2, ++ bool post_layernorm, ++ bool position_bias, ++ bool is_ffn_fp16, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ cublasHandle_t* cublas_handle, ++ IAllocator* allocator, ++ bool is_free_buffer_after_forward, ++ bool is_qk_buf_float, ++ bool sparse): ++ ++ BaseEncoderLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse), buf_(nullptr) ++{ ++ params_.batch_size = max_batch_size; ++ params_.src_seq_len = max_src_seq_len; ++ params_.tgt_seq_len = max_tgt_seq_len; ++ params_.head_num = head_num; ++ params_.head_size = size_per_head; ++ params_.hidden_size = head_num * size_per_head; ++ params_.ffn_hidden_size = ffn_hidden_size; ++ params_.eps1 = eps1; ++ params_.eps2 = eps2; ++ params_.layernorm_post = post_layernorm; ++ // handle ++ params_.cublas_handle = *cublas_handle; ++ params_.stream = stream; ++ params_.ffn_fp16 = is_ffn_fp16; ++ // ctrls ++ params_.in_idx = 0; ++ params_.position_bias = position_bias; ++ params_.qkv_bias = !params_.position_bias; ++ params_.projection_bias = !params_.position_bias; ++ params_.is_cross = false; ++ params_.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; ++} ++ ++template ++void MSELayer::allocateBuffer() ++{ ++ if (buf_ == nullptr) { ++ size_t buff_size = GetEncoderLayerWorkspaceSize(¶ms_); ++ buf_ = reinterpret_cast(allocator_->reMalloc(buf_, sizeof(T) * buff_size, true)); ++ } ++} ++ ++template ++void MSELayer::freeBuffer() ++{ ++ if (buf_ != nullptr) { ++ allocator_->free(buf_); ++ buf_ = nullptr; ++ } ++} ++ ++template ++MSELayer::~MSELayer() ++{ ++ cublas_wrapper_ = nullptr; ++ freeBuffer(); ++} ++ ++template ++void MSELayer::forward(std::vector* output_tensors, ++ const std::vector* input_tensors, ++ const EncoderLayerWeight* encoder_weights) ++{ ++ allocateBuffer(); // only once ++ void* outputs[] = {(void*)output_tensors->at(0).data}; ++ if (!params_.layernorm_post) { ++ if (params_.position_bias) { ++ void* inputs[] = { ++ (void*)input_tensors->at(0).data, ++ (void*)encoder_weights->layernorm1.gamma, ++ (void*)encoder_weights->qkv_weight.kernel, ++ (void*)input_tensors->at(1).data, ++ (void*)encoder_weights->attention_layer_output_weight.kernel, ++ (void*)encoder_weights->layernorm2.gamma, ++ (void*)encoder_weights->encoder_output_mapping.kernel, ++ (void*)encoder_weights->encoder_output_projection.kernel, ++ (void*)input_tensors->at(2).data, ++ }; ++ forwardEncoder(inputs, 9, outputs, 1, ¶ms_, buf_); ++ } ++ else { ++ void* inputs[] = {(void*)input_tensors->at(0).data, ++ (void*)encoder_weights->layernorm1.gamma, ++ (void*)encoder_weights->layernorm1.beta, ++ (void*)encoder_weights->qkv_weight.kernel, ++ (void*)encoder_weights->qkv_weight.bias, ++ (void*)input_tensors->at(1).data, ++ 
(void*)encoder_weights->attention_layer_output_weight.kernel, ++ (void*)encoder_weights->attention_layer_output_weight.bias, ++ (void*)encoder_weights->layernorm2.gamma, ++ (void*)encoder_weights->layernorm2.beta, ++ (void*)encoder_weights->encoder_output_mapping.kernel, ++ (void*)encoder_weights->encoder_output_mapping.bias, ++ (void*)encoder_weights->encoder_output_projection.kernel, ++ (void*)encoder_weights->encoder_output_projection.bias}; ++ forwardEncoder(inputs, 14, outputs, 1, ¶ms_, buf_); ++ } ++ } ++ else { ++ if (params_.position_bias) { ++ void* inputs[] = { ++ (void*)input_tensors->at(0).data, ++ (void*)encoder_weights->qkv_weight.kernel, ++ (void*)input_tensors->at(1).data, ++ (void*)encoder_weights->attention_layer_output_weight.kernel, ++ (void*)encoder_weights->layernorm1.gamma, ++ (void*)encoder_weights->encoder_output_mapping.kernel, ++ (void*)encoder_weights->encoder_output_projection.kernel, ++ (void*)encoder_weights->layernorm2.gamma, ++ (void*)input_tensors->at(2).data, ++ }; ++ forwardEncoder(inputs, 9, outputs, 1, ¶ms_, buf_); ++ } ++ else { ++ void* inputs[] = {(void*)input_tensors->at(0).data, ++ (void*)encoder_weights->qkv_weight.kernel, ++ (void*)encoder_weights->qkv_weight.bias, ++ (void*)input_tensors->at(1).data, ++ (void*)encoder_weights->attention_layer_output_weight.kernel, ++ (void*)encoder_weights->attention_layer_output_weight.bias, ++ (void*)encoder_weights->layernorm1.gamma, ++ (void*)encoder_weights->layernorm1.beta, ++ (void*)encoder_weights->encoder_output_mapping.kernel, ++ (void*)encoder_weights->encoder_output_mapping.bias, ++ (void*)encoder_weights->encoder_output_projection.kernel, ++ (void*)encoder_weights->encoder_output_projection.bias, ++ (void*)encoder_weights->layernorm2.gamma, ++ (void*)encoder_weights->layernorm2.beta}; ++ forwardEncoder(inputs, 14, outputs, 1, ¶ms_, buf_); ++ } ++ } ++ ++ return; ++} ++ ++template class MSELayer; ++template class MSELayer; ++template class MSELayer; ++template class MSELayer; ++template class MSELayer; ++template class MSELayer; ++template class MSELayer; ++template class MSELayer; ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/encoder_layers/MSEncoderLayer.h b/src/fastertransformer/layers/encoder_layers/MSEncoderLayer.h +new file mode 100644 +index 0000000..33de2ba +--- /dev/null ++++ b/src/fastertransformer/layers/encoder_layers/MSEncoderLayer.h +@@ -0,0 +1,71 @@ ++/* ++ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. ++ * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#pragma once ++ ++#include "src/fastertransformer/layers/encoder_layers/BaseEncoderLayer.h" ++#include "src/fastertransformer/layers/encoder_layers/encoder.h" ++ ++namespace fastertransformer { ++ ++// TODO(haim): Add template according to "mix" compute type (fp32, fp16) ++template ++class MSELayer: public BaseEncoderLayer { ++private: ++ encoderParamT params_; ++ void allocateBuffer() override; ++ void freeBuffer() override; ++ void* buf_; ++ using BaseEncoderLayer::is_free_buffer_after_forward_; ++ using BaseEncoderLayer::is_allocate_buffer_; ++ using BaseEncoderLayer::cublas_wrapper_; ++ using BaseEncoderLayer::allocator_; ++ ++protected: ++ using BaseEncoderLayer::stream_; ++ using BaseEncoderLayer::sparse_; ++ ++public: ++ MSELayer(size_t max_batch_size, ++ size_t max_src_seq_len, ++ size_t max_tgt_seq_len, ++ size_t head_num, ++ size_t size_per_head, ++ size_t ffn_hidden_size, ++ float eps1, ++ float eps2, ++ bool post_layernorm, ++ bool position_bias, ++ bool is_ffn_fp16, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ cublasHandle_t* cublas_handle, ++ IAllocator* allocator, ++ bool is_free_buffer_after_forward, ++ bool is_qk_buf_float, ++ bool sparse); ++ ++ MSELayer(MSELayer const& encoder_layer); ++ ++ virtual ~MSELayer(); ++ ++ void forward(std::vector* output_tensors, ++ const std::vector* input_tensors, ++ const EncoderLayerWeight* encoder_weights) override; ++}; ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/encoder_layers/encoder.cc b/src/fastertransformer/layers/encoder_layers/encoder.cc +new file mode 100644 +index 0000000..c0b4f37 +--- /dev/null ++++ b/src/fastertransformer/layers/encoder_layers/encoder.cc +@@ -0,0 +1,815 @@ ++ ++#include "src/fastertransformer/layers/encoder_layers/encoder.h" ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/kernels/add_residual_kernels.h" ++#include "src/fastertransformer/kernels/bert_preprocess_kernels.h" ++#include "src/fastertransformer/kernels/layernorm_kernels.h" ++#include "src/fastertransformer/kernels/unfused_attention_kernels.h" ++#include ++ ++namespace fastertransformer { ++ ++#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) ++#define ALIGN(x, y) (UP_DIV(x, y) * (y)) ++#define ALIGN_SIZE 16 ++ ++template ++void printTensor(const std::string& str, T* input, int size) ++{ ++ std::cout << str; ++ T* input_device = input; ++ auto input_host = std::make_unique(size); ++ cudaD2Hcpy(input_host.get(), input_device, size); ++ for (int k = 0, index = 0; k < size; k++) { ++ if (index != 0) ++ std::cout << ','; ++ std::cout << input_host[k]; ++ index++; ++ if (index == 10) { ++ std::cout << std::endl; ++ index = 0; ++ } ++ } ++ std::cout << std::endl; ++} ++ ++template ++void isNan(char* str, T* input, int size) ++{ ++ std::cout << str << " " ++ << " size is " << size; ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ cudaD2Hcpy(input_host, input_device, size); ++ for (int k = 0; k < (int)size; k++) { ++ if (std::isnan((float)input_host[k]) || std ::isinf((float)input_host[k])) { ++ std::cout << "found NAN or INF"; ++ break; ++ } ++ } ++ std::cout << std::endl; ++ free(input_host); ++} ++template ++T checksum(const T* tensor, int size) ++{ ++ if constexpr (std::is_floating_point()) { ++ auto tensor_host = std::make_unique(size); ++ double sum = 0.; ++ T* ptr = tensor_host.get(); ++ cudaD2Hcpy(ptr, tensor, size); ++ for (int i = 0; i < size; i++) { ++ // sum += (double)ptr[i]*i; ++ sum += ptr[i]; ++ } ++ 
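// Accumulating in double keeps the sum stable enough to compare an FP16-FFN run against the
// FP32 reference of the same layer; checksum()/checksumGrid()/saveTensor() are host-side debug
// aids built on cudaD2Hcpy. A typical (hypothetical) probe from forwardEncoder() would be:
//   std::cout << "attn_out " << checksum(attn_out, param->h_token_num * param->hidden_size) << std::endl;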
return static_cast(sum); ++ } ++ else ++ return static_cast(0.f); ++} ++ ++template ++T checksumGrid(const T* tensor, const encoderParamT* param, bool zp = false, bool cross = false, bool ffn = false) ++{ ++ if constexpr (std::is_floating_point()) { ++ int hidden_size; ++ if (ffn) { ++ hidden_size = param->ffn_hidden_size; ++ } ++ else { ++ hidden_size = param->hidden_size; ++ } ++ const int size = param->batch_size * param->src_seq_len * hidden_size; ++ int head_size = hidden_size / param->head_num; ++ auto tensor_host = std::make_unique(size); ++ double sum = 0.; ++ T* ptr = tensor_host.get(); ++ try { ++ cudaD2Hcpy(ptr, tensor, size); ++ } ++ catch (...) { ++ std::cout << "copy tensor failed" << std::endl; ++ return static_cast(0.f); ++ } ++ bool compressed = param->eft && zp; ++ if (!compressed) { ++ if (cross) { ++ std::cout << "cross sum:" << std::endl; ++ for (int i = 0; i < param->batch_size; i++) { ++ for (int j = 0; j < param->head_num; j++) { ++ for (int k = 0; k < param->src_seq_len / 2; k++) { ++ for (int l = 0; l < head_size; l++) { ++ sum += ptr[(((i * param->head_num) + j) * param->src_seq_len + k) * head_size + l]; ++ } ++ } ++ } ++ } ++ } ++ else { ++ std::cout << "grid sum:" << std::endl; ++ for (int i = 0; i < param->batch_size; i++) { ++ for (int j = 0; j < param->src_seq_len / 2; j++) { ++ for (int k = 0; k < hidden_size; k++) { ++ sum += ptr[((i * param->src_seq_len) + j) * hidden_size + k]; ++ } ++ } ++ } ++ } ++ } ++ else { ++ std::cout << "compress sum:" << std::endl; ++ for (int i = 0; i < param->h_token_num * hidden_size; i++) { ++ sum += ptr[i]; ++ } ++ } ++ return static_cast(sum); ++ } ++ else { ++ return static_cast(0.f); ++ } ++} ++ ++template ++void saveTensor(const std::string& name, T* tensor, int size) ++{ ++ auto tensor_host = std::make_unique(size); ++ T* ptr = tensor_host.get(); ++ cudaD2Hcpy(ptr, tensor, size); ++ std::ofstream wf(name + ".bin", std::ofstream::out | std::ofstream::binary); ++ wf.write(reinterpret_cast(ptr), size * sizeof(T)); ++ wf.close(); ++} ++ ++void CublasGemmWrapper(const void* a_addr, + const void* b_addr, + void* c_addr, + const int* params, @@ -6407,397 +7859,2549 @@ index 0000000..004718e + cublasHandle_t cublas_handle, + cublasGemmAlgo_t algo) +{ -+ const int m = params[0]; -+ const int n = params[1]; -+ const int k = params[2]; -+ cublasOperation_t trans_a = operations[0]; -+ cublasOperation_t trans_b = operations[1]; -+ const int lda = lds[0]; -+ const int ldb = lds[1]; -+ const int ldc = lds[2]; -+ cudaDataType type_a = data_types[0]; -+ cudaDataType type_b = data_types[1]; -+ cudaDataType type_c = data_types[2]; -+ cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F_FAST_TF32; -+ if ((type_a == CUDA_R_16F) && (type_b == CUDA_R_16F) && (type_c == CUDA_R_16F)) { -+ compute_type = CUBLAS_COMPUTE_16F; ++ const int m = params[0]; ++ const int n = params[1]; ++ const int k = params[2]; ++ cublasOperation_t trans_a = operations[0]; ++ cublasOperation_t trans_b = operations[1]; ++ const int lda = lds[0]; ++ const int ldb = lds[1]; ++ const int ldc = lds[2]; ++ cudaDataType type_a = data_types[0]; ++ cudaDataType type_b = data_types[1]; ++ cudaDataType type_c = data_types[2]; ++ cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F_FAST_TF32; ++ if ((type_a == CUDA_R_16F) && (type_b == CUDA_R_16F) && (type_c == CUDA_R_16F)) { ++ compute_type = CUBLAS_COMPUTE_16F; ++ } ++ cublasGemmEx(cublas_handle, ++ trans_a, ++ trans_b, ++ m, ++ n, ++ k, ++ alpha, ++ a_addr, ++ type_a, ++ lda, ++ b_addr, ++ type_b, ++ ldb, ++ beta, ++ 
c_addr, ++ type_c, ++ ldc, ++ compute_type, ++ algo); ++} ++ ++void CublasGemmStridedBatchedWrapper(const void* a_addr, ++ const void* b_addr, ++ void* c_addr, ++ const int* params, ++ const int* lds, ++ const cublasOperation_t* operations, ++ const int* strides, ++ const cudaDataType* data_types, ++ void* alpha, ++ void* beta, ++ int batch, ++ cublasHandle_t cublas_handle, ++ cublasGemmAlgo_t algo) ++{ ++ const int m = params[0]; ++ const int n = params[1]; ++ const int k = params[2]; ++ cublasOperation_t trans_a = operations[0]; ++ cublasOperation_t trans_b = operations[1]; ++ const int lda = lds[0]; ++ const int ldb = lds[1]; ++ const int ldc = lds[2]; ++ cudaDataType type_a = data_types[0]; ++ cudaDataType type_b = data_types[1]; ++ cudaDataType type_c = data_types[2]; ++ cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F_FAST_TF32; ++ // cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F_FAST_16F; ++ ++ if ((type_a == CUDA_R_16F) && (type_b == CUDA_R_16F) && (type_c == CUDA_R_16F)) { ++ compute_type = CUBLAS_COMPUTE_16F; ++ } ++ const int stride_a = strides[0]; ++ const int stride_b = strides[1]; ++ const int stride_c = strides[2]; ++ cublasGemmStridedBatchedEx(cublas_handle, ++ trans_a, ++ trans_b, ++ m, ++ n, ++ k, ++ alpha, ++ a_addr, ++ type_a, ++ lda, ++ stride_a, ++ b_addr, ++ type_b, ++ ldb, ++ stride_b, ++ beta, ++ c_addr, ++ type_c, ++ ldc, ++ stride_c, ++ batch, ++ compute_type, ++ algo); ++} ++ ++template ++size_t GetAttnWorkspaceSize(encoderParamT* param) ++{ ++ size_t size_q = ALIGN((param->batch_size * param->src_seq_len * param->hidden_size), ALIGN_SIZE); ++ size_t size_k = ALIGN((param->batch_size * param->tgt_seq_len * param->hidden_size), ALIGN_SIZE); ++ size_t size_v = size_k; ++ size_t qkv_len = size_q + size_k + size_v; ++ size_t qk_buf_len = ++ ALIGN(param->batch_size * param->head_num * param->src_seq_len * param->tgt_seq_len, ALIGN_SIZE); ++ size_t qkv_buf_2_len = ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); ++ size_t attn_out_size = ++ ALIGN(param->batch_size * param->head_num * param->head_size * param->tgt_seq_len, ALIGN_SIZE); ++ return (qkv_buf_2_len + 2 * attn_out_size + std::max(qkv_len, qk_buf_len)) * sizeof(T); ++} ++ ++template size_t GetAttnWorkspaceSize(encoderParamT* param); ++template size_t GetAttnWorkspaceSize(encoderParamT* param); ++template ++size_t GetEncoderLayerWorkspaceSize(encoderParamT* param) ++{ ++ size_t max_hidden = ALIGN(std::max(param->hidden_size, param->ffn_hidden_size),ALIGN_SIZE); ++ size_t compress_buffer_len = ALIGN(param->batch_size * param->src_seq_len * max_hidden,ALIGN_SIZE); ++ size_t padding_len = ALIGN(param->batch_size * param->src_seq_len,ALIGN_SIZE); ++ size_t offset_len = ALIGN(param->batch_size,ALIGN_SIZE); ++ size_t d_token_len = ALIGN(1,ALIGN_SIZE); ++ size_t eft_size = compress_buffer_len * sizeof(T) + (padding_len + offset_len) * sizeof(int) + d_token_len * sizeof(size_t); ++ size_t attn_out = ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); ++ size_t ffn = ALIGN(param->batch_size * param->src_seq_len * param->ffn_hidden_size, ALIGN_SIZE); ++ return (std::max(GetAttnWorkspaceSize(param), ffn * sizeof(T)) + (attn_out * 3) * sizeof(T)) + eft_size; ++} ++ ++template size_t GetEncoderLayerWorkspaceSize(encoderParamT* param); ++template size_t GetEncoderLayerWorkspaceSize(encoderParamT* param); ++ ++template ++void forward_ffn(T* inputs[], int in_len, T* output[], int out_len, encoderParamT* param, void* ws) ++{ ++ size_t inter_size = 
param->ffn_hidden_size; ++ size_t h_token_num = param->h_token_num; ++ cublasOperation_t gemm_ops[] = {CUBLAS_OP_N, CUBLAS_OP_N}; ++ cudaDataType gemm_data_types[] = {CUDA_R_32F, CUDA_R_32F, CUDA_R_32F}; ++ if ((std::is_same::value) || (std::is_same::value)) { ++ gemm_data_types[0] = CUDA_R_16F; ++ gemm_data_types[1] = CUDA_R_16F; ++ gemm_data_types[2] = CUDA_R_16F; ++ } ++ S alpha = 1.0f; ++ S beta = 0.0f; ++ ++ int gemm_dims[] = {(int)inter_size, (int)h_token_num, (int)param->hidden_size}; ++ int gemm_lds[] = {(int)inter_size, (int)param->hidden_size, (int)inter_size}; ++ T* normed_attn_out = reinterpret_cast(inputs[param->in_idx++]); ++ CublasGemmWrapper(inputs[param->in_idx++], ++ normed_attn_out, ++ ws, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_data_types, ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++ invokeAddBiasGelu(reinterpret_cast(ws), ++ reinterpret_cast(inputs[param->in_idx++]), ++ h_token_num, ++ inter_size, ++ param->stream); ++ gemm_dims[0] = param->hidden_size; ++ gemm_dims[1] = h_token_num; ++ gemm_dims[2] = inter_size; ++ gemm_lds[0] = param->hidden_size; ++ gemm_lds[1] = inter_size; ++ gemm_lds[2] = param->hidden_size; ++ CublasGemmWrapper(inputs[param->in_idx++], ++ ws, ++ output[0], ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_data_types, ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++} ++ ++template ++void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, encoderParamT* param, void* ws) ++{ ++ param->in_idx = 0; ++ size_t h_token_num = param->batch_size * param->src_seq_len; ++ param->h_token_num = h_token_num; ++ param->padding_offset = nullptr; ++ int* d_sequence_lengths = nullptr; ++ T* input_tensor = reinterpret_cast(inputs[param->in_idx++]); ++ T* from_tensor = input_tensor; ++ T* compress_buffer; ++ compress_buffer = reinterpret_cast(ws); ++ ws = reinterpret_cast(reinterpret_cast(ws) + ALIGN(h_token_num * param->hidden_size,ALIGN_SIZE)); ++ int* padding_offset = reinterpret_cast(ws); ++ ws = reinterpret_cast(reinterpret_cast(ws) + ALIGN(param->batch_size * param->src_seq_len,ALIGN_SIZE)); ++ d_sequence_lengths = reinterpret_cast(ws); ++ param->d_sequence_length = d_sequence_lengths; ++ ws = reinterpret_cast(reinterpret_cast(ws) + ALIGN(param->batch_size,ALIGN_SIZE)); ++ size_t* d_token_num = reinterpret_cast(ws); ++ ws = reinterpret_cast(reinterpret_cast(ws) + ALIGN(1,ALIGN_SIZE)); ++ invokeBuildSequnceLength( ++ from_tensor, param->batch_size, d_sequence_lengths, param->src_seq_len, param->hidden_size, param->stream); ++ // printTensor("seq_len=",d_sequence_lengths,param->batch_size); ++ invokeGetPaddingOffset(&h_token_num, ++ d_token_num, ++ padding_offset, ++ d_sequence_lengths, ++ param->batch_size, ++ param->src_seq_len, ++ param->stream); ++ // std::cout << "token=" << h_token_num << "m=" << param->batch_size * param->src_seq_len << std::endl; ++ if (h_token_num * 2 <= param->batch_size * param->src_seq_len) { ++ param->eft = true; ++ invokeRemovePadding(compress_buffer, ++ (const T*)from_tensor, ++ padding_offset, ++ h_token_num, ++ param->head_num * param->head_size, ++ param->stream); ++ param->h_token_num = h_token_num; ++ param->padding_offset = padding_offset; ++ from_tensor = compress_buffer; ++ } ++ h_token_num = param->h_token_num; ++ T* attn_out = reinterpret_cast(ws); ++ T* normed_from_tensor = ++ reinterpret_cast(ws) + ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); ++ T* attn_ws_offset = (param->layernorm_post) ? 
reinterpret_cast(ws) : reinterpret_cast(normed_from_tensor); ++ T* attn_ws = attn_ws_offset + ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); ++ T* normed_attn_out = normed_from_tensor; ++ T* ffn_ws = normed_attn_out + ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); ++ ++ T* tmp_out = reinterpret_cast(output[0]); ++ if (param->padding_offset != nullptr || (std::is_same::value && param->ffn_fp16 == true)) { ++ tmp_out = ffn_ws + ALIGN(param->batch_size * param->src_seq_len * param->ffn_hidden_size, ALIGN_SIZE); ++ } ++ T* tmp_out1 = reinterpret_cast(output[0]); ++ T* out_buf = tmp_out; ++ if (param->padding_offset != nullptr) { ++ tmp_out1 = compress_buffer; ++ } ++ if (param->layernorm_post == false) { ++ T* gamma1 = reinterpret_cast(inputs[param->in_idx++]); ++ T* beta1 = reinterpret_cast(inputs[param->in_idx++]); ++ ++ invokeGeneralLayerNorm(normed_from_tensor, ++ reinterpret_cast(from_tensor), // from tensor ++ gamma1, // Gamma ++ beta1, // Beta ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps1); ++ } ++ else { ++ normed_from_tensor = from_tensor; ++ } ++ inputs[--param->in_idx] = normed_from_tensor; ++ // if attention is embedded inside an encoder - fuse the bias to next layer normalization ++ bool projection_bias = param->projection_bias; ++ param->projection_bias = false; ++ int in_idx = param->in_idx; ++ forward_attn(reinterpret_cast(&inputs[param->in_idx]), in_len, &attn_out, 1, param, attn_ws); ++ param->in_idx += in_idx; ++ param->projection_bias = projection_bias; ++ if (param->projection_bias) { ++ T* projection_bias = reinterpret_cast(inputs[param->in_idx++]); ++ T* gamma2 = reinterpret_cast(inputs[param->in_idx++]); ++ T* beta2 = reinterpret_cast(inputs[param->in_idx++]); ++ if (param->layernorm_post == false) { ++ if (std::is_same::value || param->ffn_fp16 == false) { ++ invokeGeneralAddBiasResidualPreLayerNorm(attn_out, ++ normed_attn_out, ++ from_tensor, ++ gamma2, // gamma ++ beta2, // beta ++ projection_bias, ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps2); ++ } ++ else { ++ invokeGeneralAddBiasResidualPreLayerNormCast(attn_out, ++ reinterpret_cast(normed_attn_out), ++ from_tensor, ++ gamma2, // gamma ++ beta2, // beta ++ projection_bias, ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps2); ++ } ++ } ++ else { ++ if (std::is_same::value || param->ffn_fp16 == false) { ++ invokeAddBiasResidualLayerNorm(attn_out, ++ from_tensor, ++ projection_bias, ++ gamma2, // gamma ++ beta2, // beta ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps1); ++ normed_attn_out = attn_out; ++ } ++ else { ++ invokeAddBiasResidualLayerNormCast(reinterpret_cast(attn_out), ++ reinterpret_cast(normed_attn_out), ++ reinterpret_cast(from_tensor), ++ projection_bias, ++ gamma2, // gamma ++ beta2, // beta ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps1); ++ // isNan((char*)"LN 1 model", (half*)attn_out, h_token_num * param->hidden_size); ++ } ++ } ++ } ++ else { ++ // without projection bias ++ } ++ // forward ffn ++ // simulate attention inputs ++ inputs[--param->in_idx] = normed_attn_out; ++ if (param->ffn_fp16 == false) { ++ forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, param, ffn_ws); ++ } ++ else { ++ forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, param, ffn_ws); ++ } ++ if (param->layernorm_post == true) { ++ if (std::is_same::value || param->ffn_fp16 == false) { ++ 
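// In this branch the FFN output (tmp_out) is already in T (either the model runs in half, or the
// FFN stayed in the model precision), so the fused add-bias + residual + LayerNorm kernel can run
// on it directly. The else branch covers the float-model / fp16-FFN mix, where the *_Cast kernel
// converts the half FFN output back to float while applying the same bias, residual and LayerNorm.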
invokeAddBiasResidualLayerNorm(reinterpret_cast(tmp_out), ++ attn_out, ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias, ++ reinterpret_cast(inputs[param->in_idx++]), // Gamma ++ reinterpret_cast(inputs[param->in_idx++]), // Beta ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps2); ++ } ++ else { ++ invokeAddBiasResidualLayerNormCast( ++ reinterpret_cast(tmp_out), ++ reinterpret_cast(tmp_out1), ++ reinterpret_cast(normed_attn_out), ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias, ++ reinterpret_cast(inputs[param->in_idx++]), // Gamma ++ reinterpret_cast(inputs[param->in_idx++]), // Beta ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps2); ++ out_buf = tmp_out1; ++ } ++ } ++ else { ++ if (std::is_same::value || param->ffn_fp16 == false) { ++ invokeAddBiasResidual(reinterpret_cast(tmp_out), ++ attn_out, ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias ++ h_token_num, ++ param->hidden_size, ++ param->stream); ++ } ++ else { ++ invokeAddBiasResidualCast(reinterpret_cast(tmp_out), ++ reinterpret_cast(attn_out), ++ reinterpret_cast(tmp_out1), ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias ++ h_token_num, ++ param->hidden_size, ++ param->stream); ++ } ++ } ++ if (param->padding_offset != nullptr) { ++ cudaMemsetAsync(output[0], ++ 0, ++ param->batch_size * param->src_seq_len * param->head_size * param->head_num * sizeof(T), ++ param->stream); ++ invokeRebuildPadding( ++ (T*)output[0], out_buf, param->padding_offset, h_token_num, param->hidden_size, param->stream); ++ } ++ return; ++} ++ ++template void ++forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, encoderParamT* param, void* ws); ++template void ++forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, encoderParamT* param, void* ws); ++ ++template ++void forward_attn(T* inputs[], int in_len, T* output[], int out_len, encoderParamT* param, void* ws) ++{ ++ param->in_idx = 0; ++ auto extra_tmp_size = ++ ALIGN(param->batch_size * param->head_num * param->head_size * param->tgt_seq_len, ALIGN_SIZE); ++ size_t size_q = ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); ++ size_t q_buf_2_len = size_q; ++ size_t qk_buf_len = ++ ALIGN(param->batch_size * param->head_num * param->src_seq_len * param->tgt_seq_len, ALIGN_SIZE); ++ size_t qkv_buf_2_len = ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); ++ T* q_buf_2 = (T*)ws; ++ T* output1 = static_cast(ws) + q_buf_2_len; ++ T* output2 = static_cast(output1) + extra_tmp_size; ++ T* qkv_buf = static_cast(output2) + extra_tmp_size; ++ T* qk_buf = qkv_buf; ++ T* qkv_buf_2 = q_buf_2; ++ T* qkv_buf_3 = qk_buf; ++ int gemm_dims[] = {3 * (int)param->hidden_size, (int)param->h_token_num, (int)param->hidden_size}; ++ int gemm_lds[] = {3 * (int)param->hidden_size, (int)param->hidden_size, 3 * (int)param->hidden_size}; ++ T* from_tensor = reinterpret_cast(inputs[param->in_idx++]); ++ cublasOperation_t gemm_ops[] = {CUBLAS_OP_N, CUBLAS_OP_N}; ++ cudaDataType gemm_data_types[] = {CUDA_R_32F, CUDA_R_32F, CUDA_R_32F}; ++ if (std::is_same::value) { ++ gemm_data_types[0] = CUDA_R_16F; ++ gemm_data_types[1] = CUDA_R_16F; ++ gemm_data_types[2] = CUDA_R_16F; ++ } ++ T alpha = 1.0f; ++ T beta = 0.0f; ++ ++ if (param->is_cross) { ++ gemm_dims[0] = param->hidden_size; ++ gemm_dims[1] = param->batch_size * param->src_seq_len; ++ gemm_dims[2] = param->hidden_size; ++ gemm_lds[0] = param->hidden_size; ++ gemm_lds[1] = param->hidden_size; ++ 
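// Cross attention: Q is projected here from `from_tensor` (src_seq_len tokens) with a
// [hidden, hidden] weight; K and V are produced just below from the encoder output
// (tgt_seq_len tokens) in one fused GEMM against a [hidden, 2 * hidden] weight, which is
// why the dims switch to 2 * hidden_size for that second call.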
gemm_lds[2] = param->hidden_size; ++ T* encoder_output = reinterpret_cast(inputs[param->in_idx++]); ++ T* weight_q = reinterpret_cast(inputs[param->in_idx++]); ++ ++ CublasGemmWrapper(weight_q, ++ from_tensor, ++ qkv_buf, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_data_types, ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++ gemm_dims[0] = 2 * param->hidden_size; ++ gemm_dims[1] = param->batch_size * param->tgt_seq_len; ++ gemm_lds[0] = 2 * param->hidden_size; ++ gemm_lds[2] = 2 * param->hidden_size; ++ T* weight_kv = reinterpret_cast(inputs[param->in_idx++]); ++ ++ CublasGemmWrapper(weight_kv, ++ encoder_output, ++ qkv_buf + (param->batch_size * param->src_seq_len) * param->hidden_size, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_data_types, ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++ ++ T* bias_qkv = (param->qkv_bias) ? reinterpret_cast(inputs[param->in_idx++]) : nullptr; ++ invokeCrossAddFusedQKVBiasTranspose(q_buf_2, ++ output1, ++ output2, ++ qkv_buf, ++ bias_qkv, ++ param->batch_size, ++ param->src_seq_len, ++ param->tgt_seq_len, ++ param->head_num, ++ param->head_size, ++ param->stream); ++ } ++ else { ++ T* weight_qkv = reinterpret_cast(inputs[param->in_idx++]); ++ CublasGemmWrapper(weight_qkv, ++ from_tensor, ++ qkv_buf, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ const_cast(gemm_data_types), ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++ ++ T* bias_qkv = (param->qkv_bias) ? reinterpret_cast(inputs[param->in_idx++]) : nullptr; ++ if (param->padding_offset == nullptr) { ++ invokeAddFusedQKVBiasTranspose(static_cast(q_buf_2), ++ static_cast(output1), ++ static_cast(output2), ++ static_cast(qkv_buf), ++ bias_qkv, ++ param->batch_size, ++ param->src_seq_len, ++ param->head_num, ++ param->head_size, ++ 0, ++ param->stream); ++ } ++ else { ++ invokeAddFusedZP_QKVBiasTranspose(static_cast(q_buf_2), ++ static_cast(output1), ++ static_cast(output2), ++ static_cast(qkv_buf), ++ bias_qkv, ++ param->batch_size, ++ param->src_seq_len, ++ param->head_num, ++ param->head_size, ++ param->h_token_num, ++ param->padding_offset, ++ param->stream); ++ } ++ } ++ gemm_ops[0] = CUBLAS_OP_T; ++ gemm_ops[1] = CUBLAS_OP_N; ++ gemm_dims[0] = param->tgt_seq_len; ++ gemm_dims[1] = param->src_seq_len; ++ gemm_dims[2] = param->head_size; ++ ++ gemm_lds[0] = param->head_size; ++ gemm_lds[1] = param->head_size; ++ gemm_lds[2] = param->tgt_seq_len; ++ ++ int gemm_strides[] = {(int)(param->tgt_seq_len * param->head_size), ++ (int)(param->src_seq_len * param->head_size), ++ (int)(param->src_seq_len * param->tgt_seq_len)}; ++ ++ CublasGemmStridedBatchedWrapper(output1, ++ q_buf_2, ++ qk_buf, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_strides, ++ const_cast(gemm_data_types), ++ &alpha, ++ &beta, ++ param->batch_size * param->head_num, ++ param->cublas_handle, ++ param->algo); ++ ++ T* attention_mask = reinterpret_cast(inputs[param->in_idx++]); ++ if (param->padding_offset != nullptr) ++ invokeBuildEncoderAttentionMask( ++ attention_mask, param->d_sequence_length, param->batch_size, param->src_seq_len, param->stream); ++ T* position_bias = nullptr; ++ if (param->position_bias) { ++ position_bias = reinterpret_cast(inputs[param->in_idx++]); ++ } ++ T scalar = static_cast(1.0f / sqrtf(param->head_size * 1.0f)); ++ invokeMixMaskedSoftMax(static_cast(qk_buf), ++ attention_mask, ++ position_bias, ++ param->batch_size, ++ param->src_seq_len, ++ param->tgt_seq_len, ++ param->head_num, ++ scalar, ++ param->stream); ++ ++ gemm_ops[0] = CUBLAS_OP_N; ++ 
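// When the batch was compacted in forwardEncoder() (the "effective transformer" path),
// param->padding_offset maps each kept token back to its slot in the padded layout; it is
// consumed below by invokeTransposeAttentionOutRemovePadding. A host-side sketch of the
// offset computation behind invokeGetPaddingOffset (illustrative helper only, assuming
// padding_offset[i] = number of pad slots preceding the i-th kept token):
auto sketch_padding_offsets = [](const int* seq_lens, int batch, int max_seq_len, int* offsets) {
    int kept = 0;         // running count of valid tokens; equals h_token_num at the end
    int pads_so_far = 0;  // pad slots skipped in the padded [batch, max_seq_len] layout
    for (int b = 0; b < batch; ++b) {
        for (int t = 0; t < seq_lens[b]; ++t) {
            offsets[kept++] = pads_so_far;
        }
        pads_so_far += max_seq_len - seq_lens[b];
    }
    return kept;
};
(void)sketch_padding_offsets;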
gemm_ops[1] = CUBLAS_OP_N; ++ gemm_dims[0] = param->head_size; ++ gemm_dims[1] = param->src_seq_len; ++ gemm_dims[2] = param->tgt_seq_len; ++ ++ gemm_lds[0] = param->head_size; ++ gemm_lds[1] = param->tgt_seq_len; ++ gemm_lds[2] = param->head_size; ++ ++ gemm_strides[0] = param->tgt_seq_len * param->head_size; ++ gemm_strides[1] = param->src_seq_len * param->tgt_seq_len; ++ gemm_strides[2] = param->src_seq_len * param->head_size; ++ ++ CublasGemmStridedBatchedWrapper(output2, ++ qk_buf, ++ qkv_buf_2, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_strides, ++ const_cast(gemm_data_types), ++ &alpha, ++ &beta, ++ param->batch_size * param->head_num, ++ param->cublas_handle, ++ param->algo); ++ ++ if (param->padding_offset == nullptr) { ++ invokeTransposeQKV(static_cast(qkv_buf_3), ++ static_cast(qkv_buf_2), ++ param->batch_size, ++ param->src_seq_len, ++ param->head_num, ++ param->head_size, ++ param->stream); ++ } ++ else { ++ invokeTransposeAttentionOutRemovePadding(qkv_buf_2, ++ qkv_buf_3, ++ param->h_token_num, ++ param->batch_size, ++ param->src_seq_len, ++ param->head_num, ++ param->head_size, ++ param->padding_offset, ++ param->stream); ++ } ++ gemm_ops[0] = CUBLAS_OP_N; ++ gemm_ops[1] = CUBLAS_OP_N; ++ gemm_dims[0] = param->hidden_size; ++ gemm_dims[1] = param->h_token_num; ++ gemm_dims[2] = param->hidden_size; ++ ++ gemm_lds[0] = param->hidden_size; ++ gemm_lds[1] = param->hidden_size; ++ gemm_lds[2] = param->hidden_size; ++ CublasGemmWrapper(reinterpret_cast(inputs[param->in_idx++]), ++ qkv_buf_3, ++ static_cast(output[0]), ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ const_cast(gemm_data_types), ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++ ++ if (param->projection_bias) { ++ int len = param->h_token_num; ++ invokeAddBias( ++ static_cast(output[0]), (const T*)(inputs[param->in_idx++]), len, param->hidden_size, param->stream); ++ } ++ return; ++} ++ ++template void ++forward_attn(float* inputs[], int in_len, float* output[], int out_len, encoderParamT* param, void* ws); ++template void ++forward_attn(half* inputs[], int in_len, half* output[], int out_len, encoderParamT* param, void* ws); ++ ++template void ++forward_ffn(float* inputs[], int in_len, float* output[], int out_len, encoderParamT* param, void* ws); ++template void ++forward_ffn(half* inputs[], int in_len, half* output[], int out_len, encoderParamT* param, void* ws); ++template void ++forward_ffn(float* inputs[], int in_len, float* output[], int out_len, encoderParamT* param, void* ws); ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/encoder_layers/encoder.h b/src/fastertransformer/layers/encoder_layers/encoder.h +new file mode 100644 +index 0000000..2ae0ad3 +--- /dev/null ++++ b/src/fastertransformer/layers/encoder_layers/encoder.h +@@ -0,0 +1,50 @@ ++#pragma once ++ ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/layers/encoder_layers/BaseEncoderLayer.h" ++#include ++#include ++ ++namespace fastertransformer { ++ ++typedef struct { ++ size_t batch_size; ++ size_t src_seq_len; ++ size_t tgt_seq_len; ++ size_t head_num; ++ size_t head_size; ++ size_t hidden_size; ++ size_t h_token_num; ++ size_t ffn_hidden_size; // 4 * param->hidden_size; ++ bool ffn_fp16; ++ float eps1; ++ float eps2; ++ // handle ++ cublasHandle_t cublas_handle; ++ cudaStream_t stream; ++ cublasGemmAlgo_t algo; ++ // ctrls ++ int in_idx; ++ bool qkv_bias; // true ++ bool projection_bias; // true ++ bool is_cross; // false ++ bool position_bias; // false 
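// Control-flow notes: in_idx is a cursor into the flat inputs[] array that forwardEncoder() and
// forward_attn() advance as they consume tensors, which lets the same entry points accept the
// different input packings built in MSELayer::forward() (with/without biases, position bias,
// cross attention). padding_offset is reset to nullptr on every call and only set when
// forwardEncoder() compacts the batch (the eft / "effective transformer" path);
// d_sequence_length points at the per-batch valid lengths computed in the workspace.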
++ bool layernorm_post; // dont care ++ bool eft; // false - effective fast trn ++ int *padding_offset; ++ int *d_sequence_length; ++} encoderParamT; ++void CublasGemmWrapper(const void* a_addr, const void* b_addr, void* c_addr, const int* params, const int* lds, const cublasOperation_t* operations, const cudaDataType* data_types, void* alpha, void* beta, cublasHandle_t cublas_handle, cublasGemmAlgo_t algo); ++void CublasGemmStridedBatchedWrapper(const void* a_addr, const void* b_addr, void* c_addr, const int* params, const int* lds, const cublasOperation_t* operations, const int* strides, const cudaDataType* data_types, void* alpha, void* beta, int batch, cublasHandle_t cublas_handle, cublasGemmAlgo_t algo); ++template ++size_t GetEncoderLayerWorkspaceSize(encoderParamT* param); ++ ++template ++size_t GetAttnWorkspaceSize(encoderParamT* param); ++template ++void forward_attn(T* inputs[], int in_len, T* output[], int out_len, encoderParamT* param, void* ws); ++template ++void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, encoderParamT* param, void* ws); ++// void forwardEncoder(std::vector > const* ++// inputs); ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/CMakeLists.txt b/src/fastertransformer/layers/ms_layers/CMakeLists.txt +new file mode 100644 +index 0000000..36abaf8 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/CMakeLists.txt +@@ -0,0 +1,21 @@ ++# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++ ++cmake_minimum_required(VERSION 3.8) ++ ++add_library(MSLayer STATIC MSDecoderLayer.cc MSEncoderLayer.cc MSAttentionLayer.cc decoder.cc encoder.cc ffn.cc gemm.cc attention.cc) ++set_property(TARGET MSLayer PROPERTY POSITION_INDEPENDENT_CODE ON) ++set_property(TARGET MSLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) ++target_link_libraries(MSLayer PUBLIC -lcublas -lcudart unfused_attention_kernels activation_kernels ++ layernorm_kernels add_residual_kernels bert_preprocess_kernels) +diff --git a/src/fastertransformer/layers/ms_layers/MSAttentionLayer.cc b/src/fastertransformer/layers/ms_layers/MSAttentionLayer.cc +new file mode 100755 +index 0000000..97daa1b +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/MSAttentionLayer.cc +@@ -0,0 +1,171 @@ ++/* ++ * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. ++ * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#include "src/fastertransformer/layers/ms_layers/MSAttentionLayer.h" ++ ++namespace fastertransformer { ++ ++template ++static void printTensor(char* str, T* input, int size) ++{ ++ printf("%s ", str); ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ std::cout << input_host[k] << ","; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ } ++ ++ std::cout << std::endl; ++ ++ free(input_host); ++} ++ ++template ++MSMHALayer::MSMHALayer(size_t max_batch_size, ++ size_t max_src_seq_len, ++ size_t max_tgt_seq_len, ++ size_t head_num, ++ size_t size_per_head, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ IAllocator* allocator, ++ bool is_free_buffer_after_forward, ++ bool is_qk_buf_float, ++ bool is_cross, ++ bool sparse, ++ bool is_position_bias): ++ MSBaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse) ++{ ++ cublasHandle_t cublas_handle; ++ cublasCreate(&cublas_handle); ++ cublasSetStream(cublas_handle, stream); ++ ++ params_.batch_size = max_batch_size; ++ params_.src_seq_len = max_src_seq_len; ++ params_.tgt_seq_len = max_tgt_seq_len; ++ params_.head_num = head_num; ++ params_.head_size = size_per_head; ++ params_.hidden_size = head_num * size_per_head; ++ params_.cublas_handle = cublas_handle; ++ params_.stream = stream; ++ // ctrls ++ params_.in_idx = 0; ++ params_.qkv_bias = !is_position_bias; ++ params_.projection_bias = !is_position_bias; ++ params_.is_cross = is_cross; ++ params_.position_bias = is_position_bias; ++ params_.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; ++} ++template ++void MSMHALayer::allocateBuffer() ++{ ++ if (buf_ == nullptr) { ++ size_t buff_size = fastertransformer::GetAttnWorkspaceSize(¶ms_); ++ buf_ = reinterpret_cast(allocator_->reMalloc(buf_, buff_size, true)); ++ } ++} ++template ++void MSMHALayer::forward(std::vector* output_tensors, ++ const std::vector* input_tensors, ++ const MSLayerWeight* weights) ++{ ++ const AttentionLayerWeight* attention_weights = dynamic_cast*>(weights); ++ if(attention_weights == NULL){ ++ std::cout<<"cast EncoderLayerWeight not sucsses"; ++ } ++ allocateBuffer(); // only once ++ if (params_.position_bias) ++ if (params_.is_cross) { ++ void* outputs[] = {(void*)output_tensors->at(0).data}; ++ void* inputs[] = {(void*)input_tensors->at(0).data, ++ (void*)input_tensors->at(1).data, ++ (void*)attention_weights->query_weight.kernel, ++ (void*)attention_weights->key_weight.kernel, ++ (void*)input_tensors->at(2).data, ++ (void*)input_tensors->at(3).data, ++ (void*)attention_weights->attention_output_weight.kernel}; ++ fastertransformer::forward_attn((T**)inputs, 7, (T**)outputs, 1, ¶ms_, (void*)buf_); ++ } ++ else { ++ void* outputs[] = {(void*)output_tensors->at(0).data}; ++ void* inputs[] = { ++ (void*)input_tensors->at(0).data, ++ (void*)attention_weights->query_weight.kernel, ++ (void*)input_tensors->at(1).data, ++ (void*)input_tensors->at(2).data, ++ (void*)attention_weights->attention_output_weight.kernel ++ }; ++ fastertransformer::forward_attn((T**)inputs, 5, (T**)outputs, 1, ¶ms_, (void*)buf_); ++ } ++ else { ++ if (params_.is_cross) { ++ void* outputs[] = {(void*)output_tensors->at(0).data}; ++ void* inputs[] = {(void*)input_tensors->at(0).data, ++ (void*)input_tensors->at(1).data, ++ 
(void*)attention_weights->query_weight.kernel, ++ (void*)attention_weights->key_weight.kernel, ++ (void*)attention_weights->query_weight.bias, ++ (void*)input_tensors->at(2).data, ++ (void*)attention_weights->attention_output_weight.kernel, ++ (void*)attention_weights->attention_output_weight.bias ++ }; ++ fastertransformer::forward_attn((T**)inputs, 8, (T**)outputs, 1, ¶ms_, (void*)buf_); ++ } ++ else { ++ void* outputs[] = {(void*)output_tensors->at(0).data}; ++ void* inputs[] = {(void*)input_tensors->at(0).data, ++ (void*)attention_weights->query_weight.kernel, ++ (void*)attention_weights->query_weight.bias, ++ (void*)input_tensors->at(1).data, ++ (void*)attention_weights->attention_output_weight.kernel, ++ (void*)attention_weights->attention_output_weight.bias}; ++ fastertransformer::forward_attn((T**)inputs, 6, (T**)outputs, 1, ¶ms_, (void*)buf_); ++ } ++ } ++} ++ ++ template ++ MSMHALayer::~MSMHALayer() ++ { ++ cublas_wrapper_ = nullptr; ++ freeBuffer(); ++ } ++ ++ template ++ void MSMHALayer::freeBuffer() ++ { ++ if (buf_ != nullptr) { ++ allocator_->free(buf_); ++ buf_ = nullptr; ++ } ++ } ++ ++ template class MSMHALayer; ++ template class MSMHALayer; ++ template class MSMHALayer; ++ template class MSMHALayer; ++ template class MSMHALayer; ++ template class MSMHALayer; ++ template class MSMHALayer; ++ template class MSMHALayer; ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/MSAttentionLayer.h b/src/fastertransformer/layers/ms_layers/MSAttentionLayer.h +new file mode 100755 +index 0000000..e448fb4 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/MSAttentionLayer.h +@@ -0,0 +1,63 @@ ++/* ++ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. ++ * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
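[Editorial sketch, not part of the patch] MSMHALayer::forward above flattens activations and weights into a single void* array whose layout depends on is_cross and position_bias, and forward_attn later consumes that array in order via param->in_idx. Below is a hedged illustration of the caller-side packing for the self-attention path with QKV and projection biases and no position bias (six pointers, in the order used above); DummyWeights and all pointers are placeholders, not the patch's types.

// Sketch of the 6-pointer input layout consumed by forward_attn for
// self-attention with biases and no position bias; order taken from
// MSMHALayer::forward above, names are placeholders.
#include <vector>

struct DummyWeights {            // stand-in for the AttentionLayerWeight fields used below
  void* qkv_kernel; void* qkv_bias;
  void* proj_kernel; void* proj_bias;
};

std::vector<void*> PackSelfAttentionInputs(void* from_tensor, void* attention_mask,
                                           const DummyWeights& w) {
  return {from_tensor,     // input hidden states
          w.qkv_kernel,    // fused QKV weight (query_weight.kernel)
          w.qkv_bias,      // fused QKV bias   (query_weight.bias)
          attention_mask,  // attention mask
          w.proj_kernel,   // output projection weight
          w.proj_bias};    // output projection bias
}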
++ */ ++ ++#pragma once ++ ++#include "src/fastertransformer/layers/ms_layers/MSBaseLayer.h" ++#include "src/fastertransformer/layers/ms_layers/attention.h" ++namespace fastertransformer { ++ ++// TODO(haim): Add template according to "mix" compute type (fp32, fp16) ++template ++class MSMHALayer: public MSBaseLayer { ++private: ++ void allocateBuffer() override; ++ void freeBuffer() override; ++ ++ using MSBaseLayer::is_free_buffer_after_forward_; ++ using MSBaseLayer::is_allocate_buffer_; ++ using MSBaseLayer::cublas_wrapper_; ++ using MSBaseLayer::allocator_; ++ ++protected: ++ using MSBaseLayer::stream_; ++ using MSBaseLayer::sparse_; ++ T* buf_ = nullptr; ++ attentionParamT params_; ++ ++public: ++ MSMHALayer(size_t batch_size, ++ size_t src_seq_len, ++ size_t tgt_seq_len, ++ size_t head_num, ++ size_t size_per_head, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ IAllocator* allocator, ++ bool is_free_buffer_after_forward, ++ bool is_qk_buf_float, ++ bool is_cross, ++ bool sparse = false, ++ bool is_position_bias=false); ++ MSMHALayer(MSMHALayer const& attention_layer); ++ virtual ~MSMHALayer(); ++ void forward(std::vector* output_tensors, ++ const std::vector* input_tensors, ++ const MSLayerWeight* weights) override; ++}; ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/MSBaseLayer.h b/src/fastertransformer/layers/ms_layers/MSBaseLayer.h +new file mode 100644 +index 0000000..4056480 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/MSBaseLayer.h +@@ -0,0 +1,76 @@ ++/* ++ * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#pragma once ++ ++#include ++#include ++ ++#include "3rdparty/trt_fused_multihead_attention/fused_multihead_attention_common.h" ++#include "src/fastertransformer/layers/BaseLayer.h" ++#include "src/fastertransformer/utils/Tensor.h" ++#include "src/fastertransformer/utils/allocator.h" ++#include "src/fastertransformer/utils/cublasMMWrapper.h" ++#include "src/fastertransformer/utils/memory_utils.h" ++#include "src/fastertransformer/layers/ms_layers/MSLayerWeight.h" ++ ++namespace fastertransformer { ++ ++enum class MSLayerType { ++ UNFUSED_MS_LAYER, ++ FUSED_MS_LAYER ++}; ++ ++template ++MSLayerType getMSLayerType(size_t size_per_head, const int sm, const bool remove_padding, ++ const int max_seq_len, const bool is_fuse = true) { ++ if (std::is_same::value && (sm == kSM_70 || sm == kSM_86 || sm == kSM_80 || sm == kSM_75 || sm == kSM_72) ++ && size_per_head == 64 && max_seq_len <= 384 && is_fuse == true) { ++ return remove_padding ? MSLayerType::FUSED_MS_LAYER : MSLayerType::FUSED_MS_LAYER; ++ } else { ++ return remove_padding ? 
MSLayerType::FUSED_MS_LAYER : MSLayerType::FUSED_MS_LAYER; ++ } ++} ++ ++template ++MSLayerType getMSLayerTypeINT8(size_t size_per_head, const int sm, const bool remove_padding, ++ const int max_seq_len, const int int8_mode) { ++ if ((int8_mode == 1 || int8_mode == 2) && (sm == kSM_86 || sm == kSM_80 || sm == kSM_75) && size_per_head == 64 ++ && max_seq_len <= 384) { ++ return remove_padding ? MSLayerType::FUSED_MS_LAYER : MSLayerType::FUSED_MS_LAYER; ++ } else { ++ return remove_padding ? MSLayerType::FUSED_MS_LAYER : MSLayerType::FUSED_MS_LAYER; ++ } ++} ++ ++template ++class MSBaseLayer: public BaseLayer { ++ ++public: ++ virtual void forward(std::vector* output_tensors, ++ const std::vector* input_tensors, ++ const MSLayerWeight* layer_weights) = 0; ++ MSBaseLayer(cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ IAllocator* allocator, ++ bool is_free_buffer_after_forward, ++ bool sparse = false): ++ BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, nullptr, sparse) ++ { ++ } ++ virtual ~MSBaseLayer() = default; ++}; ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/MSDecoderLayer.cc b/src/fastertransformer/layers/ms_layers/MSDecoderLayer.cc +new file mode 100644 +index 0000000..2198115 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/MSDecoderLayer.cc +@@ -0,0 +1,210 @@ ++/* ++ * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. ++ * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#include "src/fastertransformer/layers/ms_layers/MSDecoderLayer.h" ++ ++namespace fastertransformer { ++template ++void printTensor(char* str, T* input, int size) ++{ ++ printf("%s ", str); ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ ++ std::cout << input_host[k] << ","; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ } ++ ++ std::cout << std::endl; ++ ++ free(input_host); ++} ++template ++MSDLayer::MSDLayer(size_t max_batch_size, ++ size_t max_src_seq_len, ++ size_t max_tgt_seq_len, ++ size_t head_num, ++ size_t size_per_head, ++ size_t ffn_hidden_size, ++ float eps1, ++ float eps2, ++ float eps3, ++ bool post_layernorm, ++ bool position_bias1, ++ bool position_bias2, ++ bool is_ffn_fp16, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ cublasHandle_t* cublas_handle, ++ IAllocator* allocator, ++ bool is_free_buffer_after_forward, ++ bool is_qk_buf_float, ++ bool sparse): ++ ++ MSBaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse), buf_(nullptr) ++{ ++ params_.batch_size = max_batch_size; ++ params_.src_seq_len = max_src_seq_len; ++ params_.tgt_seq_len = max_tgt_seq_len; ++ params_.head_num = head_num; ++ params_.head_size = size_per_head; ++ params_.hidden_size = head_num * size_per_head; ++ params_.ffn_hidden_size = ffn_hidden_size; ++ params_.eps1 = eps1; ++ params_.eps2 = eps2; ++ params_.eps3 = eps3; ++ params_.layernorm_post = post_layernorm; ++ // handle ++ params_.cublas_handle = *cublas_handle; ++ params_.stream = stream; ++ params_.ffn_fp16 = is_ffn_fp16; ++ // ctrls ++ params_.in_idx = 0; ++ params_.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; ++ params_.attn1.in_idx = 0; ++ params_.attn1.batch_size = max_batch_size; ++ params_.attn1.src_seq_len = max_src_seq_len; ++ params_.attn1.tgt_seq_len = max_tgt_seq_len; ++ params_.attn1.head_num = head_num; ++ params_.attn1.head_size = size_per_head; ++ params_.attn1.hidden_size = head_num * size_per_head; ++ params_.attn1.qkv_bias = true; ++ params_.attn1.projection_bias = true; ++ params_.attn1.is_cross = false; ++ params_.attn1.position_bias = false; ++ params_.attn1.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; ++ params_.attn1.cublas_handle = *cublas_handle; ++ params_.attn1.stream = stream; ++ ++ params_.attn2.in_idx = 0; ++ params_.attn2.batch_size = max_batch_size; ++ params_.attn2.src_seq_len = max_src_seq_len; ++ params_.attn2.tgt_seq_len = max_tgt_seq_len; ++ params_.attn2.head_num = head_num; ++ params_.attn2.head_size = size_per_head; ++ params_.attn2.hidden_size = head_num * size_per_head; ++ params_.attn2.qkv_bias = true; ++ params_.attn2.projection_bias = true; ++ params_.attn2.is_cross = true; ++ params_.attn2.position_bias = false; ++ params_.attn2.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; ++ params_.attn2.cublas_handle = *cublas_handle; ++ params_.attn2.stream = stream; ++ ++} ++ ++template ++void MSDLayer::allocateBuffer() ++{ ++ if (buf_ == nullptr) { ++ size_t buff_size = fastertransformer::GetDecoderLayerWorkspaceSize(¶ms_); ++ buf_ = reinterpret_cast(allocator_->reMalloc(buf_, buff_size, true)); ++ } ++} ++ ++template ++void MSDLayer::freeBuffer() ++{ ++ if (buf_ != nullptr) { ++ allocator_->free(buf_); ++ buf_ = nullptr; ++ } ++} ++ ++template ++MSDLayer::~MSDLayer() ++{ ++ cublas_wrapper_ = nullptr; ++ freeBuffer(); ++} ++ ++template ++void MSDLayer::forward(std::vector* output_tensors, ++ const std::vector* input_tensors, ++ 
const MSLayerWeight* weights) ++{ ++ const DecoderLayerWeight* decoder_weights = dynamic_cast*>(weights); ++ if(weights == NULL){ ++ std::cout<<"cast EncoderLayerWeight not sucsses"; ++ return ;} ++ allocateBuffer(); // only once ++ void* outputs[] = {(void*)output_tensors->at(0).data}; ++ // std::cout<qkv_bias<< params_.attn2->qkv_bias<< !params_.attn1->position_bias<< !params_.attn2->position_bias<at(0).data, ++ (void*)decoder_weights->layernorm1.gamma, ++ (void*)decoder_weights->layernorm1.beta, ++ (void*)decoder_weights->attention.query_weight.kernel, ++ (void*)decoder_weights->attention.query_weight.bias, ++ (void*)input_tensors->at(1).data, ++ (void*)decoder_weights->attention.attention_output_weight.kernel, ++ (void*)decoder_weights->attention.attention_output_weight.bias, ++ (void*)decoder_weights->layernorm2.gamma, ++ (void*)decoder_weights->layernorm2.beta, ++ (void*)input_tensors->at(2).data, ++ (void*)decoder_weights->cross_attention.query_weight.kernel, ++ (void*)decoder_weights->cross_attention.key_weight.kernel, ++ (void*)decoder_weights->cross_attention.query_weight.bias, ++ (void*)input_tensors->at(3).data, ++ (void*)decoder_weights->cross_attention.attention_output_weight.kernel, ++ (void*)decoder_weights->cross_attention.attention_output_weight.bias, ++ (void*)decoder_weights->layernorm3.gamma, ++ (void*)decoder_weights->layernorm3.beta, ++ (void*)decoder_weights->decoder_output_mapping.kernel, ++ (void*)decoder_weights->decoder_output_mapping.bias, ++ (void*)decoder_weights->decoder_output_projection.kernel, ++ (void*)decoder_weights->decoder_output_projection.bias}; ++ fastertransformer::forwardDecoder(inputs, 23, outputs, 1, ¶ms_, buf_); ++ } ++ if (params_.attn1.position_bias && params_.attn2.position_bias) { ++ void* inputs[] = {(void*)input_tensors->at(0).data, ++ (void*)decoder_weights->layernorm1.gamma, ++ (void*)decoder_weights->attention.query_weight.kernel, ++ (void*)input_tensors->at(4).data, ++ (void*)input_tensors->at(1).data, ++ (void*)decoder_weights->attention.attention_output_weight.kernel, ++ (void*)decoder_weights->layernorm2.gamma, ++ (void*)input_tensors->at(2).data, ++ (void*)decoder_weights->cross_attention.query_weight.kernel, ++ (void*)decoder_weights->cross_attention.key_weight.kernel, ++ (void*)input_tensors->at(5).data, ++ (void*)input_tensors->at(3).data, ++ (void*)decoder_weights->cross_attention.attention_output_weight.kernel, ++ (void*)decoder_weights->layernorm3.gamma, ++ (void*)decoder_weights->decoder_output_mapping.kernel, ++ (void*)decoder_weights->decoder_output_projection.kernel}; ++ // fastertransformer::forwardDecoder(inputs, 23, outputs, 1, ¶ms_, buf_); ++ } ++ else{} ++ return; ++} ++ ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/MSDecoderLayer.h b/src/fastertransformer/layers/ms_layers/MSDecoderLayer.h +new file mode 100644 +index 0000000..53d7675 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/MSDecoderLayer.h +@@ -0,0 +1,74 @@ ++/* ++ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. ++ * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. 
++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#pragma once ++ ++#include "src/fastertransformer/layers/ms_layers/MSBaseLayer.h" ++#include "src/fastertransformer/layers/ms_layers/decoder.h" ++ ++namespace fastertransformer { ++ ++// TODO(haim): Add template according to "mix" compute type (fp32, fp16) ++template ++class MSDLayer: public MSBaseLayer { ++private: ++ mutable decoderParamT params_; ++ ++ void allocateBuffer() override; ++ void freeBuffer() override; ++ void* buf_; ++ using MSBaseLayer::is_free_buffer_after_forward_; ++ using MSBaseLayer::is_allocate_buffer_; ++ using MSBaseLayer::cublas_wrapper_; ++ using MSBaseLayer::allocator_; ++ ++protected: ++ using MSBaseLayer::stream_; ++ using MSBaseLayer::sparse_; ++ ++public: ++ MSDLayer(size_t max_batch_size, ++ size_t max_src_seq_len, ++ size_t max_tgt_seq_len, ++ size_t head_num, ++ size_t size_per_head, ++ size_t ffn_hidden_size, ++ float eps1, ++ float eps2, ++ float eps3, ++ bool post_layernorm, ++ bool position_bias1, ++ bool position_bias2, ++ bool is_ffn_fp16, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ cublasHandle_t* cublas_handle, ++ IAllocator* allocator, ++ bool is_free_buffer_after_forward, ++ bool is_qk_buf_float, ++ bool sparse); ++ ++ MSDLayer(MSDLayer const& decoder_layer); ++ ++ virtual ~MSDLayer(); ++ ++ void forward(std::vector* output_tensors, ++ const std::vector* input_tensors, ++ const MSLayerWeight* weights) override; ++}; ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/MSEncoderLayer.cc b/src/fastertransformer/layers/ms_layers/MSEncoderLayer.cc +new file mode 100644 +index 0000000..12b4657 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/MSEncoderLayer.cc +@@ -0,0 +1,215 @@ ++/* ++ * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. ++ * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#include "src/fastertransformer/layers/ms_layers/MSEncoderLayer.h" ++ ++namespace fastertransformer { ++template ++void printTensor(char* str, T* input, int size) ++{ ++ printf("%s ", str); ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ ++ std::cout << input_host[k] << ","; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ } ++ ++ std::cout << std::endl; ++ ++ free(input_host); ++} ++template ++MSELayer::MSELayer(size_t max_batch_size, ++ size_t max_src_seq_len, ++ size_t max_tgt_seq_len, ++ size_t head_num, ++ size_t size_per_head, ++ size_t ffn_hidden_size, ++ float eps1, ++ float eps2, ++ bool post_layernorm, ++ bool position_bias, ++ bool is_ffn_fp16, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ cublasHandle_t* cublas_handle, ++ IAllocator* allocator, ++ bool is_free_buffer_after_forward, ++ bool is_qk_buf_float, ++ bool sparse): ++ ++ MSBaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse), buf_(nullptr) ++{ ++ params_.batch_size = max_batch_size; ++ params_.src_seq_len = max_src_seq_len; ++ params_.tgt_seq_len = max_tgt_seq_len; ++ params_.head_num = head_num; ++ params_.head_size = size_per_head; ++ params_.hidden_size = head_num * size_per_head; ++ params_.ffn_hidden_size = ffn_hidden_size; ++ params_.eps1 = eps1; ++ params_.eps2 = eps2; ++ params_.layernorm_post = post_layernorm; ++ // handle ++ params_.cublas_handle = *cublas_handle; ++ params_.stream = stream; ++ params_.ffn_fp16 = is_ffn_fp16; ++ // ctrls ++ params_.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; ++ params_.has_bias=!position_bias; ++ params_.has_beta=!position_bias; ++ params_.attn.in_idx = 0; ++ params_.attn.batch_size = max_batch_size; ++ params_.attn.src_seq_len = max_src_seq_len; ++ params_.attn.tgt_seq_len = max_tgt_seq_len; ++ params_.attn.head_num = head_num; ++ params_.attn.head_size = size_per_head; ++ params_.attn.hidden_size = head_num * size_per_head; ++ params_.attn.qkv_bias = !position_bias; ++ params_.attn.projection_bias = !position_bias; ++ params_.attn.is_cross = false; ++ std::cout<<"position_bias"< ++void MSELayer::allocateBuffer() ++{ ++ if (buf_ == nullptr) { ++ size_t buff_size = fastertransformer::GetEncoderLayerWorkspaceSize(¶ms_); ++ buf_ = reinterpret_cast(allocator_->reMalloc(buf_, sizeof(T) * buff_size, true)); ++ } ++} ++ ++template ++void MSELayer::freeBuffer() ++{ ++ if (buf_ != nullptr) { ++ allocator_->free(buf_); ++ buf_ = nullptr; ++ } ++} ++ ++template ++MSELayer::~MSELayer() ++{ ++ cublas_wrapper_ = nullptr; ++ freeBuffer(); ++} ++ ++template ++void MSELayer::forward(std::vector* output_tensors, ++ const std::vector* input_tensors, ++ const MSLayerWeight* weights) ++{ ++ const EncoderLayerWeight* encoder_weights = dynamic_cast*>(weights); ++ // EncoderLayerWeight* encoder_weights = dynamic_cast*>(const_cast*>(weights)); ++ // const EncoderLayerWeight* encoder_weights = dynamic_cast*>(weights); ++ if(encoder_weights == NULL){ ++ std::cout<<"cast EncoderLayerWeight not sucsses"; ++ return ;} ++ allocateBuffer(); // only once ++ void* outputs[] = {(void*)output_tensors->at(0).data}; ++ if (!params_.layernorm_post) { ++ if (params_.attn.position_bias) { ++ void* inputs[] = { ++ (void*)input_tensors->at(0).data, ++ (void*)encoder_weights->layernorm1.gamma, ++ (void*)encoder_weights->attention.query_weight.kernel, ++ (void*)input_tensors->at(1).data, ++ (void*)input_tensors->at(2).data, 
++ (void*)encoder_weights->attention.attention_output_weight.kernel, ++ (void*)encoder_weights->layernorm2.gamma, ++ (void*)encoder_weights->encoder_output_mapping.kernel, ++ (void*)encoder_weights->encoder_output_projection.kernel ++ ++ }; ++ forwardEncoder(inputs, 9, outputs, 1, ¶ms_, buf_); ++ } ++ else{ ++ void* inputs[] = {(void*)input_tensors->at(0).data, ++ (void*)encoder_weights->layernorm1.gamma, ++ (void*)encoder_weights->layernorm1.beta, ++ (void*)encoder_weights->attention.query_weight.kernel, ++ (void*)encoder_weights->attention.query_weight.bias, ++ (void*)input_tensors->at(1).data, ++ (void*)encoder_weights->attention.attention_output_weight.kernel, ++ (void*)encoder_weights->attention.attention_output_weight.bias, ++ (void*)encoder_weights->layernorm2.gamma, ++ (void*)encoder_weights->layernorm2.beta, ++ (void*)encoder_weights->encoder_output_mapping.kernel, ++ (void*)encoder_weights->encoder_output_mapping.bias, ++ (void*)encoder_weights->encoder_output_projection.kernel, ++ (void*)encoder_weights->encoder_output_projection.bias}; ++ fastertransformer::forwardEncoder(inputs, 14, outputs, 1, ¶ms_, buf_); ++ } ++ } ++ else { ++ if (params_.attn.position_bias) { ++ void* inputs[] = { ++ (void*)input_tensors->at(0).data, ++ (void*)encoder_weights->attention.query_weight.kernel, ++ (void*)input_tensors->at(1).data, ++ (void*)input_tensors->at(2).data, ++ (void*)encoder_weights->attention.attention_output_weight.kernel, ++ (void*)encoder_weights->layernorm1.gamma, ++ (void*)encoder_weights->encoder_output_mapping.kernel, ++ (void*)encoder_weights->encoder_output_projection.kernel, ++ (void*)encoder_weights->layernorm2.gamma ++ }; ++ forwardEncoder(inputs, 9, outputs, 1, ¶ms_, buf_); ++ } else { ++ void* inputs[] = {(void*)input_tensors->at(0).data, ++ (void*)encoder_weights->attention.query_weight.kernel, ++ (void*)encoder_weights->attention.query_weight.bias, ++ (void*)input_tensors->at(1).data, ++ (void*)encoder_weights->attention.attention_output_weight.kernel, ++ (void*)encoder_weights->attention.attention_output_weight.bias, ++ (void*)encoder_weights->layernorm1.gamma, ++ (void*)encoder_weights->layernorm1.beta, ++ (void*)encoder_weights->encoder_output_mapping.kernel, ++ (void*)encoder_weights->encoder_output_mapping.bias, ++ (void*)encoder_weights->encoder_output_projection.kernel, ++ (void*)encoder_weights->encoder_output_projection.bias, ++ (void*)encoder_weights->layernorm2.gamma, ++ (void*)encoder_weights->layernorm2.beta}; ++ fastertransformer::forwardEncoder(inputs, 3, outputs, 1, ¶ms_, buf_); ++ } ++ } ++ ++ return; ++} ++ ++template class MSELayer; ++template class MSELayer; ++template class MSELayer; ++template class MSELayer; ++template class MSELayer; ++template class MSELayer; ++template class MSELayer; ++template class MSELayer; ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/MSEncoderLayer.h b/src/fastertransformer/layers/ms_layers/MSEncoderLayer.h +new file mode 100644 +index 0000000..95c598f +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/MSEncoderLayer.h +@@ -0,0 +1,71 @@ ++/* ++ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. ++ * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. 
++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#pragma once ++ ++#include "src/fastertransformer/layers/ms_layers/MSBaseLayer.h" ++#include "src/fastertransformer/layers/ms_layers/encoder.h" ++ ++namespace fastertransformer { ++ ++// TODO(haim): Add template according to "mix" compute type (fp32, fp16) ++template ++class MSELayer: public MSBaseLayer { ++private: ++ encoderParamT params_; ++ void allocateBuffer() override; ++ void freeBuffer() override; ++ void* buf_; ++ using MSBaseLayer::is_free_buffer_after_forward_; ++ using MSBaseLayer::is_allocate_buffer_; ++ using MSBaseLayer::cublas_wrapper_; ++ using MSBaseLayer::allocator_; ++ ++protected: ++ using MSBaseLayer::stream_; ++ using MSBaseLayer::sparse_; ++ ++public: ++ MSELayer(size_t max_batch_size, ++ size_t max_src_seq_len, ++ size_t max_tgt_seq_len, ++ size_t head_num, ++ size_t size_per_head, ++ size_t ffn_hidden_size, ++ float eps1, ++ float eps2, ++ bool post_layernorm, ++ bool position_bias, ++ bool is_ffn_fp16, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ cublasHandle_t* cublas_handle, ++ IAllocator* allocator, ++ bool is_free_buffer_after_forward, ++ bool is_qk_buf_float, ++ bool sparse); ++ ++ MSELayer(MSELayer const& encoder_layer); ++ ++ virtual ~MSELayer(); ++ ++ void forward(std::vector* output_tensors, ++ const std::vector* input_tensors, ++ const MSLayerWeight* weights) override; ++}; ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/MSLayerWeight.h b/src/fastertransformer/layers/ms_layers/MSLayerWeight.h +new file mode 100644 +index 0000000..8915136 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/MSLayerWeight.h +@@ -0,0 +1,62 @@ ++/* ++ * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#pragma once ++ ++#include "src/fastertransformer/layers/DenseWeight.h" ++#include "src/fastertransformer/kernels/layernorm_kernels.h" ++namespace fastertransformer { ++ ++template ++ struct MSLayerWeight{ ++ virtual ~MSLayerWeight() {} ++}; ++template ++struct AttentionLayerWeight:MSLayerWeight{ ++ DenseWeight query_weight; ++ DenseWeight key_weight; ++ DenseWeight value_weight; ++ DenseWeight attention_output_weight; ++}; ++template ++struct DecoderLayerWeight:MSLayerWeight{ ++ AttentionLayerWeight attention; ++ AttentionLayerWeight cross_attention; ++ // DenseWeight attention_qkv_weight; ++ // DenseWeight attention_layer_output_weight; ++ // DenseWeight attention_cross_q_weight; ++ // DenseWeight attention_cross_kv_weight; ++ // DenseWeight attention_cross_layer_output_weight; ++ DenseWeight decoder_output_mapping; ++ DenseWeight decoder_output_projection; ++ LayerNormWeight layernorm1; ++ LayerNormWeight layernorm2; ++ LayerNormWeight layernorm3; ++}; ++ ++template ++struct EncoderLayerWeight:MSLayerWeight{ ++ AttentionLayerWeight attention; ++ // DenseWeight qkv_weight; ++ // DenseWeight attention_layer_output_weight; ++ DenseWeight encoder_output_mapping; ++ DenseWeight encoder_output_projection; ++ LayerNormWeight layernorm1; ++ LayerNormWeight layernorm2; ++}; ++ ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/attention.cc b/src/fastertransformer/layers/ms_layers/attention.cc +new file mode 100644 +index 0000000..40e8d6e +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/attention.cc +@@ -0,0 +1,300 @@ ++ ++#include "src/fastertransformer/layers/ms_layers/attention.h" ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/kernels/add_residual_kernels.h" ++#include "src/fastertransformer/kernels/unfused_attention_kernels.h" ++#include ++namespace fastertransformer { ++ ++#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) ++// #define UP_DIV(x, y) (x) ++#define ALIGN_SIZE 16 ++ ++template ++void printTensor(char* str, T* input, int size) { ++ printf("%s ",str); ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ ++ std::cout << input_host[k] << ","; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ } ++ ++ std::cout << std::endl; ++ ++ free(input_host); ++} ++ ++template ++void isNan(char* str, T* input, int size) ++{ ++ std::cout << str << " " << " size is " << size; ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ if (std::isnan((float)input_host[k]) || std ::isinf((float)input_host[k])) { ++ std::cout << "found NAN or INF"; ++ break; ++ } ++ } ++ ++ std::cout << std::endl; ++ free(input_host); ++} ++ ++ ++template ++size_t GetAttnWorkspaceSize(attentionParamT* param) ++{ ++ size_t size_q = UP_DIV((param->batch_size * param->src_seq_len * param->hidden_size), ALIGN_SIZE) * ALIGN_SIZE; ++ size_t size_k = UP_DIV((param->batch_size * param->tgt_seq_len * param->hidden_size), ALIGN_SIZE) * ALIGN_SIZE; ++ size_t size_v = size_k; ++ size_t qkv_len = size_q + size_k + size_v; ++ size_t q_buf_2_len = size_q; ++ size_t qk_buf_len = ++ UP_DIV(param->batch_size * param->head_num * param->src_seq_len * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t 
qkv_buf_2_len = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t qkv_buf_3_len = qkv_buf_2_len; ++ size_t attn_out_size = ++ UP_DIV(param->batch_size * param->head_num * param->head_size * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; ++ return (qkv_len + q_buf_2_len + qk_buf_len + qkv_buf_2_len + qkv_buf_3_len + 2 * attn_out_size) * sizeof(T); ++ ++} ++ ++template size_t GetAttnWorkspaceSize(attentionParamT* param); ++template size_t GetAttnWorkspaceSize(attentionParamT* param); ++ ++template ++void forward_attn(T* inputs[], int in_len, T* output[], int out_len, attentionParamT* param, void* ws) ++{ ++ param->in_idx = 0; ++ auto extra_tmp_size = ++ UP_DIV(param->batch_size * param->head_num * param->head_size * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t size_q = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t size_k = UP_DIV(param->batch_size * param->tgt_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t size_v = size_k; ++ ++ size_t qkv_len = size_q + size_k + size_v; ++ size_t q_buf_2_len = size_q; ++ size_t qk_buf_len = ++ UP_DIV(param->batch_size * param->head_num * param->src_seq_len * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t qkv_buf_2_len = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t qkv_buf_3_len = qkv_buf_2_len; ++ auto buff_size = qkv_len + q_buf_2_len + qk_buf_len + qkv_buf_2_len + qkv_buf_3_len; ++ T* qkv_buf = (T*)ws; ++ T* q_buf_2 = static_cast(qkv_buf) + qkv_len; ++ T* qk_buf = static_cast(q_buf_2) + q_buf_2_len; ++ T* qkv_buf_2 = static_cast(qk_buf) + qk_buf_len; ++ T* qkv_buf_3 = static_cast(qkv_buf_2) + qkv_buf_2_len; ++ T* output1 = static_cast(ws) + buff_size; ++ T* output2 = static_cast(output1) + extra_tmp_size; ++ int gemm_dims[] = { ++ 3 * (int)param->hidden_size, (int)param->batch_size * (int)param->src_seq_len, (int)param->hidden_size}; ++ int gemm_lds[] = {3 * (int)param->hidden_size, (int)param->hidden_size, 3 * (int)param->hidden_size}; ++ T* from_tensor = reinterpret_cast(inputs[param->in_idx++]); ++ cublasOperation_t gemm_ops[] = {CUBLAS_OP_N, CUBLAS_OP_N}; ++ cudaDataType gemm_data_types[] = {CUDA_R_32F, CUDA_R_32F, CUDA_R_32F}; ++ if (std::is_same::value) { ++ gemm_data_types[0] = CUDA_R_16F; ++ gemm_data_types[1] = CUDA_R_16F; ++ gemm_data_types[2] = CUDA_R_16F; ++ } ++ T alpha = 1.0f; ++ T beta = 0.0f; ++ ++ if (param->is_cross) { ++ gemm_dims[0] = param->hidden_size; ++ gemm_dims[1] = param->batch_size * param->src_seq_len; ++ gemm_dims[2] = param->hidden_size; ++ gemm_lds[0] = param->hidden_size; ++ gemm_lds[1] = param->hidden_size; ++ gemm_lds[2] = param->hidden_size; ++ T* encoder_output = reinterpret_cast(inputs[param->in_idx++]); ++ T* weight_q = reinterpret_cast(inputs[param->in_idx++]); ++ fastertransformer::CublasGemmWrapper(weight_q, ++ from_tensor, ++ qkv_buf, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_data_types, ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++ gemm_dims[0] = 2 * param->hidden_size; ++ gemm_dims[1] = param->batch_size * param->tgt_seq_len; ++ gemm_lds[0] = 2 * param->hidden_size; ++ gemm_lds[2] = 2 * param->hidden_size; ++ T* weight_kv = reinterpret_cast(inputs[param->in_idx++]); ++ fastertransformer::CublasGemmWrapper(weight_kv, ++ encoder_output, ++ qkv_buf + (param->batch_size * param->src_seq_len) * param->hidden_size, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_data_types, ++ &alpha, 
++ &beta, ++ param->cublas_handle, ++ param->algo); ++ ++ T* bias_qkv = (param->qkv_bias) ? reinterpret_cast(inputs[param->in_idx++]) : nullptr; ++ invokeCrossAddFusedQKVBiasTranspose(q_buf_2, ++ output1, ++ output2, ++ qkv_buf, ++ bias_qkv, ++ param->batch_size, ++ param->src_seq_len, ++ param->tgt_seq_len, ++ param->head_num, ++ param->head_size, ++ param->stream); ++ } ++ else { ++ T* weight_qkv = reinterpret_cast(inputs[param->in_idx++]); ++ fastertransformer::CublasGemmWrapper(weight_qkv, ++ from_tensor, ++ qkv_buf, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ const_cast(gemm_data_types), ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++ T* bias_qkv = (param->qkv_bias) ? reinterpret_cast(inputs[param->in_idx++]) : nullptr; ++ fastertransformer::invokeAddFusedQKVBiasTranspose(static_cast(q_buf_2), ++ static_cast(output1), ++ static_cast(output2), ++ static_cast(qkv_buf), ++ bias_qkv, ++ param->batch_size, ++ param->src_seq_len, ++ param->head_num, ++ param->head_size, ++ 0, ++ param->stream); ++ } ++ gemm_ops[0] = CUBLAS_OP_T; ++ ++ gemm_lds[0] = param->head_size; ++ gemm_lds[1] = param->head_size; ++ gemm_lds[2] = param->tgt_seq_len; ++ ++ int gemm_strides[] = {(int)(param->tgt_seq_len * param->head_size), ++ (int)(param->src_seq_len * param->head_size), ++ (int)(param->src_seq_len * param->tgt_seq_len)}; ++ ++ gemm_dims[0] = param->tgt_seq_len; ++ gemm_dims[1] = param->src_seq_len; ++ gemm_dims[2] = param->head_size; ++ ++ fastertransformer::CublasGemmStridedBatchedWrapper(output1, ++ q_buf_2, ++ qk_buf, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_strides, ++ const_cast(gemm_data_types), ++ &alpha, ++ &beta, ++ param->batch_size * param->head_num, ++ param->cublas_handle, ++ param->algo); ++ ++ T* attention_mask = reinterpret_cast(inputs[param->in_idx++]); ++ T* position_bias = (param->position_bias) ? 
reinterpret_cast(inputs[param->in_idx++]) : nullptr; ++ T scalar = static_cast(1.0f / sqrtf(param->head_size * 1.0f)); ++ fastertransformer::invokeMixMaskedSoftMax(static_cast(qk_buf), ++ attention_mask, ++ position_bias, ++ param->batch_size, ++ param->src_seq_len, ++ param->tgt_seq_len, ++ param->head_num, ++ scalar, ++ param->stream); ++ ++ gemm_ops[0] = CUBLAS_OP_N; ++ gemm_ops[1] = CUBLAS_OP_N; ++ gemm_dims[0] = param->head_size; ++ gemm_dims[1] = param->src_seq_len; ++ gemm_dims[2] = param->tgt_seq_len; ++ ++ gemm_lds[0] = param->head_size; ++ gemm_lds[1] = param->tgt_seq_len; ++ gemm_lds[2] = param->head_size; ++ ++ gemm_strides[0] = param->tgt_seq_len * param->head_size; ++ gemm_strides[1] = param->src_seq_len * param->tgt_seq_len; ++ gemm_strides[2] = param->src_seq_len * param->head_size; ++ fastertransformer::CublasGemmStridedBatchedWrapper(output2, ++ qk_buf, ++ qkv_buf_2, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_strides, ++ const_cast(gemm_data_types), ++ &alpha, ++ &beta, ++ param->batch_size * param->head_num, ++ param->cublas_handle, ++ param->algo); ++ ++ invokeTransposeQKV(static_cast(qkv_buf_3), ++ static_cast(qkv_buf_2), ++ param->batch_size, ++ param->src_seq_len, ++ param->head_num, ++ param->head_size, ++ param->stream); ++ gemm_ops[0] = CUBLAS_OP_N; ++ gemm_ops[1] = CUBLAS_OP_N; ++ gemm_dims[0] = param->hidden_size; ++ gemm_dims[1] = param->batch_size * param->src_seq_len; ++ gemm_dims[2] = param->hidden_size; ++ ++ gemm_lds[0] = param->hidden_size; ++ gemm_lds[1] = param->hidden_size; ++ gemm_lds[2] = param->hidden_size; ++ fastertransformer::CublasGemmWrapper(reinterpret_cast(inputs[param->in_idx++]), ++ qkv_buf_3, ++ static_cast(output[0]), ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ const_cast(gemm_data_types), ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++ ++ if (param->projection_bias) { ++ int len = param->batch_size * param->src_seq_len; ++ invokeAddBias( ++ static_cast(output[0]), (const T*)(inputs[param->in_idx++]), len, param->hidden_size, param->stream); ++ } ++ return; ++} ++ ++template void ++forward_attn(float* inputs[], int in_len, float* output[], int out_len, attentionParamT* param, void* ws); ++template void ++forward_attn(half* inputs[], int in_len, half* output[], int out_len, attentionParamT* param, void* ws); ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/attention.h b/src/fastertransformer/layers/ms_layers/attention.h +new file mode 100644 +index 0000000..04623a1 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/attention.h +@@ -0,0 +1,19 @@ ++#pragma once ++ ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/layers/ms_layers/MSBaseLayer.h" ++#include "src/fastertransformer/layers/ms_layers/param.h" ++#include "src/fastertransformer/layers/ms_layers/gemm.h" ++ ++#include ++#include ++ ++namespace fastertransformer { ++ ++ ++template ++size_t GetAttnWorkspaceSize(attentionParamT* param); ++ ++template ++void forward_attn(T* inputs[], int in_len, T* output[], int out_len, attentionParamT* param, void* ws); ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/decoder.cc b/src/fastertransformer/layers/ms_layers/decoder.cc +new file mode 100644 +index 0000000..3ee6389 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/decoder.cc +@@ -0,0 +1,216 @@ ++ ++#include "src/fastertransformer/layers/decoder_layers/decoder.h" ++#include "src/fastertransformer/kernels/activation_kernels.h" 
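[Editorial sketch, not part of the patch] The decoder.cc added below sizes every scratch buffer with UP_DIV(x, ALIGN_SIZE) * ALIGN_SIZE, i.e. each logical buffer is rounded up to a multiple of 16 elements and the aligned lengths are then summed into one workspace allocation that forwardDecoder slices into sub-buffers. A self-contained illustration of that sizing idiom follows; only UP_DIV and the 16-element alignment come from the patch, the parameter values are hypothetical.

// Sketch of the alignment idiom used by GetDecoderLayerWorkspaceSize /
// GetEncoderLayerWorkspaceSize: round each logical buffer up to a multiple
// of ALIGN_SIZE elements, then sum the aligned lengths.
#include <cstddef>
#include <cstdio>

#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
constexpr size_t kAlignSize = 16;

static size_t AlignedLen(size_t elems) { return UP_DIV(elems, kAlignSize) * kAlignSize; }

int main() {
  // Example parameters only: batch 8, sequence 128, hidden 768, FFN 3072.
  size_t batch = 8, seq = 128, hidden = 768, ffn_hidden = 3072;
  size_t attn_out = AlignedLen(batch * seq * hidden);      // attention output buffer
  size_t ffn_buf  = AlignedLen(batch * seq * ffn_hidden);  // FFN intermediate buffer
  size_t total_elems = attn_out + ffn_buf;
  std::printf("workspace elements: %zu (float bytes: %zu)\n",
              total_elems, total_elems * sizeof(float));
  return 0;
}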
++#include "src/fastertransformer/kernels/add_residual_kernels.h" ++#include "src/fastertransformer/kernels/layernorm_kernels.h" ++#include "src/fastertransformer/kernels/unfused_attention_kernels.h" ++#include "src/fastertransformer/layers/ms_layers/attention.h" ++#include "src/fastertransformer/layers/ms_layers/ffn.h" ++ ++#include ++namespace fastertransformer { ++ ++#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) ++// #define UP_DIV(x, y) (x) ++#define ALIGN_SIZE 16 ++ ++template ++void printTensor(char* str, T* input, int size) { ++ printf("%s ",str); ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ ++ std::cout << input_host[k] << ","; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ if (k % 10 == 0) ++ std::cout << std::endl; + } -+ cublasGemmEx(cublas_handle, -+ trans_a, -+ trans_b, -+ m, -+ n, -+ k, -+ alpha, -+ a_addr, -+ type_a, -+ lda, -+ b_addr, -+ type_b, -+ ldb, -+ beta, -+ c_addr, -+ type_c, -+ ldc, -+ compute_type, -+ algo); ++ ++ std::cout << std::endl; ++ ++ free(input_host); +} + -+void CublasGemmStridedBatchedWrapper(const void* a_addr, -+ const void* b_addr, -+ void* c_addr, -+ const int* params, -+ const int* lds, -+ const cublasOperation_t* operations, -+ const int* strides, -+ const cudaDataType* data_types, -+ void* alpha, -+ void* beta, -+ int batch, -+ cublasHandle_t cublas_handle, -+ cublasGemmAlgo_t algo) ++template ++void isNan(char* str, T* input, int size) +{ -+ const int m = params[0]; -+ const int n = params[1]; -+ const int k = params[2]; -+ cublasOperation_t trans_a = operations[0]; -+ cublasOperation_t trans_b = operations[1]; -+ const int lda = lds[0]; -+ const int ldb = lds[1]; -+ const int ldc = lds[2]; -+ cudaDataType type_a = data_types[0]; -+ cudaDataType type_b = data_types[1]; -+ cudaDataType type_c = data_types[2]; -+ cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F_FAST_TF32; -+ // cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F_FAST_16F; ++ std::cout << str << " " << " size is " << size; ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); + -+ if ((type_a == CUDA_R_16F) && (type_b == CUDA_R_16F) && (type_c == CUDA_R_16F)) { -+ compute_type = CUBLAS_COMPUTE_16F; ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ if (std::isnan((float)input_host[k]) || std ::isinf((float)input_host[k])) { ++ std::cout << "found NAN or INF"; ++ break; ++ } + } -+ const int stride_a = strides[0]; -+ const int stride_b = strides[1]; -+ const int stride_c = strides[2]; -+ cublasGemmStridedBatchedEx(cublas_handle, -+ trans_a, -+ trans_b, -+ m, -+ n, -+ k, -+ alpha, -+ a_addr, -+ type_a, -+ lda, -+ stride_a, -+ b_addr, -+ type_b, -+ ldb, -+ stride_b, -+ beta, -+ c_addr, -+ type_c, -+ ldc, -+ stride_c, -+ batch, -+ compute_type, -+ algo); ++ ++ std::cout << std::endl; ++ free(input_host); +} + +template -+size_t GetAttnWorkspaceSize(encoderParamT* param) ++size_t GetDecoderLayerWorkspaceSize(decoderParamT* param) +{ -+ size_t size_q = ALIGN((param->batch_size * param->src_seq_len * param->hidden_size), ALIGN_SIZE); -+ size_t size_k = ALIGN((param->batch_size * param->tgt_seq_len * param->hidden_size), ALIGN_SIZE); -+ size_t size_v = size_k; -+ size_t qkv_len = size_q + size_k + size_v; -+ size_t qk_buf_len = -+ ALIGN(param->batch_size * param->head_num * param->src_seq_len * param->tgt_seq_len, ALIGN_SIZE); -+ size_t 
qkv_buf_2_len = ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); -+ size_t attn_out_size = -+ ALIGN(param->batch_size * param->head_num * param->head_size * param->tgt_seq_len, ALIGN_SIZE); -+ return (qkv_buf_2_len + 2 * attn_out_size + std::max(qkv_len, qk_buf_len)) * sizeof(T); ++ size_t attn_out = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE;; ++ size_t attn2_out = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE;; ++ ++ size_t ffn = UP_DIV(param->batch_size * param->src_seq_len * param->ffn_hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t ffn_size = (param->layernorm_post) ? ffn : (attn_out + ffn); ++ size_t out_size = (param->layernorm_post) ? attn_out + attn2_out : attn_out * 2 + attn2_out * 2; ++ return (std::max(fastertransformer::GetAttnWorkspaceSize(&(param->attn1)) * 2, ffn_size * sizeof(T)) + out_size * sizeof(T)+ fastertransformer::GetAttnWorkspaceSize(&(param->attn1))*4); ++} ++ ++template size_t GetDecoderLayerWorkspaceSize(decoderParamT* param); ++template size_t GetDecoderLayerWorkspaceSize(decoderParamT* param); ++ ++template ++void forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, decoderParamT* param, void* ws) ++{ ++ param->in_idx = 0; ++ size_t h_token_num = param->batch_size * param->src_seq_len; ++ T* from_tensor = reinterpret_cast(inputs[param->in_idx++]); ++ T* attn_out = reinterpret_cast(ws); ++ T* normed_from_tensor = reinterpret_cast(ws) + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ T* attn_ws = reinterpret_cast(normed_from_tensor) + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ T* normed_attn_out = normed_from_tensor; ++ T* attn2_out = reinterpret_cast(attn_ws) + fastertransformer::GetAttnWorkspaceSize(&(param->attn1)); ++ T* normed_from_tensor2 = reinterpret_cast(attn2_out) + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ T* attn2_ws = reinterpret_cast(normed_from_tensor2) + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ T* normed_attn2_out = normed_from_tensor2; ++ T* ffn_ws = attn2_ws + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ T* tmp_out = reinterpret_cast(output[0]); ++ if (std::is_same::value && param->ffn_fp16==true) { ++ tmp_out = ffn_ws + UP_DIV(param->batch_size * param->src_seq_len * param->ffn_hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ } ++ T* gamma1 = reinterpret_cast(inputs[param->in_idx++]); ++ T* beta1 = reinterpret_cast(inputs[param->in_idx++]); ++ invokeGeneralLayerNorm(normed_from_tensor, ++ reinterpret_cast(from_tensor), // from tensor ++ gamma1, // Gamma ++ beta1, // Beta ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps1); ++ inputs[--param->in_idx] = normed_from_tensor; ++ // if attention is embedded inside an decoder - fuse the bias to next layer normalization ++ int in_idx = param->in_idx; ++ bool projection_bias = param->attn1.projection_bias; ++ param->attn1.projection_bias=false; ++ ++ fastertransformer::forward_attn(reinterpret_cast(&inputs[param->in_idx]), in_len, &attn_out, 1, &(param->attn1), attn_ws); ++ param->attn1.projection_bias = projection_bias; ++ param->in_idx = param->attn1.in_idx + in_idx; ++ if (param->attn1.projection_bias) { ++ T* projection_bias = reinterpret_cast(inputs[param->in_idx++]); ++ T* 
gamma2 = reinterpret_cast(inputs[param->in_idx++]); ++ T* beta2 = reinterpret_cast(inputs[param->in_idx++]); ++ from_tensor = param->layernorm_post ? normed_from_tensor : from_tensor; ++ invokeGeneralAddBiasResidualPreLayerNorm(attn_out, ++ normed_attn_out, ++ from_tensor, ++ gamma2, // gamma ++ beta2, // beta ++ projection_bias, ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps2); ++ ++ } else { ++ // without projection bias ++ } ++ inputs[--param->in_idx] = normed_attn_out; ++ in_idx = param->in_idx; ++ projection_bias = param->attn2.projection_bias; ++ param->attn2.projection_bias=false; ++ fastertransformer::forward_attn(reinterpret_cast(&inputs[param->in_idx]), in_len, &attn2_out, 1, &(param->attn2), attn2_ws); ++ ++ param->attn2.projection_bias = projection_bias; ++ param->in_idx = param->attn2.in_idx + in_idx; ++ if (param->attn2.projection_bias) { ++ T* projection_bias = reinterpret_cast(inputs[param->in_idx++]); ++ T* gamma3 = reinterpret_cast(inputs[param->in_idx++]); ++ T* beta3 = reinterpret_cast(inputs[param->in_idx++]); ++ if (std::is_same::value || param->ffn_fp16==false) { ++ invokeGeneralAddBiasResidualPreLayerNorm(attn2_out, ++ normed_attn2_out, ++ attn_out, ++ gamma3, // gamma ++ beta3, // beta ++ projection_bias, ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps3); ++ ++ ++ } else { ++ invokeGeneralAddBiasResidualPreLayerNormCast(attn2_out, ++ reinterpret_cast(normed_attn2_out), ++ attn_out, ++ gamma3, // gamma ++ beta3, // beta ++ projection_bias, ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps3); ++ } ++ } else { ++ // without projection bias ++ } ++ inputs[--param->in_idx] = normed_attn2_out; ++ if (param->ffn_fp16==false) { ++ fastertransformer::forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, param, ffn_ws); ++ ++ } else { ++ fastertransformer::forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, param, ffn_ws); ++ } ++ attn2_out = param->layernorm_post ? 
normed_attn2_out : attn2_out; ++ if (std::is_same::value || param->ffn_fp16==false) { ++ invokeAddBiasResidual(reinterpret_cast(tmp_out), ++ attn2_out, ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias ++ h_token_num, ++ param->hidden_size, ++ param->stream); ++ ++ } else { ++ if(param->layernorm_post){ ++ invokeAddBiasResidualSameTypeCast(reinterpret_cast(tmp_out), ++ reinterpret_cast(attn2_out), ++ reinterpret_cast(output[0]), ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias ++ h_token_num, ++ param->hidden_size, ++ param->stream); ++ ++ } else{ ++ invokeAddBiasResidualCast(reinterpret_cast(tmp_out), ++ reinterpret_cast(attn2_out), ++ reinterpret_cast(output[0]), ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias ++ h_token_num, ++ param->hidden_size, ++ param->stream); ++ ++ } ++ } ++ return; ++} ++ ++template void ++forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, decoderParamT* param, void* ws); ++template void ++forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, decoderParamT* param, void* ws); ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/decoder.h b/src/fastertransformer/layers/ms_layers/decoder.h +new file mode 100644 +index 0000000..7c2ea9e +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/decoder.h +@@ -0,0 +1,17 @@ ++#pragma once ++ ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/layers/ms_layers/param.h" ++ ++#include "src/fastertransformer/layers/decoder_layers/BaseDecoderLayer.h" ++#include ++#include ++ ++namespace fastertransformer { ++ ++template ++size_t GetDecoderLayerWorkspaceSize(decoderParamT* param); ++ ++template ++void forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, decoderParamT* param, void* ws); ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/encoder.cc b/src/fastertransformer/layers/ms_layers/encoder.cc +new file mode 100644 +index 0000000..6e7f546 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/encoder.cc +@@ -0,0 +1,235 @@ ++ ++#include "src/fastertransformer/layers/ms_layers/encoder.h" ++#include "src/fastertransformer/layers/ms_layers/attention.h" ++#include "src/fastertransformer/layers/ms_layers/ffn.h" ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/kernels/add_residual_kernels.h" ++#include "src/fastertransformer/kernels/layernorm_kernels.h" ++#include "src/fastertransformer/kernels/unfused_attention_kernels.h" ++#include ++namespace fastertransformer { ++ ++#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) ++// #define UP_DIV(x, y) (x) ++#define ALIGN_SIZE 16 ++ ++template ++void printTensor(char* str, T* input, int size) { ++ printf("%s ",str); ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ ++ std::cout << input_host[k] << ","; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ } ++ ++ std::cout << std::endl; ++ ++ free(input_host); ++} ++ ++template ++void isNan(char* str, T* input, int size) ++{ ++ std::cout << str << " " << " size is " << size; ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ if 
(std::isnan((float)input_host[k]) || std ::isinf((float)input_host[k])) { ++ std::cout << "found NAN or INF"; ++ break; ++ } ++ } ++ ++ std::cout << std::endl; ++ free(input_host); +} + -+template size_t GetAttnWorkspaceSize(encoderParamT* param); -+template size_t GetAttnWorkspaceSize(encoderParamT* param); +template +size_t GetEncoderLayerWorkspaceSize(encoderParamT* param) +{ -+ size_t max_hidden = ALIGN(std::max(param->hidden_size, param->ffn_hidden_size),ALIGN_SIZE); -+ size_t compress_buffer_len = ALIGN(param->batch_size * param->src_seq_len * max_hidden,ALIGN_SIZE); -+ size_t padding_len = ALIGN(param->batch_size * param->src_seq_len,ALIGN_SIZE); -+ size_t offset_len = ALIGN(param->batch_size,ALIGN_SIZE); -+ size_t d_token_len = ALIGN(1,ALIGN_SIZE); -+ size_t eft_size = compress_buffer_len * sizeof(T) + (padding_len + offset_len) * sizeof(int) + d_token_len * sizeof(size_t); -+ size_t attn_out = ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); -+ size_t ffn = ALIGN(param->batch_size * param->src_seq_len * param->ffn_hidden_size, ALIGN_SIZE); -+ return (std::max(GetAttnWorkspaceSize(param), ffn * sizeof(T)) + (attn_out * 3) * sizeof(T)) + eft_size; ++ size_t attn_out = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE;; ++ size_t ffn = UP_DIV(param->batch_size * param->src_seq_len * param->ffn_hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t ffn_size = (param->layernorm_post) ? ffn : (attn_out + ffn); ++ size_t out_size = (param->layernorm_post) ? attn_out : attn_out * 2; ++ return (std::max(fastertransformer::GetAttnWorkspaceSize(&(param->attn)), ffn_size * sizeof(T)) + out_size * sizeof(T)); +} + +template size_t GetEncoderLayerWorkspaceSize(encoderParamT* param); +template size_t GetEncoderLayerWorkspaceSize(encoderParamT* param); + -+template -+void forward_ffn(T* inputs[], int in_len, T* output[], int out_len, encoderParamT* param, void* ws) -+{ -+ size_t inter_size = param->ffn_hidden_size; -+ size_t h_token_num = param->h_token_num; -+ cublasOperation_t gemm_ops[] = {CUBLAS_OP_N, CUBLAS_OP_N}; -+ cudaDataType gemm_data_types[] = {CUDA_R_32F, CUDA_R_32F, CUDA_R_32F}; -+ if ((std::is_same::value) || (std::is_same::value)) { -+ gemm_data_types[0] = CUDA_R_16F; -+ gemm_data_types[1] = CUDA_R_16F; -+ gemm_data_types[2] = CUDA_R_16F; -+ } -+ S alpha = 1.0f; -+ S beta = 0.0f; -+ -+ int gemm_dims[] = {(int)inter_size, (int)h_token_num, (int)param->hidden_size}; -+ int gemm_lds[] = {(int)inter_size, (int)param->hidden_size, (int)inter_size}; -+ T* normed_attn_out = reinterpret_cast(inputs[param->in_idx++]); -+ CublasGemmWrapper(inputs[param->in_idx++], -+ normed_attn_out, -+ ws, -+ gemm_dims, -+ gemm_lds, -+ gemm_ops, -+ gemm_data_types, -+ &alpha, -+ &beta, -+ param->cublas_handle, -+ param->algo); -+ invokeAddBiasGelu(reinterpret_cast(ws), -+ reinterpret_cast(inputs[param->in_idx++]), -+ h_token_num, -+ inter_size, -+ param->stream); -+ gemm_dims[0] = param->hidden_size; -+ gemm_dims[1] = h_token_num; -+ gemm_dims[2] = inter_size; -+ gemm_lds[0] = param->hidden_size; -+ gemm_lds[1] = inter_size; -+ gemm_lds[2] = param->hidden_size; -+ CublasGemmWrapper(inputs[param->in_idx++], -+ ws, -+ output[0], -+ gemm_dims, -+ gemm_lds, -+ gemm_ops, -+ gemm_data_types, -+ &alpha, -+ &beta, -+ param->cublas_handle, -+ param->algo); -+} -+ +template +void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, encoderParamT* param, void* ws) +{ ++ // 
std::cout<has_bias<has_beta<attn.position_bias<attn.projection_bias<in_idx = 0; + size_t h_token_num = param->batch_size * param->src_seq_len; -+ param->h_token_num = h_token_num; -+ param->padding_offset = nullptr; -+ int* d_sequence_lengths = nullptr; -+ T* input_tensor = reinterpret_cast(inputs[param->in_idx++]); -+ T* from_tensor = input_tensor; -+ T* compress_buffer; -+ compress_buffer = reinterpret_cast(ws); -+ ws = reinterpret_cast(reinterpret_cast(ws) + ALIGN(h_token_num * param->hidden_size,ALIGN_SIZE)); -+ int* padding_offset = reinterpret_cast(ws); -+ ws = reinterpret_cast(reinterpret_cast(ws) + ALIGN(param->batch_size * param->src_seq_len,ALIGN_SIZE)); -+ d_sequence_lengths = reinterpret_cast(ws); -+ param->d_sequence_length = d_sequence_lengths; -+ ws = reinterpret_cast(reinterpret_cast(ws) + ALIGN(param->batch_size,ALIGN_SIZE)); -+ size_t* d_token_num = reinterpret_cast(ws); -+ ws = reinterpret_cast(reinterpret_cast(ws) + ALIGN(1,ALIGN_SIZE)); -+ invokeBuildSequnceLength( -+ from_tensor, param->batch_size, d_sequence_lengths, param->src_seq_len, param->hidden_size, param->stream); -+ // printTensor("seq_len=",d_sequence_lengths,param->batch_size); -+ invokeGetPaddingOffset(&h_token_num, -+ d_token_num, -+ padding_offset, -+ d_sequence_lengths, -+ param->batch_size, -+ param->src_seq_len, -+ param->stream); -+ // std::cout << "token=" << h_token_num << "m=" << param->batch_size * param->src_seq_len << std::endl; -+ if (h_token_num * 2 <= param->batch_size * param->src_seq_len) { -+ param->eft = true; -+ invokeRemovePadding(compress_buffer, -+ (const T*)from_tensor, -+ padding_offset, -+ h_token_num, -+ param->head_num * param->head_size, -+ param->stream); -+ param->h_token_num = h_token_num; -+ param->padding_offset = padding_offset; -+ from_tensor = compress_buffer; -+ } -+ h_token_num = param->h_token_num; ++ T* from_tensor = reinterpret_cast(inputs[param->in_idx++]); + T* attn_out = reinterpret_cast(ws); -+ T* normed_from_tensor = -+ reinterpret_cast(ws) + ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); ++ T* normed_from_tensor = reinterpret_cast(ws) + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; + T* attn_ws_offset = (param->layernorm_post) ? 
reinterpret_cast(ws) : reinterpret_cast(normed_from_tensor); -+ T* attn_ws = attn_ws_offset + ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); ++ T* attn_ws = attn_ws_offset + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; + T* normed_attn_out = normed_from_tensor; -+ T* ffn_ws = normed_attn_out + ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); -+ ++ T* ffn_ws = normed_attn_out + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ + T* tmp_out = reinterpret_cast(output[0]); -+ if (param->padding_offset != nullptr || (std::is_same::value && param->ffn_fp16 == true)) { -+ tmp_out = ffn_ws + ALIGN(param->batch_size * param->src_seq_len * param->ffn_hidden_size, ALIGN_SIZE); -+ } -+ T* tmp_out1 = reinterpret_cast(output[0]); -+ T* out_buf = tmp_out; -+ if (param->padding_offset != nullptr) { -+ tmp_out1 = compress_buffer; ++ if (std::is_same::value && param->ffn_fp16==true) { ++ tmp_out = ffn_ws + UP_DIV(param->batch_size * param->src_seq_len * param->ffn_hidden_size, ALIGN_SIZE) * ALIGN_SIZE; + } ++ + if (param->layernorm_post == false) { -+ T* gamma1 = reinterpret_cast(inputs[param->in_idx++]); -+ T* beta1 = reinterpret_cast(inputs[param->in_idx++]); -+ -+ invokeGeneralLayerNorm(normed_from_tensor, -+ reinterpret_cast(from_tensor), // from tensor -+ gamma1, // Gamma -+ beta1, // Beta -+ h_token_num, -+ param->hidden_size, -+ param->stream, -+ param->eps1); -+ } -+ else { ++ T* gamma1 = reinterpret_cast(inputs[param->in_idx++]); ++ T* beta1 = (param->has_beta) ? reinterpret_cast(inputs[param->in_idx++]) : nullptr; ++ invokeGeneralLayerNorm(normed_from_tensor, ++ reinterpret_cast(from_tensor), // from tensor ++ gamma1, // Gamma ++ beta1, // Beta ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps1); ++ } else { + normed_from_tensor = from_tensor; + } ++ + inputs[--param->in_idx] = normed_from_tensor; + // if attention is embedded inside an encoder - fuse the bias to next layer normalization -+ bool projection_bias = param->projection_bias; -+ param->projection_bias = false; + int in_idx = param->in_idx; -+ forward_attn(reinterpret_cast(&inputs[param->in_idx]), in_len, &attn_out, 1, param, attn_ws); -+ param->in_idx += in_idx; -+ param->projection_bias = projection_bias; -+ if (param->projection_bias) { -+ T* projection_bias = reinterpret_cast(inputs[param->in_idx++]); -+ T* gamma2 = reinterpret_cast(inputs[param->in_idx++]); -+ T* beta2 = reinterpret_cast(inputs[param->in_idx++]); -+ if (param->layernorm_post == false) { -+ if (std::is_same::value || param->ffn_fp16 == false) { -+ invokeGeneralAddBiasResidualPreLayerNorm(attn_out, -+ normed_attn_out, -+ from_tensor, -+ gamma2, // gamma -+ beta2, // beta -+ projection_bias, -+ h_token_num, -+ param->hidden_size, -+ param->stream, -+ param->eps2); -+ } -+ else { -+ invokeGeneralAddBiasResidualPreLayerNormCast(attn_out, -+ reinterpret_cast(normed_attn_out), -+ from_tensor, -+ gamma2, // gamma -+ beta2, // beta -+ projection_bias, -+ h_token_num, -+ param->hidden_size, -+ param->stream, -+ param->eps2); -+ } ++ bool is_projection_bias = param->attn.projection_bias; ++ param->attn.projection_bias = false; ++ fastertransformer::forward_attn(reinterpret_cast(&inputs[param->in_idx]), in_len, &attn_out, 1, &(param->attn), attn_ws); ++ param->attn.projection_bias = is_projection_bias; ++ param->in_idx = param->attn.in_idx + in_idx; ++ // std::cout<<"index: 
"<in_idx<attn.projection_bias) ? reinterpret_cast(inputs[param->in_idx++]) : nullptr; ++ T* gamma2 = reinterpret_cast(inputs[param->in_idx++]); ++ T* beta2 = (param->has_beta) ? reinterpret_cast(inputs[param->in_idx++]) : nullptr; ++ // std::cout<<"index: "<in_idx<layernorm_post == false) { ++ if (std::is_same::value || param->ffn_fp16==false) { ++ invokeGeneralAddBiasResidualPreLayerNorm(attn_out, ++ normed_attn_out, ++ from_tensor, ++ gamma2, // gamma ++ beta2, // beta ++ projection_bias, ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps2); ++ } else { ++ invokeGeneralAddBiasResidualPreLayerNormCast(attn_out, ++ reinterpret_cast(normed_attn_out), ++ from_tensor, ++ gamma2, // gamma ++ beta2, // beta ++ projection_bias, ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps2); + } -+ else { -+ if (std::is_same::value || param->ffn_fp16 == false) { -+ invokeAddBiasResidualLayerNorm(attn_out, -+ from_tensor, -+ projection_bias, -+ gamma2, // gamma -+ beta2, // beta -+ h_token_num, -+ param->hidden_size, -+ param->stream, -+ param->eps1); ++ } else { ++ if (std::is_same::value || param->ffn_fp16==false) { ++ invokeAddBiasResidualLayerNorm( ++ attn_out, ++ from_tensor, ++ projection_bias, ++ gamma2, // gamma ++ beta2, // beta ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps1); + normed_attn_out = attn_out; -+ } -+ else { -+ invokeAddBiasResidualLayerNormCast(reinterpret_cast(attn_out), -+ reinterpret_cast(normed_attn_out), -+ reinterpret_cast(from_tensor), -+ projection_bias, -+ gamma2, // gamma -+ beta2, // beta -+ h_token_num, -+ param->hidden_size, -+ param->stream, -+ param->eps1); ++ } else { ++ invokeAddBiasResidualLayerNormCast( ++ reinterpret_cast(attn_out), ++ reinterpret_cast(normed_attn_out), ++ reinterpret_cast(from_tensor), ++ projection_bias, ++ gamma2, // gamma ++ beta2, // beta ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps1); + // isNan((char*)"LN 1 model", (half*)attn_out, h_token_num * param->hidden_size); -+ } + } + } -+ else { -+ // without projection bias -+ } -+ // forward ffn ++ // forward ffn + // simulate attention inputs + inputs[--param->in_idx] = normed_attn_out; -+ if (param->ffn_fp16 == false) { -+ forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, param, ffn_ws); -+ } -+ else { -+ forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, param, ffn_ws); ++ if (param->ffn_fp16==false) { ++ fastertransformer::forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, param, ffn_ws); ++ // std::cout<<"index: "<in_idx<(reinterpret_cast(inputs), in_len, &tmp_out, 1, param, ffn_ws); + } ++ T* ffn_bias = (param->attn.projection_bias) ? reinterpret_cast(inputs[param->in_idx++]) : nullptr; ++ // std::cout<<"index: "<in_idx<layernorm_post == true) { -+ if (std::is_same::value || param->ffn_fp16 == false) { ++ T* gamma3 = reinterpret_cast(inputs[param->in_idx++]); ++ T* beta3 = (param->has_beta) ? 
reinterpret_cast(inputs[param->in_idx++]) : nullptr; ++ if (std::is_same::value || param->ffn_fp16==false) { + invokeAddBiasResidualLayerNorm(reinterpret_cast(tmp_out), -+ attn_out, -+ reinterpret_cast(inputs[param->in_idx++]), // FFN bias, -+ reinterpret_cast(inputs[param->in_idx++]), // Gamma -+ reinterpret_cast(inputs[param->in_idx++]), // Beta -+ h_token_num, -+ param->hidden_size, -+ param->stream, -+ param->eps2); -+ } -+ else { ++ attn_out, ++ ffn_bias, // FFN bias, ++ gamma3, // Gamma ++ beta3, // Beta ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps2); ++ ++ } else { + invokeAddBiasResidualLayerNormCast( -+ reinterpret_cast(tmp_out), -+ reinterpret_cast(tmp_out1), -+ reinterpret_cast(normed_attn_out), -+ reinterpret_cast(inputs[param->in_idx++]), // FFN bias, -+ reinterpret_cast(inputs[param->in_idx++]), // Gamma -+ reinterpret_cast(inputs[param->in_idx++]), // Beta -+ h_token_num, -+ param->hidden_size, -+ param->stream, -+ param->eps2); -+ out_buf = tmp_out1; ++ reinterpret_cast(tmp_out), ++ reinterpret_cast(output[0]), ++ reinterpret_cast(normed_attn_out), ++ ffn_bias, // FFN bias, ++ gamma3, // Gamma ++ beta3, // Beta ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps2); + } -+ } -+ else { -+ if (std::is_same::value || param->ffn_fp16 == false) { ++ } else { ++ if (std::is_same::value || param->ffn_fp16==false) { + invokeAddBiasResidual(reinterpret_cast(tmp_out), + attn_out, -+ reinterpret_cast(inputs[param->in_idx++]), // FFN bias ++ ffn_bias, // FFN bias + h_token_num, + param->hidden_size, + param->stream); + } + else { -+ invokeAddBiasResidualCast(reinterpret_cast(tmp_out), -+ reinterpret_cast(attn_out), -+ reinterpret_cast(tmp_out1), -+ reinterpret_cast(inputs[param->in_idx++]), // FFN bias -+ h_token_num, -+ param->hidden_size, -+ param->stream); -+ } -+ } -+ if (param->padding_offset != nullptr) { -+ cudaMemsetAsync(output[0], -+ 0, -+ param->batch_size * param->src_seq_len * param->head_size * param->head_num * sizeof(T), -+ param->stream); -+ invokeRebuildPadding( -+ (T*)output[0], out_buf, param->padding_offset, h_token_num, param->hidden_size, param->stream); ++ invokeAddBiasResidualCast(reinterpret_cast(tmp_out), ++ reinterpret_cast(attn_out), ++ reinterpret_cast(output[0]), ++ ffn_bias, // FFN bias ++ h_token_num, ++ param->hidden_size, ++ param->stream); ++ } + } ++ + return; +} + @@ -6805,323 +10409,373 @@ index 0000000..004718e +forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, encoderParamT* param, void* ws); +template void +forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, encoderParamT* param, void* ws); ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/encoder.h b/src/fastertransformer/layers/ms_layers/encoder.h +new file mode 100644 +index 0000000..bdfa7be +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/encoder.h +@@ -0,0 +1,16 @@ ++#pragma once + -+template -+void forward_attn(T* inputs[], int in_len, T* output[], int out_len, encoderParamT* param, void* ws) -+{ -+ param->in_idx = 0; -+ auto extra_tmp_size = -+ ALIGN(param->batch_size * param->head_num * param->head_size * param->tgt_seq_len, ALIGN_SIZE); -+ size_t size_q = ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); -+ size_t q_buf_2_len = size_q; -+ size_t qk_buf_len = -+ ALIGN(param->batch_size * param->head_num * param->src_seq_len * param->tgt_seq_len, ALIGN_SIZE); -+ size_t qkv_buf_2_len = ALIGN(param->batch_size * 
param->src_seq_len * param->hidden_size, ALIGN_SIZE); -+ T* q_buf_2 = (T*)ws; -+ T* output1 = static_cast(ws) + q_buf_2_len; -+ T* output2 = static_cast(output1) + extra_tmp_size; -+ T* qkv_buf = static_cast(output2) + extra_tmp_size; -+ T* qk_buf = qkv_buf; -+ T* qkv_buf_2 = q_buf_2; -+ T* qkv_buf_3 = qk_buf; -+ int gemm_dims[] = {3 * (int)param->hidden_size, (int)param->h_token_num, (int)param->hidden_size}; -+ int gemm_lds[] = {3 * (int)param->hidden_size, (int)param->hidden_size, 3 * (int)param->hidden_size}; -+ T* from_tensor = reinterpret_cast(inputs[param->in_idx++]); -+ cublasOperation_t gemm_ops[] = {CUBLAS_OP_N, CUBLAS_OP_N}; -+ cudaDataType gemm_data_types[] = {CUDA_R_32F, CUDA_R_32F, CUDA_R_32F}; -+ if (std::is_same::value) { -+ gemm_data_types[0] = CUDA_R_16F; -+ gemm_data_types[1] = CUDA_R_16F; -+ gemm_data_types[2] = CUDA_R_16F; -+ } -+ T alpha = 1.0f; -+ T beta = 0.0f; -+ -+ if (param->is_cross) { -+ gemm_dims[0] = param->hidden_size; -+ gemm_dims[1] = param->batch_size * param->src_seq_len; -+ gemm_dims[2] = param->hidden_size; -+ gemm_lds[0] = param->hidden_size; -+ gemm_lds[1] = param->hidden_size; -+ gemm_lds[2] = param->hidden_size; -+ T* encoder_output = reinterpret_cast(inputs[param->in_idx++]); -+ T* weight_q = reinterpret_cast(inputs[param->in_idx++]); ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/layers/ms_layers/MSBaseLayer.h" ++#include "src/fastertransformer/layers/ms_layers/param.h" ++#include ++#include + -+ CublasGemmWrapper(weight_q, -+ from_tensor, -+ qkv_buf, -+ gemm_dims, -+ gemm_lds, -+ gemm_ops, -+ gemm_data_types, -+ &alpha, -+ &beta, -+ param->cublas_handle, -+ param->algo); -+ gemm_dims[0] = 2 * param->hidden_size; -+ gemm_dims[1] = param->batch_size * param->tgt_seq_len; -+ gemm_lds[0] = 2 * param->hidden_size; -+ gemm_lds[2] = 2 * param->hidden_size; -+ T* weight_kv = reinterpret_cast(inputs[param->in_idx++]); ++namespace fastertransformer { + -+ CublasGemmWrapper(weight_kv, -+ encoder_output, -+ qkv_buf + (param->batch_size * param->src_seq_len) * param->hidden_size, -+ gemm_dims, -+ gemm_lds, -+ gemm_ops, -+ gemm_data_types, -+ &alpha, -+ &beta, -+ param->cublas_handle, -+ param->algo); ++template ++size_t GetEncoderLayerWorkspaceSize(encoderParamT* param); + -+ T* bias_qkv = (param->qkv_bias) ? reinterpret_cast(inputs[param->in_idx++]) : nullptr; -+ invokeCrossAddFusedQKVBiasTranspose(q_buf_2, -+ output1, -+ output2, -+ qkv_buf, -+ bias_qkv, -+ param->batch_size, -+ param->src_seq_len, -+ param->tgt_seq_len, -+ param->head_num, -+ param->head_size, -+ param->stream); -+ } -+ else { -+ T* weight_qkv = reinterpret_cast(inputs[param->in_idx++]); -+ CublasGemmWrapper(weight_qkv, -+ from_tensor, -+ qkv_buf, -+ gemm_dims, -+ gemm_lds, -+ gemm_ops, -+ const_cast(gemm_data_types), -+ &alpha, -+ &beta, -+ param->cublas_handle, -+ param->algo); ++template ++void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, encoderParamT* param, void* ws); ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/ffn.cc b/src/fastertransformer/layers/ms_layers/ffn.cc +new file mode 100644 +index 0000000..9dc7f04 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/ffn.cc +@@ -0,0 +1,114 @@ + -+ T* bias_qkv = (param->qkv_bias) ? 
reinterpret_cast(inputs[param->in_idx++]) : nullptr; -+ if (param->padding_offset == nullptr) { -+ invokeAddFusedQKVBiasTranspose(static_cast(q_buf_2), -+ static_cast(output1), -+ static_cast(output2), -+ static_cast(qkv_buf), -+ bias_qkv, -+ param->batch_size, -+ param->src_seq_len, -+ param->head_num, -+ param->head_size, -+ 0, -+ param->stream); -+ } -+ else { -+ invokeAddFusedZP_QKVBiasTranspose(static_cast(q_buf_2), -+ static_cast(output1), -+ static_cast(output2), -+ static_cast(qkv_buf), -+ bias_qkv, -+ param->batch_size, -+ param->src_seq_len, -+ param->head_num, -+ param->head_size, -+ param->h_token_num, -+ param->padding_offset, -+ param->stream); -+ } -+ } -+ gemm_ops[0] = CUBLAS_OP_T; -+ gemm_ops[1] = CUBLAS_OP_N; -+ gemm_dims[0] = param->tgt_seq_len; -+ gemm_dims[1] = param->src_seq_len; -+ gemm_dims[2] = param->head_size; ++#include "src/fastertransformer/layers/ms_layers/ffn.h" ++#include "src/fastertransformer/layers/ms_layers/gemm.h" ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/kernels/add_residual_kernels.h" ++#include "src/fastertransformer/kernels/layernorm_kernels.h" ++#include "src/fastertransformer/kernels/unfused_attention_kernels.h" ++#include ++namespace fastertransformer { + -+ gemm_lds[0] = param->head_size; -+ gemm_lds[1] = param->head_size; -+ gemm_lds[2] = param->tgt_seq_len; ++template ++void printTensor(char* str, T* input, int size) { ++ printf("%s ",str); ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); + -+ int gemm_strides[] = {(int)(param->tgt_seq_len * param->head_size), -+ (int)(param->src_seq_len * param->head_size), -+ (int)(param->src_seq_len * param->tgt_seq_len)}; ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); + -+ CublasGemmStridedBatchedWrapper(output1, -+ q_buf_2, -+ qk_buf, -+ gemm_dims, -+ gemm_lds, -+ gemm_ops, -+ gemm_strides, -+ const_cast(gemm_data_types), -+ &alpha, -+ &beta, -+ param->batch_size * param->head_num, -+ param->cublas_handle, -+ param->algo); ++ for (int k = 0; k < (int)size; k++) { + -+ T* attention_mask = reinterpret_cast(inputs[param->in_idx++]); -+ if (param->padding_offset != nullptr) -+ invokeBuildEncoderAttentionMask( -+ attention_mask, param->d_sequence_length, param->batch_size, param->src_seq_len, param->stream); -+ T* position_bias = nullptr; -+ if (param->position_bias) { -+ position_bias = reinterpret_cast(inputs[param->in_idx++]); ++ std::cout << input_host[k] << ","; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ if (k % 10 == 0) ++ std::cout << std::endl; + } -+ T scalar = static_cast(1.0f / sqrtf(param->head_size * 1.0f)); -+ invokeMixMaskedSoftMax(static_cast(qk_buf), -+ attention_mask, -+ position_bias, -+ param->batch_size, -+ param->src_seq_len, -+ param->tgt_seq_len, -+ param->head_num, -+ scalar, -+ param->stream); + -+ gemm_ops[0] = CUBLAS_OP_N; -+ gemm_ops[1] = CUBLAS_OP_N; -+ gemm_dims[0] = param->head_size; -+ gemm_dims[1] = param->src_seq_len; -+ gemm_dims[2] = param->tgt_seq_len; ++ std::cout << std::endl; + -+ gemm_lds[0] = param->head_size; -+ gemm_lds[1] = param->tgt_seq_len; -+ gemm_lds[2] = param->head_size; ++ free(input_host); ++} + -+ gemm_strides[0] = param->tgt_seq_len * param->head_size; -+ gemm_strides[1] = param->src_seq_len * param->tgt_seq_len; -+ gemm_strides[2] = param->src_seq_len * param->head_size; ++template ++void isNan(char* str, T* input, int size) ++{ ++ std::cout << str << " " << " size is " << size; ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * 
sizeof(T)); + -+ CublasGemmStridedBatchedWrapper(output2, -+ qk_buf, -+ qkv_buf_2, -+ gemm_dims, -+ gemm_lds, -+ gemm_ops, -+ gemm_strides, -+ const_cast(gemm_data_types), -+ &alpha, -+ &beta, -+ param->batch_size * param->head_num, -+ param->cublas_handle, -+ param->algo); ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); + -+ if (param->padding_offset == nullptr) { -+ invokeTransposeQKV(static_cast(qkv_buf_3), -+ static_cast(qkv_buf_2), -+ param->batch_size, -+ param->src_seq_len, -+ param->head_num, -+ param->head_size, -+ param->stream); -+ } -+ else { -+ invokeTransposeAttentionOutRemovePadding(qkv_buf_2, -+ qkv_buf_3, -+ param->h_token_num, -+ param->batch_size, -+ param->src_seq_len, -+ param->head_num, -+ param->head_size, -+ param->padding_offset, -+ param->stream); ++ for (int k = 0; k < (int)size; k++) { ++ if (std::isnan((float)input_host[k]) || std ::isinf((float)input_host[k])) { ++ std::cout << "found NAN or INF"; ++ break; ++ } + } -+ gemm_ops[0] = CUBLAS_OP_N; -+ gemm_ops[1] = CUBLAS_OP_N; -+ gemm_dims[0] = param->hidden_size; -+ gemm_dims[1] = param->h_token_num; -+ gemm_dims[2] = param->hidden_size; + ++ std::cout << std::endl; ++ free(input_host); ++} ++ ++template ++void forward_ffn(T* inputs[], int in_len, T* output[], int out_len, ParamT* param, void* ws) ++{ ++ size_t inter_size = param->ffn_hidden_size; ++ size_t h_token_num = param->batch_size * param->src_seq_len; ++ cublasOperation_t gemm_ops[] = {CUBLAS_OP_N, CUBLAS_OP_N}; ++ cudaDataType gemm_data_types[] = {CUDA_R_32F, CUDA_R_32F, CUDA_R_32F}; ++ if ((std::is_same::value) || (std::is_same::value)) { ++ gemm_data_types[0] = CUDA_R_16F; ++ gemm_data_types[1] = CUDA_R_16F; ++ gemm_data_types[2] = CUDA_R_16F; ++ } ++ S alpha = 1.0f; ++ S beta = 0.0f; ++ ++ int gemm_dims[] = {(int)inter_size, (int)h_token_num, (int)param->hidden_size}; ++ int gemm_lds[] = {(int)inter_size, (int)param->hidden_size, (int)inter_size}; ++ T* normed_attn_out = reinterpret_cast(inputs[param->in_idx++]); ++ fastertransformer::CublasGemmWrapper(inputs[param->in_idx++], ++ normed_attn_out, ++ ws, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_data_types, ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++ S* bias = (param->has_bias) ? 
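The `printTensor`/`isNan` helpers above are debug aids that copy a device buffer back to the host and scan it. A hedged equivalent using only the plain CUDA runtime (no `cudaD2Hcpy` wrapper), shown for `float`; it synchronizes the stream, so it is only suitable for debugging builds:

```cpp
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

// Debug-only: copy a device buffer to the host and report the first NaN/Inf.
inline bool HasNanOrInf(const char* tag, const float* device_ptr, std::size_t count,
                        cudaStream_t stream) {
  std::vector<float> host(count);
  cudaStreamSynchronize(stream);  // make sure the producing kernels have finished
  cudaMemcpy(host.data(), device_ptr, count * sizeof(float), cudaMemcpyDeviceToHost);
  for (std::size_t i = 0; i < count; ++i) {
    if (std::isnan(host[i]) || std::isinf(host[i])) {
      std::printf("%s: NaN/Inf at element %zu\n", tag, i);
      return true;
    }
  }
  return false;
}
```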
reinterpret_cast(inputs[param->in_idx++]) : nullptr; ++ invokeAddBiasGelu(reinterpret_cast(ws), ++ bias, ++ h_token_num, ++ inter_size, ++ param->stream); ++ gemm_dims[0] = param->hidden_size; ++ gemm_dims[1] = h_token_num; ++ gemm_dims[2] = inter_size; + gemm_lds[0] = param->hidden_size; -+ gemm_lds[1] = param->hidden_size; ++ gemm_lds[1] = inter_size; + gemm_lds[2] = param->hidden_size; -+ CublasGemmWrapper(reinterpret_cast(inputs[param->in_idx++]), -+ qkv_buf_3, -+ static_cast(output[0]), ++ fastertransformer::CublasGemmWrapper(inputs[param->in_idx++], ++ ws, ++ output[0], + gemm_dims, + gemm_lds, + gemm_ops, -+ const_cast(gemm_data_types), ++ gemm_data_types, + &alpha, + &beta, + param->cublas_handle, + param->algo); -+ if (param->projection_bias) { -+ int len = param->h_token_num; -+ invokeAddBias( -+ static_cast(output[0]), (const T*)(inputs[param->in_idx++]), len, param->hidden_size, param->stream); -+ } -+ return; +} + -+template void -+forward_attn(float* inputs[], int in_len, float* output[], int out_len, encoderParamT* param, void* ws); -+template void -+forward_attn(half* inputs[], int in_len, half* output[], int out_len, encoderParamT* param, void* ws); + +template void -+forward_ffn(float* inputs[], int in_len, float* output[], int out_len, encoderParamT* param, void* ws); ++forward_ffn(float* inputs[], int in_len, float* output[], int out_len, ParamT* param, void* ws); +template void -+forward_ffn(half* inputs[], int in_len, half* output[], int out_len, encoderParamT* param, void* ws); ++forward_ffn(half* inputs[], int in_len, half* output[], int out_len, ParamT* param, void* ws); +template void -+forward_ffn(float* inputs[], int in_len, float* output[], int out_len, encoderParamT* param, void* ws); ++forward_ffn(float* inputs[], int in_len, float* output[], int out_len, ParamT* param, void* ws); +} // namespace fastertransformer -diff --git a/src/fastertransformer/layers/encoder_layers/encoder.h b/src/fastertransformer/layers/encoder_layers/encoder.h +diff --git a/src/fastertransformer/layers/ms_layers/ffn.h b/src/fastertransformer/layers/ms_layers/ffn.h new file mode 100644 -index 0000000..ffba081 +index 0000000..9498dc8 --- /dev/null -+++ b/src/fastertransformer/layers/encoder_layers/encoder.h -@@ -0,0 +1,49 @@ ++++ b/src/fastertransformer/layers/ms_layers/ffn.h +@@ -0,0 +1,14 @@ +#pragma once + +#include "src/fastertransformer/kernels/activation_kernels.h" -+#include "src/fastertransformer/layers/encoder_layers/BaseEncoderLayer.h" ++#include "src/fastertransformer/layers/ms_layers/MSBaseLayer.h" ++#include "src/fastertransformer/layers/ms_layers/param.h" ++ +#include +#include + +namespace fastertransformer { + -+typedef struct { -+ size_t batch_size; -+ size_t src_seq_len; -+ size_t tgt_seq_len; -+ size_t head_num; -+ size_t head_size; -+ size_t hidden_size; -+ size_t h_token_num; -+ size_t ffn_hidden_size; // 4 * param->hidden_size; ++template ++void forward_ffn(T* inputs[], int in_len, T* output[], int out_len, ParamT* param, void* ws); ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/gemm.cc b/src/fastertransformer/layers/ms_layers/gemm.cc +new file mode 100644 +index 0000000..aabafb7 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/gemm.cc +@@ -0,0 +1,117 @@ ++ ++#include "src/fastertransformer/layers/ms_layers/gemm.h" ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/kernels/unfused_attention_kernels.h" ++#include ++namespace fastertransformer { ++ ++void 
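`forward_ffn` above is two GEMMs with a fused bias-plus-GELU in between, `[tokens x hidden] -> [tokens x ffn_hidden] -> [tokens x hidden]`, and the whole block can run in fp16 (`ffn_fp16`) even when the surrounding layer computes in fp32. A row-major host reference of the same math, purely to make the shapes concrete; the erf-based GELU here is an assumption (the fused kernel may use the tanh approximation), and the bias is unconditional whereas the patch makes it optional via `has_bias`.

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

using Mat = std::vector<float>;  // row-major, size rows * cols

// a: n x k, b: k x m, returns c: n x m
static Mat MatMul(const Mat& a, const Mat& b, std::size_t n, std::size_t k, std::size_t m) {
  Mat c(n * m, 0.f);
  for (std::size_t i = 0; i < n; ++i)
    for (std::size_t p = 0; p < k; ++p)
      for (std::size_t j = 0; j < m; ++j) c[i * m + j] += a[i * k + p] * b[p * m + j];
  return c;
}

static float Gelu(float x) { return 0.5f * x * (1.f + std::erf(x / std::sqrt(2.f))); }

// X[tokens x hidden] -> GELU(X * W1 + b1)[tokens x ffn_hidden] -> (* W2)[tokens x hidden]
Mat FfnReference(const Mat& x, const Mat& w1, const Mat& b1, const Mat& w2,
                 std::size_t tokens, std::size_t hidden, std::size_t ffn_hidden) {
  Mat inter = MatMul(x, w1, tokens, hidden, ffn_hidden);   // first GEMM
  for (std::size_t i = 0; i < tokens; ++i)                 // fused bias + GELU
    for (std::size_t j = 0; j < ffn_hidden; ++j)
      inter[i * ffn_hidden + j] = Gelu(inter[i * ffn_hidden + j] + b1[j]);
  return MatMul(inter, w2, tokens, ffn_hidden, hidden);    // second GEMM
}
```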
CublasGemmWrapper(const void* a_addr, ++ const void* b_addr, ++ void* c_addr, ++ const int* params, ++ const int* lds, ++ const cublasOperation_t* operations, ++ const cudaDataType* data_types, ++ void* alpha, ++ void* beta, ++ cublasHandle_t cublas_handle, ++ cublasGemmAlgo_t algo) ++{ ++ const int m = params[0]; ++ const int n = params[1]; ++ const int k = params[2]; ++ cublasOperation_t trans_a = operations[0]; ++ cublasOperation_t trans_b = operations[1]; ++ const int lda = lds[0]; ++ const int ldb = lds[1]; ++ const int ldc = lds[2]; ++ cudaDataType type_a = data_types[0]; ++ cudaDataType type_b = data_types[1]; ++ cudaDataType type_c = data_types[2]; ++ cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F_FAST_TF32; ++ // cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F_FAST_16F; ++ if ((type_a == CUDA_R_16F) && (type_b == CUDA_R_16F) && (type_c == CUDA_R_16F)) { ++ compute_type = CUBLAS_COMPUTE_16F; ++ } ++ cublasGemmEx(cublas_handle, ++ trans_a, ++ trans_b, ++ m, ++ n, ++ k, ++ alpha, ++ a_addr, ++ type_a, ++ lda, ++ b_addr, ++ type_b, ++ ldb, ++ beta, ++ c_addr, ++ type_c, ++ ldc, ++ compute_type, ++ algo); ++} ++ ++void CublasGemmStridedBatchedWrapper(const void* a_addr, ++ const void* b_addr, ++ void* c_addr, ++ const int* params, ++ const int* lds, ++ const cublasOperation_t* operations, ++ const int* strides, ++ const cudaDataType* data_types, ++ void* alpha, ++ void* beta, ++ int batch, ++ cublasHandle_t cublas_handle, ++ cublasGemmAlgo_t algo) ++{ ++ const int m = params[0]; ++ const int n = params[1]; ++ const int k = params[2]; ++ cublasOperation_t trans_a = operations[0]; ++ cublasOperation_t trans_b = operations[1]; ++ const int lda = lds[0]; ++ const int ldb = lds[1]; ++ const int ldc = lds[2]; ++ cudaDataType type_a = data_types[0]; ++ cudaDataType type_b = data_types[1]; ++ cudaDataType type_c = data_types[2]; ++ cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F_FAST_TF32; ++ // cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F_FAST_16F; ++ ++ if ((type_a == CUDA_R_16F) && (type_b == CUDA_R_16F) && (type_c == CUDA_R_16F)) { ++ compute_type = CUBLAS_COMPUTE_16F; ++ } ++ const int stride_a = strides[0]; ++ const int stride_b = strides[1]; ++ const int stride_c = strides[2]; ++ cublasGemmStridedBatchedEx(cublas_handle, ++ trans_a, ++ trans_b, ++ m, ++ n, ++ k, ++ alpha, ++ a_addr, ++ type_a, ++ lda, ++ stride_a, ++ b_addr, ++ type_b, ++ ldb, ++ stride_b, ++ beta, ++ c_addr, ++ type_c, ++ ldc, ++ stride_c, ++ batch, ++ compute_type, ++ algo); ++} ++ ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/gemm.h b/src/fastertransformer/layers/ms_layers/gemm.h +new file mode 100644 +index 0000000..21dd35c +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/gemm.h +@@ -0,0 +1,13 @@ ++#pragma once ++ ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/layers/ms_layers/MSBaseLayer.h" ++#include ++#include ++ ++namespace fastertransformer { ++ ++void CublasGemmWrapper(const void* a_addr, const void* b_addr, void* c_addr, const int* params, const int* lds, const cublasOperation_t* operations, const cudaDataType* data_types, void* alpha, void* beta, cublasHandle_t cublas_handle, cublasGemmAlgo_t algo); ++void CublasGemmStridedBatchedWrapper(const void* a_addr, const void* b_addr, void* c_addr, const int* params, const int* lds, const cublasOperation_t* operations, const int* strides, const cudaDataType* data_types, void* alpha, void* beta, int batch, cublasHandle_t 
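`CublasGemmWrapper` above packs `(m, n, k)`, the leading dimensions, the transpose flags and the dtypes into small arrays and selects `CUBLAS_COMPUTE_16F` only when all three dtypes are fp16 (TF32 fast path otherwise). For reference, a direct `cublasGemmEx` call equivalent to the first FFN GEMM issued through the wrapper, showing how a row-major product maps onto cuBLAS's column-major convention; using fp32 accumulation over fp16 data is a choice of this sketch, not of the wrapper.

```cpp
#include <cublas_v2.h>
#include <cuda_fp16.h>

// Row-major X[tokens x hidden] * W1[hidden x ffn] = Y[tokens x ffn] is expressed
// column-major as Y^T = W1^T * X^T, i.e. (m, n, k) = (ffn, tokens, hidden) with
// lds = (ffn, hidden, ffn) -- exactly the layout packed into gemm_dims / gemm_lds.
// Pointers are assumed to be device memory; error handling omitted for brevity.
inline cublasStatus_t FfnGemm1Fp16(cublasHandle_t handle, const __half* w1, const __half* x,
                                   __half* y, int tokens, int hidden, int ffn_hidden) {
  const float alpha = 1.0f, beta = 0.0f;
  return cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                      /*m=*/ffn_hidden, /*n=*/tokens, /*k=*/hidden,
                      &alpha,
                      w1, CUDA_R_16F, /*lda=*/ffn_hidden,
                      x,  CUDA_R_16F, /*ldb=*/hidden,
                      &beta,
                      y,  CUDA_R_16F, /*ldc=*/ffn_hidden,
                      CUBLAS_COMPUTE_32F,   // fp16 data, fp32 accumulation
                      CUBLAS_GEMM_DEFAULT);
}
```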
cublas_handle, cublasGemmAlgo_t algo); ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/param.h b/src/fastertransformer/layers/ms_layers/param.h +new file mode 100644 +index 0000000..09af694 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/param.h +@@ -0,0 +1,55 @@ ++#pragma once ++namespace fastertransformer { ++typedef struct{ ++ public: ++ size_t batch_size; ++ size_t src_seq_len; ++ size_t tgt_seq_len; ++ size_t head_num; ++ size_t head_size; ++ size_t hidden_size; ++ size_t h_token_num; ++ // handle ++ cublasHandle_t cublas_handle; ++ cudaStream_t stream; ++ cublasGemmAlgo_t algo; ++ size_t ffn_hidden_size; ++ // ctrls ++ int *padding_offset; ++ int in_idx; ++ bool has_bias; ++ ++} ParamT; ++ ++typedef struct : ParamT{ ++ bool qkv_bias; // ture ++ bool projection_bias; // ture ++ bool is_cross; // false ++ bool position_bias; ++ int *padding_offset; ++} attentionParamT; ++ ++typedef struct : ParamT{ ++ + bool ffn_fp16; + float eps1; + float eps2; -+ // handle -+ cublasHandle_t cublas_handle; -+ cudaStream_t stream; -+ cublasGemmAlgo_t algo; -+ // ctrls -+ int in_idx; -+ bool qkv_bias; // true -+ bool projection_bias; // true -+ bool is_cross; // false -+ bool position_bias; // false -+ bool layernorm_post; // dont care -+ bool eft; // false - effective fast trn ++ float eps3; ++ attentionParamT attn1; ++ attentionParamT attn2; ++ bool layernorm_post; ++ bool has_beta; + int *padding_offset; -+ int *d_sequence_length; -+} encoderParamT; ++} decoderParamT; + -+template -+size_t GetEncoderLayerWorkspaceSize(encoderParamT* param); ++typedef struct : ParamT{ + -+template -+size_t GetAttnWorkspaceSize(encoderParamT* param); -+template -+void forward_attn(T* inputs[], int in_len, T* output[], int out_len, encoderParamT* param, void* ws); -+template -+void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, encoderParamT* param, void* ws); -+// void forwardEncoder(std::vector > const* -+// inputs); -+} // namespace fastertransformer ++ bool ffn_fp16; ++ float eps1; ++ float eps2; ++ attentionParamT attn; ++ bool layernorm_post; ++ bool has_beta; ++ int *padding_offset; ++} encoderParamT; ++} +\ No newline at end of file diff --git a/src/fastertransformer/models/CMakeLists.txt b/src/fastertransformer/models/CMakeLists.txt index af33e76..97fc471 100644 --- a/src/fastertransformer/models/CMakeLists.txt diff --git a/trc/transformer/MultiHeadTester.py b/trc/transformer/MultiHeadTester.py old mode 100644 new mode 100755 index bfc72ef3123496f5b70ccdd0b0a08d1d680bb915..286bc5b75d17d7003b70b68a045b6fa6a2e00bfe --- a/trc/transformer/MultiHeadTester.py +++ b/trc/transformer/MultiHeadTester.py @@ -24,7 +24,8 @@ __all__ = [ "MultiHeadAttentionX", "FeedForwardX", "TransformerEncoderLayerX", - "_LayerNormX" + "_LayerNormX", + "TransformerDecoderLayerX" ] @@ -44,7 +45,7 @@ class _LayerNormX(Cell): Tensor of shape :math:`(batch, seq_length, hidden_size)`. 
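`param.h` above layers `encoderParamT`/`decoderParamT` on top of a shared `ParamT` base and embeds the attention parameters by value (`attn`, `attn1`, `attn2`), which is what lets `forwardEncoder` hand `&(param->attn)` straight to `forward_attn`. A condensed sketch of that layout keeping only the control-flow fields; names are capitalized here, the handles (`cublasHandle_t`, `cudaStream_t`, algo), the duplicated `padding_offset` members and the workspace bookkeeping are omitted, and the defaults are illustrative.

```cpp
#include <cstddef>

struct ParamT {
  std::size_t batch_size, src_seq_len, tgt_seq_len;
  std::size_t head_num, head_size, hidden_size, ffn_hidden_size, h_token_num;
  int in_idx = 0;        // cursor into the flat inputs[] array
  bool has_bias = true;
};

struct AttentionParamT : ParamT {
  bool qkv_bias = true;
  bool projection_bias = true;
  bool is_cross = false;
  bool position_bias = false;
};

struct EncoderParamT : ParamT {
  bool ffn_fp16 = false;
  bool layernorm_post = false;
  bool has_beta = true;
  float eps1 = 1e-5f, eps2 = 1e-5f;
  AttentionParamT attn;   // passed as &param.attn to the attention forward
};

struct DecoderParamT : ParamT {
  bool ffn_fp16 = false;
  bool layernorm_post = false;
  bool has_beta = true;
  float eps1 = 1e-5f, eps2 = 1e-5f, eps3 = 1e-5f;
  AttentionParamT attn1;  // self-attention
  AttentionParamT attn2;  // cross-attention
};
```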
""" - def __init__(self, normalized_shape, eps=1e-4, param_init_type=mstype.float32, is_self_defined=True): + def __init__(self, normalized_shape, eps=1e-4, param_init_type=mstype.float32, is_self_defined=False): super(_LayerNormX, self).__init__() if param_init_type not in [mstype.float32, mstype.float16]: raise TypeError("The type of parameter 'param_init_type' should in [float32, float16], " @@ -540,39 +541,39 @@ class MultiHeadAttentionX(Cell): # # key and value for current token(s) key_present = key value_present = value - # if self.use_past: - # # The first graph with the input size of (bs, seq_length) - # if self.is_first_iteration: - # # Get the valid input length without padding - # valid_length_vector = F.cast(self.less(self.range, batch_valid_length.view(-1, 1, 1)), self.dtype) - # # Cover the key and value numbers corresponding to the padding position - # key_present = self.mul1(key, self.expand_dims(valid_length_vector, 2)) - # value_present = self.mul1(value, self.expand_dims(valid_length_vector, 3)) - # # The second graph with the inpus size of (bs, 1) - # # the shape of query is (bs, num_heads, 1, size_per_head) - # # the shape of key is (bs, num_heads, size_per_head, 1) - # # the shape of value is (bs, num_heads, 1, size_per_head) - # else: - # # Get the current token position index - # valid_length = self.reducesum(F.cast(self.not_equal(self.slice(key_past, (0, 0, 0, 0), - # (F.shape(key_tensor)[0], 1, 1, - # self.src_seq_length), - # (1, 1, 1, 1)), - # 0), mstype.float32), (1, 2, 3)) - # valid_length = F.reshape(valid_length, (-1, 1, 1)) - # valid_length_vector = F.cast(self.equal(valid_length, self.range), self.dtype) - # # Pad the key and value to seq_length with only the position index not zero - # current_key = self.mul1(self.tile(key, (1, 1, 1, self.seq_length)), - # self.expand_dims(valid_length_vector, 2)) - # current_value = self.mul1(self.tile(value, (1, 1, self.seq_length, 1)), - # self.expand_dims(valid_length_vector, 3)) - # # Concat the previous saved state and current state - # key = self.add(key_past, current_key) - # value = self.add(value_past, current_value) - # # Update key_present and value_present for state update - # key_present = key - # value_present = value - # attention_mask = F.reshape(self.attention_mask, (self.seq_length, self.seq_length, 1, 1)) + if self.use_past: + # The first graph with the input size of (bs, seq_length) + if self.is_first_iteration: + # Get the valid input length without padding + valid_length_vector = F.cast(self.less(self.range, batch_valid_length.view(-1, 1, 1)), self.dtype) + # Cover the key and value numbers corresponding to the padding position + key_present = self.mul1(key, self.expand_dims(valid_length_vector, 2)) + value_present = self.mul1(value, self.expand_dims(valid_length_vector, 3)) + # The second graph with the inpus size of (bs, 1) + # the shape of query is (bs, num_heads, 1, size_per_head) + # the shape of key is (bs, num_heads, size_per_head, 1) + # the shape of value is (bs, num_heads, 1, size_per_head) + else: + # Get the current token position index + valid_length = self.reducesum(F.cast(self.not_equal(self.slice(key_past, (0, 0, 0, 0), + (F.shape(key_tensor)[0], 1, 1, + self.src_seq_length), + (1, 1, 1, 1)), + 0), mstype.float32), (1, 2, 3)) + valid_length = F.reshape(valid_length, (-1, 1, 1)) + valid_length_vector = F.cast(self.equal(valid_length, self.range), self.dtype) + # Pad the key and value to seq_length with only the position index not zero + current_key = self.mul1(self.tile(key, (1, 1, 1, 
self.seq_length)), + self.expand_dims(valid_length_vector, 2)) + current_value = self.mul1(self.tile(value, (1, 1, self.seq_length, 1)), + self.expand_dims(valid_length_vector, 3)) + # Concat the previous saved state and current state + key = self.add(key_past, current_key) + value = self.add(value_past, current_value) + # Update key_present and value_present for state update + key_present = key + value_present = value + attention_mask = F.reshape(self.attention_mask, (self.seq_length, self.seq_length, 1, 1)) layer_present = (key_present, value_present) # # multi head attention considering attention mask @@ -857,9 +858,9 @@ class FeedForwardX(Cell): param_init_type=mstype.float32, parallel_config=default_dpmp_config): super(FeedForwardX, self).__init__() - if hidden_act is None or not (isinstance(hidden_act, str) or issubclass(hidden_act, nn.Cell)): - raise TypeError(f"For FeedForward cell, the hidden_act should str type or nn.Cell type, " - f"but got {hidden_act}.") + # if hidden_act is None or not (isinstance(hidden_act, str) or issubclass(hidden_act, nn.Cell)): + # raise TypeError(f"For FeedForward cell, the hidden_act should str type or nn.Cell type, " + # f"but got {hidden_act}.") if _get_parallel_mode() in (ParallelMode.AUTO_PARALLEL,) and _is_sharding_propagation(): _check_config(parallel_config) mp = parallel_config.model_parallel @@ -886,7 +887,7 @@ class FeedForwardX(Cell): # Project to ffn_hidden_size self.mapping = _Linear(in_channels=input_size, out_channels=output_size, - activation=hidden_act, + activation=None, transpose_b=False, # expert_num=expert_num, # expert_group_size=expert_group_size, @@ -1309,6 +1310,7 @@ class TransformerEncoderLayerX(Cell): else: input_x = self.layernorm1(x) input_x = F.cast(input_x, self.dtype) + # indicate whether reset saved states key_reset = None value_reset = None @@ -1415,3 +1417,462 @@ class TransformerEncoderLayerX(Cell): _check_input_dtype(F.dtype(batch_valid_length), "batch_valid_length", [mstype.int32], self.cls_name) return True +class TransformerDecoderLayerX(Cell): + r""" + Transformer Decoder Layer. This is an implementation of the single layer of the transformer + decoder layer, including self-attention, cross attention and feedward layer. When the encoder_output is None, + the cross attention will not be effective. + + Args: + hidden_size(int): The hidden size of the input. + ffn_hidden_size(int): The hidden size of bottleneck in the feedforward layer. + num_heads(int): The number of the heads. + batch_size(int): The batch size of the input tensor. + src_seq_length(int): The input source sequence length. + tgt_seq_length(int): The input target sequence length. + attention_dropout_rate(float): The dropout rate of the attention scores. Default:0.1. + hidden_dropout_rate(float): The dropout rate of the final output of the layer. Default:0.1. + post_layernorm_residual(bool): Do residuals adds before the layernorm. Default False. + use_past(bool): Use the past state to compute, used for incremental prediction. Default False. + layernorm_compute_type(dtype.Number): The computation type of the layernorm. + Should be dtype.float32 or dtype.float16. Default dtype.float32. + softmax_compute_type(dtype.Number): The computation type of the softmax in the attention. + Should be dtype.float32 or dtype.float16. Default mstype.float32. + param_init_type(dtype.Number): The parameter initialization type of the module. + Should be dtype.float32 or dtype.float16. Default dtype.float32. 
+ hidden_act(str): The activation of the internal feedforward layer. Supports 'relu', + 'relu6', 'tanh', 'gelu', 'fast_gelu', 'elu', 'sigmoid', 'prelu', 'leakyrelu', 'hswish', + 'hsigmoid', 'logsigmoid' and so on. Default: gelu. + moe_config(MoEConfig): The configuration of MoE (Mixture of Expert). Default is an instance of MoEConfig + with default values. Please see `MoEConfig`. + parallel_config(OpParallelConfig, MoEParallelConfig): The parallel configure. When MoE is applied, + MoEParallelConfig is effective, otherwise OpParallelConfig is effective. Default `default_dpmp_config`, + an instance of `OpParallelConfig` with default args. + + Inputs: + - **hidden_stats** (Tensor) - The input tensor with shape [batch_size, tgt_seq_length, hidden_size] or + [batch_size * tgt_seq_length, hidden_size]. + - **decoder_mask** (Tensor) - The attention mask for decoder with shape [batch_size, src_seq_length, + seq_length]. + - **encoder_output** (Tensor) - The output of the encoder with shape [batch_size, seq_length, hidden_size] + or [batch_size * seq_length, hidden_size]. + Note this args can not be passed by None when the net is in outermost layer. Default None. + - **memory_mask** (Tensor) - The memory mask of the cross attention with shape [batch, tgt_seq_length, + src_seq_length] where tgt_seq_length is the length of the decoder. Note this args can not be passed by + None when the net is in outermost layer. Default None. + - **init_reset** (Tensor) - A bool tensor with shape [1], used to clear the past key parameter and + past value parameter used in the incremental prediction. Only valid when use_past is True. Default True. + - **batch_valid_length** (Tensor) - Int32 tensor with shape [batch_size] the past calculated the index. + Used for incremental prediction when the use_past is True. Default None. + + Outputs: + Tuple, a tuple contains(`output`, `layer_present`) + + - **output** (Tensor) - The output logit of this layer. The shape is [batch, seq_length, hidden_size] or + [batch * seq_length, hidden_size]. + - **layer_present** (Tuple) - A tuple, where each tuple is the tensor of the projected key and value + vector in self attention with shape ((batch_size, num_heads, size_per_head, tgt_seq_length), + (batch_size, num_heads, tgt_seq_length, size_per_head), and of the projected key and value vector + in cross attention with shape (batch_size, num_heads, size_per_head, src_seq_length), + (batch_size, num_heads, src_seq_length, size_per_head)). + + Supported Platforms: + ``Ascend`` ``GPU`` + + Examples: + >>> import numpy as np + >>> from mindspore import dtype as mstype + >>> from mindspore.nn.transformer import TransformerDecoderLayer + >>> from mindspore import Tensor + >>> model = TransformerDecoderLayer(batch_size=2, hidden_size=64, ffn_hidden_size=64, num_heads=2, + ... 
src_seq_length=20, tgt_seq_length=10) + >>> encoder_input_value = Tensor(np.ones((2, 20, 64)), mstype.float32) + >>> decoder_input_value = Tensor(np.ones((2, 10, 64)), mstype.float32) + >>> decoder_input_mask = Tensor(np.ones((2, 10, 10)), mstype.float16) + >>> memory_mask = Tensor(np.ones((2, 10, 20)), mstype.float16) + >>> output, past = model(decoder_input_value, decoder_input_mask, encoder_input_value, memory_mask) + >>> print(output.shape) + (2, 10, 64) + >>> print(past[0].shape) + (2, 2, 32, 10) + >>> print(past[1].shape) + (2, 2, 10, 32) + >>> print(past[2].shape) + (2, 2, 32, 20) + >>> print(past[3].shape) + (2, 2, 20, 32) + """ + # @_LogActionOnce(logger=logger, key='TransformerDecoderLayer', + # no_warning=_get_parallel_mode() in (ParallelMode.STAND_ALONE,)) + @_args_type_validator_check(batch_size=Validator.check_positive_int, + hidden_size=Validator.check_positive_int, + num_heads=Validator.check_positive_int, + ffn_hidden_size=Validator.check_positive_int, + src_seq_length=Validator.check_positive_int, + tgt_seq_length=Validator.check_positive_int, + attention_dropout_rate=Validator.check_non_negative_float, + hidden_dropout_rate=Validator.check_non_negative_float, + hidden_act=_valid_type_checks([str], "TransformerDecoderLayer"), + post_layernorm_residual=Validator.check_bool, + layernorm_compute_type=_valid_value_checks([mstype.float32, mstype.float16], + "TransformerDecoderLayer"), + softmax_compute_type=_valid_value_checks([mstype.float32, mstype.float16], + "TransformerDecoderLayer"), + param_init_type=_valid_value_checks([mstype.float32, mstype.float16], + "TransformerDecoderLayer"), + parallel_config=_valid_type_checks([OpParallelConfig, MoEParallelConfig], + "TransformerDecoderLayer"), + use_past=Validator.check_bool) + def __init__(self, hidden_size, + ffn_hidden_size, + num_heads, + batch_size, + src_seq_length, + tgt_seq_length, + attention_dropout_rate=0.1, + hidden_dropout_rate=0.1, + post_layernorm_residual=False, + use_past=False, + layernorm_compute_type=mstype.float32, + softmax_compute_type=mstype.float32, + param_init_type=mstype.float32, + hidden_act='gelu', + moe_config=default_moe_config, + parallel_config=default_dpmp_config): + super(TransformerDecoderLayerX, self).__init__() + _check_moe_config(moe_config, parallel_config) + self.use_moe = (moe_config.expert_num > 1) + config_to_attention = parallel_config.dpmp if self.use_moe else parallel_config + if _get_parallel_mode() in (ParallelMode.AUTO_PARALLEL,) and _is_sharding_propagation(): + _check_config(parallel_config) + if num_heads % parallel_config.model_parallel != 0: + raise ValueError("For 'TransformerDecoderLayer', the class variable 'num_heads' must be divisibled by " + "'parallel_config.model_parallel', but got the num_heads is {} and " + "parallel_config.model_parallel is {}.".format(num_heads, + parallel_config.model_parallel)) + if hidden_size % parallel_config.model_parallel != 0: + raise ValueError( + "For 'TransformerDecoderLayer', the class variable 'hidden_size' must be divisibled by " + "'parallel_config.model_parallel', but got the hidden_size is {} and " + "parallel_config.model_parallel is {}." + .format(hidden_size, parallel_config.model_parallel)) + if ffn_hidden_size % parallel_config.model_parallel != 0: + raise ValueError("For 'TransformerDecoderLayer', the class variable 'ffn_hidden_size' must be " + "divisibled by 'parallel_config.model_parallel', but got the ffn_hidden_size is {} " + "and parallel_config.model_parallel is {}." 
+ .format(ffn_hidden_size, parallel_config.model_parallel)) + if use_past: + raise ValueError(f"The {self.cls_name} does not support use_past=True.") + self.batch_size = batch_size + self.use_past = use_past + self.softmax_compute_type = softmax_compute_type + + self.src_seq_length = src_seq_length + self.tgt_seq_length = tgt_seq_length + self.use_past = use_past + self.hidden_size = hidden_size + + self.layernorm1 = _LayerNormX((hidden_size,)).to_float(layernorm_compute_type) + self.layernorm2 = _LayerNormX((hidden_size,)).to_float(layernorm_compute_type) + self.attention = MultiHeadAttentionX(hidden_size=hidden_size, + num_heads=num_heads, + batch_size=batch_size, + src_seq_length=tgt_seq_length, + tgt_seq_length=tgt_seq_length, + hidden_dropout_rate=hidden_dropout_rate, + attention_dropout_rate=attention_dropout_rate, + use_past=use_past, + softmax_compute_type=softmax_compute_type, + param_init_type=param_init_type, + parallel_config=config_to_attention) + + # Cross attention with the output of encoder as memory tensor + self.cross_attention = MultiHeadAttentionX(hidden_size=hidden_size, + num_heads=num_heads, + batch_size=batch_size, + src_seq_length=tgt_seq_length, + tgt_seq_length=src_seq_length, + hidden_dropout_rate=hidden_dropout_rate, + attention_dropout_rate=attention_dropout_rate, + softmax_compute_type=softmax_compute_type, + use_past=use_past, + param_init_type=param_init_type, + parallel_config=config_to_attention) + self.cross_attention_layernorm = _LayerNormX((hidden_size,)).to_float( + layernorm_compute_type) + + if self.use_moe: + self.output = MoE(hidden_size=hidden_size, + dropout_rate=hidden_dropout_rate, + ffn_hidden_size=ffn_hidden_size, + param_init_type=param_init_type, + hidden_act=hidden_act, + moe_config=moe_config, + parallel_config=parallel_config) + else: + # Feed Forward Network, FFN + self.output = FeedForwardX(hidden_size=hidden_size, + dropout_rate=hidden_dropout_rate, + ffn_hidden_size=ffn_hidden_size, + hidden_act=hidden_act, + param_init_type=param_init_type, + parallel_config=parallel_config) + self.post_layernorm_residual = post_layernorm_residual + self.add = P.Add() + self.add_3d = P.Add() + self.dtype = mstype.float16 + self.key_past = None + self.value_past = None + if self.use_past: + # operator used for state reuse + self.reducesum = P.ReduceSum().shard(((1, 1, 1, 1),)) + self.not_equal = P.NotEqual().shard(((1, 1, 1, 1), ())) + self.slice = P.StridedSlice().shard(((1, 1, 1, 1),)) + size_per_head = hidden_size // num_heads + self.key_shape = (batch_size, num_heads, size_per_head, tgt_seq_length) + self.value_shape = (batch_size, num_heads, tgt_seq_length, size_per_head) + # parameters saving key and value states + self.key_past = Parameter(Tensor(np.zeros(shape=self.key_shape), self.dtype), name="key_past") + self.value_past = Parameter(Tensor(np.zeros(shape=self.value_shape), self.dtype), name="value_past") + self.tile = P.Tile().shard(((1, 1),)) + self.mul = P.Mul().shard(((1, 1, 1, 1), (1,))) + self.assign = P.Assign().shard(((1, 1, 1, 1), (1, 1, 1, 1))) + elif _get_parallel_mode() not in (ParallelMode.AUTO_PARALLEL,): + _check_config(parallel_config) + if num_heads % parallel_config.model_parallel != 0: + raise ValueError("For 'TransformerDecoderLayer', the class variable 'num_heads' must be divisibled by " + "'parallel_config.model_parallel', but got the num_heads is {} and " + "parallel_config.model_parallel is {}.".format(num_heads, + parallel_config.model_parallel)) + if hidden_size % parallel_config.model_parallel != 0: + raise 
ValueError( + "For 'TransformerDecoderLayer', the class variable 'hidden_size' must be divisibled by " + "'parallel_config.model_parallel', but got the hidden_size is {} and " + "parallel_config.model_parallel is {}." + .format(hidden_size, parallel_config.model_parallel)) + if ffn_hidden_size % parallel_config.model_parallel != 0: + raise ValueError("For 'TransformerDecoderLayer', the class variable 'ffn_hidden_size' must be " + "divisibled by 'parallel_config.model_parallel', but got the ffn_hidden_size is {} " + "and parallel_config.model_parallel is {}." + .format(ffn_hidden_size, parallel_config.model_parallel)) + if use_past: + raise ValueError(f"The {self.cls_name} does not support use_past=True.") + self.batch_size = batch_size + self.use_past = use_past + self.softmax_compute_type = softmax_compute_type + + self.src_seq_length = src_seq_length + self.tgt_seq_length = tgt_seq_length + self.use_past = use_past + self.hidden_size = hidden_size + + self.layernorm1 = _LayerNormX((hidden_size,)).to_float(layernorm_compute_type) + self.layernorm1.shard(((parallel_config.data_parallel, 1),)) + self.layernorm2 = _LayerNormX((hidden_size,)).to_float(layernorm_compute_type) + self.layernorm2.shard(((parallel_config.data_parallel, 1),)) + self.attention = MultiHeadAttentionX(hidden_size=hidden_size, + num_heads=num_heads, + batch_size=batch_size, + src_seq_length=tgt_seq_length, + tgt_seq_length=tgt_seq_length, + hidden_dropout_rate=hidden_dropout_rate, + attention_dropout_rate=attention_dropout_rate, + use_past=use_past, + softmax_compute_type=softmax_compute_type, + param_init_type=param_init_type, + parallel_config=config_to_attention) + + # Cross attention with the output of encoder as memory tensor + self.cross_attention = MultiHeadAttentionX(hidden_size=hidden_size, + num_heads=num_heads, + batch_size=batch_size, + src_seq_length=tgt_seq_length, + tgt_seq_length=src_seq_length, + hidden_dropout_rate=hidden_dropout_rate, + attention_dropout_rate=attention_dropout_rate, + softmax_compute_type=softmax_compute_type, + use_past=use_past, + param_init_type=param_init_type, + parallel_config=config_to_attention) + self.cross_attention_layernorm = _LayerNormX((hidden_size,)).to_float( + layernorm_compute_type) + self.cross_attention_layernorm.shard(((parallel_config.data_parallel, 1),)) + + if self.use_moe: + self.output = MoE(hidden_size=hidden_size, + dropout_rate=hidden_dropout_rate, + ffn_hidden_size=ffn_hidden_size, + param_init_type=param_init_type, + hidden_act=hidden_act, + moe_config=moe_config, + parallel_config=parallel_config) + else: + # Feed Forward Network, FFN + self.output = FeedForwardX(hidden_size=hidden_size, + dropout_rate=hidden_dropout_rate, + ffn_hidden_size=ffn_hidden_size, + hidden_act=hidden_act, + param_init_type=param_init_type, + parallel_config=parallel_config) + self.post_layernorm_residual = post_layernorm_residual + self.add = P.Add().shard(((parallel_config.data_parallel, 1), (parallel_config.data_parallel, 1))) + self.add_3d = P.Add().shard(((parallel_config.data_parallel, 1, 1), (parallel_config.data_parallel, 1, 1))) + self.dtype = mstype.float16 + self.key_past = None + self.value_past = None + if self.use_past: + # operator used for state reuse + self.reducesum = P.ReduceSum().shard(((1, 1, 1, 1),)) + self.not_equal = P.NotEqual().shard(((1, 1, 1, 1), ())) + self.slice = P.StridedSlice().shard(((1, 1, 1, 1),)) + size_per_head = hidden_size // num_heads + self.key_shape = (batch_size, num_heads, size_per_head, tgt_seq_length) + self.value_shape = 
(batch_size, num_heads, tgt_seq_length, size_per_head) + # parameters saving key and value states + self.key_past = Parameter(Tensor(np.zeros(shape=self.key_shape), self.dtype), name="key_past") + self.value_past = Parameter(Tensor(np.zeros(shape=self.value_shape), self.dtype), name="value_past") + self.tile = P.Tile().shard(((1, 1),)) + self.mul = P.Mul().shard(((1, 1, 1, 1), (1,))) + self.assign = P.Assign().shard(((1, 1, 1, 1), (1, 1, 1, 1))) + else: + raise RuntimeError(f"The {self.cls_name} only support sharding propagation or " + f"semi-auto parallel mode now.") + + def construct(self, hidden_stats, + decoder_mask, + encoder_output=None, + memory_mask=None, + init_reset=True, batch_valid_length=None): + self._check_input(hidden_stats, decoder_mask, encoder_output, memory_mask, init_reset, batch_valid_length) + # the returned shape is [bs, seq_length, embedding_size] or [bs * seq_length, embedding_size] + hidden_shape = F.shape(hidden_stats) + hidden_stats = F.reshape(hidden_stats, (-1, hidden_shape[-1])) + input_x = self.layernorm1(hidden_stats) + + input_x = F.cast(input_x, self.dtype) + # indicate whether reset saved states + key_reset = None + value_reset = None + if self.use_past: + # reset states, init_reset True for reuse and False for reset + key_reset = self.assign(self.key_past, self.mul(self.key_past, F.cast(init_reset, self.dtype))) + value_reset = self.assign(self.value_past, self.mul(self.value_past, F.cast(init_reset, self.dtype))) + # add dependency for desired execution order + input_x = F.depend(input_x, key_reset) + input_x = F.depend(input_x, value_reset) + + attention, layer_present = self.attention(input_x, input_x, input_x, decoder_mask, self.key_past, + self.value_past, batch_valid_length) + # For post-layernorm the inputs for residual path are output of self-attention and output of layernorm + if self.post_layernorm_residual: + x = self.add(input_x, attention) + # For pre-layernorm the inputs for residual path are output of self-attention and input of this layer + else: + x = self.add(hidden_stats, attention) + middle_output = None + cross_attn_output = None + if encoder_output is not None: + middle_output = self.cross_attention_layernorm(x) + middle_output = F.cast(middle_output, self.dtype) + encoder_output = F.cast(encoder_output, self.dtype) + cross_attn_output, cross_layer_present = self.cross_attention(middle_output, encoder_output, + encoder_output, + memory_mask, self.key_past, + self.value_past, batch_valid_length) + layer_present += cross_layer_present + if self.post_layernorm_residual: + x = self.add(middle_output, cross_attn_output) + else: + x = self.add(x, cross_attn_output) + + output_x = self.layernorm2(x) + output_x = F.cast(output_x, self.dtype) + aux_loss = None + if self.use_moe: + mlp_logit, aux_loss = self.output(output_x) + else: + mlp_logit = self.output(output_x) + # return mlp_logit + + value_update = None + key_update = None + if self.use_past: + # current key and value + key_present, value_present = layer_present + # update key and value calculated this step + key_update = self.assign(self.key_past, key_present) + value_update = self.assign(self.value_past, value_present) + # add dependency for desired execution order + key_update = F.depend(key_update, key_reset) + value_update = F.depend(value_update, value_reset) + + # add dependency for desired execution order + mlp_logit = F.depend(mlp_logit, value_update) + mlp_logit = F.depend(mlp_logit, key_update) + + # if shape is 3d, we reshape the inputs of the add + if len(hidden_shape) 
== 3: + output_x = P.Reshape()(output_x, hidden_shape) + mlp_logit = P.Reshape()(mlp_logit, hidden_shape) + x = P.Reshape()(x, hidden_shape) + + if self.post_layernorm_residual: + output = self.add_3d(output_x, mlp_logit) + else: + output = self.add_3d(x, mlp_logit) + else: + if self.post_layernorm_residual: + output = self.add(output_x, mlp_logit) + else: + output = self.add(x, mlp_logit) + output = F.reshape(output, hidden_shape) + + if self.use_moe: + return output#, layer_present, aux_loss + return output#, layer_present + + def _check_input(self, hidden_states, attention_mask, encoder_output, memory_mask, init_reset, batch_valid_length): + r"""Check inputs""" + if not self.use_past or (self.use_past and self.is_first_iteration): + _check_shape_equal(F.shape(hidden_states), "hidden_states", self.cls_name, + [[self.batch_size, self.tgt_seq_length, self.hidden_size], + [self.batch_size * self.tgt_seq_length, self.hidden_size]]) + _check_shape_equal(F.shape(attention_mask), "attention_mask", self.cls_name, + [self.batch_size, self.tgt_seq_length, self.tgt_seq_length]) + + else: + _check_shape_equal(F.shape(hidden_states), "hidden_states", self.cls_name, + [self.batch_size, 1, self.hidden_size]) + _check_shape_equal(F.shape(attention_mask), "attention_mask", self.cls_name, + [self.batch_size, 1, self.tgt_seq_length]) + _check_input_dtype(F.dtype(hidden_states), "hidden_states", [mstype.float32, mstype.float16], self.cls_name) + _check_input_dtype(F.dtype(attention_mask), "attention_mask", [mstype.float32, mstype.float16], self.cls_name) + if encoder_output is not None: + _check_shape_equal(F.shape(encoder_output), "encoder_output", self.cls_name, + [[self.batch_size, self.src_seq_length, self.hidden_size], + [self.batch_size * self.src_seq_length, self.hidden_size]]) + _check_input_dtype(F.dtype(encoder_output), "encoder_output", + [mstype.float32, mstype.float16], self.cls_name) + if memory_mask is not None: + _check_shape_equal(F.shape(memory_mask), "memory_mask", self.cls_name, + [self.batch_size, self.tgt_seq_length, self.src_seq_length]) + _check_input_dtype(F.dtype(memory_mask), "memory_mask", + [mstype.float32, mstype.float16], self.cls_name) + + init_reset_is_tensor = isinstance(init_reset, Tensor) + init_reset_is_default = init_reset is True + batch_valid_length_is_tensor = isinstance(batch_valid_length, Tensor) + batch_is_default = batch_valid_length is None + _check_past_none_input_none(self.use_past, "init_reset", self.cls_name, True, init_reset_is_tensor, + init_reset_is_default) + _check_past_none_input_none(self.use_past, "batch_valid_length", self.cls_name, None, + batch_valid_length_is_tensor, batch_is_default) + + if self.use_past: + _check_shape_equal(F.shape(init_reset), "init_reset", self.cls_name, [1]) + _check_input_dtype(F.dtype(init_reset), "init_reset", [mstype.bool_], self.cls_name) + _check_shape_equal(F.shape(batch_valid_length), "batch_valid_length", self.cls_name, [self.batch_size]) + _check_input_dtype(F.dtype(batch_valid_length), "batch_valid_length", [mstype.int32], self.cls_name) + return True + + diff --git a/trc/transformer/T5/transformer.py b/trc/transformer/T5/transformer.py index a369b9d0aeddd4c9f8d0d8330dd993fbd2320c38..84d11f1298dda18a2b79553243b344d6907f2629 100644 --- a/trc/transformer/T5/transformer.py +++ b/trc/transformer/T5/transformer.py @@ -407,7 +407,7 @@ class FeedForward(Cell): @_args_type_validator_check(hidden_size=Validator.check_positive_int, ffn_hidden_size=Validator.check_positive_int, 
dropout_rate=Validator.check_non_negative_float, - hidden_act=_valid_type_checks([str], "FeedForward"), + # hidden_act=_valid_type_checks([str], "FeedForward"), param_init_type=_valid_value_checks([mstype.float32, mstype.float16], "FeedForward"), parallel_config=_valid_type_checks([OpParallelConfig], @@ -415,13 +415,14 @@ class FeedForward(Cell): def __init__(self, hidden_size, ffn_hidden_size, dropout_rate, - hidden_act='gelu', + hidden_act=None, has_bias=True, expert_num=1, param_init_type=mstype.float32, parallel_config=default_dpmp_config): super(FeedForward, self).__init__() _check_config(parallel_config) + self.dtype = param_init_type dp = parallel_config.data_parallel mp = parallel_config.model_parallel if ffn_hidden_size % mp != 0: @@ -480,7 +481,7 @@ class FeedForward(Cell): def construct(self, x): _check_input_shape(F.shape(x), "x", self.cls_name, [2, 3]) _check_input_dtype(F.dtype(x), "x", [mstype.float32, mstype.float16], self.cls_name) - x = self.cast(x, mstype.float16) + x = self.cast(x, self.dtype) # returned shape is [bs, seq_length, ffn_hidden_size] or [bs * seq_length, ffn_hidden_size] hidden = self.mapping(x) output = self.projection(hidden) @@ -794,7 +795,6 @@ class MultiHeadAttention(Cell): tgt_seq_length, hidden_size, num_heads, - # app, hidden_dropout_rate=0.1, attention_dropout_rate=0.1, compute_dtype=mstype.float32, @@ -804,7 +804,8 @@ class MultiHeadAttention(Cell): use_past=False, is_decoder=False, has_relative_attention_bias=False, - parallel_config=default_dpmp_config): + parallel_config=default_dpmp_config, + num_outputs=1): super(MultiHeadAttention, self).__init__() _check_config(parallel_config) self.is_parallel_mode = _get_parallel_mode() in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL) @@ -812,7 +813,7 @@ class MultiHeadAttention(Cell): self.tgt_seq_length = tgt_seq_length self.hidden_size = hidden_size self.batch_size = batch_size - # self.app=app + self.num_outputs = num_outputs if hidden_dropout_rate < 0 or hidden_dropout_rate >= 1: raise ValueError("For 'MultiHeadAttention', the class variable 'hidden_dropout_rate' must be " "in range [0, 1.0), but got the value : {}.".format(hidden_dropout_rate)) @@ -1017,11 +1018,11 @@ class MultiHeadAttention(Cell): output = self.projection(attention) output = self.dropout(output) output = F.reshape(output, ori_shape) - # if self.app=="trc": - # return output, layer_present, position_bias + if self.num_outputs==1: + return output + return output, layer_present, position_bias # else: - # return output - return output + # return output def _check_inputs(self, query_tensor, key_tensor, value_tensor, attention_mask, key_past=None, value_past=None, batch_valid_length=None): @@ -1322,7 +1323,7 @@ class TransformerEncoderLayer(Cell): seq_length=Validator.check_positive_int, attention_dropout_rate=Validator.check_non_negative_float, hidden_dropout_rate=Validator.check_non_negative_float, - hidden_act=_valid_type_checks([str], "TransformerEncoderLayer"), + # hidden_act=_valid_type_checks([str], "TransformerEncoderLayer"), post_layernorm_residual=Validator.check_bool, layernorm_compute_type=_valid_value_checks([mstype.float32, mstype.float16], "TransformerEncoderLayer"), @@ -1388,7 +1389,8 @@ class TransformerEncoderLayer(Cell): use_past=use_past, is_decoder=False, has_relative_attention_bias=has_relative_attention_bias, - parallel_config=parallel_config) + parallel_config=parallel_config, + num_outputs=3) _check_moe_config(moe_config, parallel_config) self.use_moe = (moe_config.expert_num > 1) if self.use_moe 
is True: @@ -1430,7 +1432,7 @@ class TransformerEncoderLayer(Cell): self.mul = P.Mul().shard(((1, 1, 1, 1), (1,))) self.assign = P.Assign().shard(((1, 1, 1, 1), (1, 1, 1, 1))) - def construct(self, x, input_mask, init_reset=True, batch_valid_length=None, position_bias=None): + def construct(self, x, input_mask, position_bias=None, init_reset=True, batch_valid_length=None): self._check_input(x, input_mask, init_reset, batch_valid_length) x_shape = F.shape(x) x = F.reshape(x, (-1, x_shape[-1])) @@ -1449,8 +1451,8 @@ class TransformerEncoderLayer(Cell): input_x = F.depend(input_x, key_reset) input_x = F.depend(input_x, value_reset) - attention, layer_present, position_bias = self.attention(input_x, input_x, input_x, input_mask, - self.key_past, self.value_past, batch_valid_length, position_bias) + attention, layer_present, position_bias = self.attention(input_x, input_x, input_x, input_mask, position_bias, + self.key_past, self.value_past, batch_valid_length) # For post-layernorm the inputs for residual path are output of self-attention and output of layernorm if self.post_layernorm_residual: x = self.add(input_x, attention) @@ -1465,6 +1467,7 @@ class TransformerEncoderLayer(Cell): mlp_logit, aux_loss = self.output(output_x) else: mlp_logit = self.output(output_x) + # return mlp_logit value_update = None key_update = None @@ -1500,8 +1503,8 @@ class TransformerEncoderLayer(Cell): output = F.reshape(output, x_shape) if self.use_moe is True: - return output, layer_present, aux_loss - return output, layer_present, position_bias + return output#, layer_present, aux_loss + return output#, layer_present, position_bias def _check_input(self, x, input_mask, init_reset, batch_valid_length): r"""Check inputs""" @@ -1628,7 +1631,7 @@ class TransformerDecoderLayer(Cell): tgt_seq_length=Validator.check_positive_int, attention_dropout_rate=Validator.check_non_negative_float, hidden_dropout_rate=Validator.check_non_negative_float, - hidden_act=_valid_type_checks([str], "TransformerDecoderLayer"), + # hidden_act=_valid_type_checks([str], "TransformerDecoderLayer"), post_layernorm_residual=Validator.check_bool, layernorm_compute_type=_valid_value_checks([mstype.float32, mstype.float16], "TransformerDecoderLayer"), @@ -1685,6 +1688,7 @@ class TransformerDecoderLayer(Cell): self.hidden_size = hidden_size self.layernorm1 = T5LayerNorm((hidden_size,)).to_float(layernorm_compute_type) + self.layernorm1.shard(((parallel_config.data_parallel, 1),)) self.layernorm2 = T5LayerNorm((hidden_size,)).to_float(layernorm_compute_type) self.layernorm2.shard(((parallel_config.data_parallel, 1),)) @@ -1702,7 +1706,8 @@ class TransformerDecoderLayer(Cell): param_init_type=param_init_type, is_decoder=True, has_relative_attention_bias=has_relative_attention_bias, - parallel_config=parallel_config) + parallel_config=parallel_config, + num_outputs=3) # Cross attention with the output of encoder as memory tensor self.cross_attention = MultiHeadAttention(hidden_size=hidden_size, num_heads=num_heads, @@ -1717,8 +1722,10 @@ class TransformerDecoderLayer(Cell): param_init_type=param_init_type, is_decoder=True, has_relative_attention_bias=has_relative_attention_bias, - parallel_config=parallel_config) + parallel_config=parallel_config, + num_outputs=3) self.cross_attention_layernorm = T5LayerNorm((hidden_size,)).to_float(layernorm_compute_type) + self.cross_attention_layernorm.shard(((parallel_config.data_parallel, 1),)) _check_moe_config(moe_config, parallel_config) self.use_moe = (moe_config.expert_num > 1) @@ -1742,7 +1749,7 @@ class 
TransformerDecoderLayer(Cell): self.post_layernorm_residual = post_layernorm_residual self.add = P.Add().shard(((parallel_config.data_parallel, 1), (parallel_config.data_parallel, 1))) self.add_3d = P.Add().shard(((parallel_config.data_parallel, 1, 1), (parallel_config.data_parallel, 1, 1))) - self.dtype = mstype.float16 + self.dtype = mstype.float32 self.key_past = None self.value_past = None if self.use_past: @@ -1764,15 +1771,16 @@ class TransformerDecoderLayer(Cell): decoder_mask, encoder_output=None, memory_mask=None, - init_reset=True, batch_valid_length=None, - position_bias=None, encoder_decoder_position_bias=None): - #self._check_input(hidden_stats, decoder_mask, encoder_output, memory_mask, init_reset, batch_valid_length) + position_bias=None, encoder_decoder_position_bias=None, + init_reset=True, batch_valid_length=None): + # self._check_input(hidden_stats, decoder_mask, encoder_output, memory_mask, init_reset, batch_valid_length) # the returned shape is [bs, seq_length, embedding_size] or [bs * seq_length, embedding_size] hidden_shape = F.shape(hidden_stats) hidden_stats = F.reshape(hidden_stats, (-1, hidden_shape[-1])) input_x = self.layernorm1(hidden_stats) input_x = F.cast(input_x, self.dtype) - + init_reset = True + batch_valid_length=None # indicate whether reset saved states key_reset = None value_reset = None @@ -1784,23 +1792,25 @@ class TransformerDecoderLayer(Cell): input_x = F.depend(input_x, key_reset) input_x = F.depend(input_x, value_reset) - attention, layer_present, position_bias = self.attention(input_x, input_x, input_x, decoder_mask, self.key_past, - self.value_past, batch_valid_length, position_bias) + attention, layer_present, position_bias = self.attention(input_x, input_x, input_x, decoder_mask, position_bias, self.key_past, + self.value_past, batch_valid_length) + # return attention # For post-layernorm the inputs for residual path are output of self-attention and output of layernorm if self.post_layernorm_residual: x = self.add(input_x, attention) # For pre-layernorm the inputs for residual path are output of self-attention and input of this layer else: x = self.add(hidden_stats, attention) - middle_output = None + cross_attn_output = None if encoder_output is not None: middle_output = self.cross_attention_layernorm(x) middle_output = F.cast(middle_output, self.dtype) cross_attn_output, cross_layer_present, encoder_decoder_position_bias = self.cross_attention(middle_output, encoder_output, encoder_output, - memory_mask, self.key_past, - self.value_past, batch_valid_length, encoder_decoder_position_bias) + memory_mask, encoder_decoder_position_bias, self.key_past, + self.value_past, batch_valid_length) + # return cross_attn_output layer_present += cross_layer_present if self.post_layernorm_residual: x = self.add(middle_output, cross_attn_output) @@ -1849,8 +1859,8 @@ class TransformerDecoderLayer(Cell): output = F.reshape(output, hidden_shape) if self.use_moe is True: - return output, layer_present, aux_loss - return output, layer_present, position_bias, encoder_decoder_position_bias + return output#, layer_present, aux_loss + return output#, layer_present, position_bias, encoder_decoder_position_bias def _check_input(self, hidden_states, attention_mask, encoder_output, memory_mask, init_reset, batch_valid_length): r"""Check inputs""" diff --git a/trc/transformer/build.sh b/trc/transformer/build.sh index e8b8e2e2c65c93b5f149f2a6698572a2e0a0f67a..72ddec2dc44955d8594f3c199179bc1d22030ac6 100755 --- a/trc/transformer/build.sh +++ b/trc/transformer/build.sh 
@@ -13,7 +13,7 @@ MSLITE_ENABLE_TESTCASES=off \ MSLITE_ENABLE_GPU=on \ MSLITE_MINDDATA_IMPLEMENT=full \ MSLITE_GPU_BACKEND=tensorrt \ -MSLITE_GPU_ARCH=60 \ +MSLITE_GPU_ARCH=80 \ TENSORRT_PATH=/usr/lib/x86_64-linux-gnu \ MSLIBS_SERVER=localHost \ ${base}/build.sh -I x86_64 $@ diff --git a/trc/transformer/cfg_bert.config b/trc/transformer/cfg_bert.config index 2f318d6c2a4a39ce9bb3e5f0f468558cb8bc4503..8f66f66f642dcf01e140a25443f2b329a3994fad 100755 --- a/trc/transformer/cfg_bert.config +++ b/trc/transformer/cfg_bert.config @@ -1,3 +1,3 @@ [gpu_context] -input_shape=input_ids:[1,128];token_type_ids:[1,128];input_mask:[1,128] +input_shape=input_ids:[1,20];token_type_ids:[1,20];input_mask:[1,20] diff --git a/trc/transformer/convert_fp32.sh b/trc/transformer/convert_fp32.sh index c0bb22e19b83340b212a144a5fdebe23ac146925..95c5bdcf6c48a5c1603b883865acd089d51e5eab 100755 --- a/trc/transformer/convert_fp32.sh +++ b/trc/transformer/convert_fp32.sh @@ -1,3 +1,4 @@ +#!/bin/bash base=`git rev-parse --show-toplevel` version=$(cat ${base}/version.txt) fusion=true @@ -9,8 +10,8 @@ while getopts "n" opt ; do echo "Unknown option ${opt}!" ;; esac done -if [ "${fusion}" = "true" ]; then - optimize="--optimizeTransformer=true" +if [ "${fusion}" = "false" ]; then + optimize="--optimizeTransformer=false" fi shift $(($OPTIND - 1)) file_name=$(basename $1) @@ -41,6 +42,5 @@ ${base}/trc/system_test/release/ubuntu_x86/mindspore-lite-${version}-linux-x64/t --modelFile=$1 \ --outputFile=${base}/trc/transformer/convv_${file_name} \ --configFile=${base}/trc/transformer/t.config \ - --encryption=false \ - ${optimize} + --encryption=false fi diff --git a/trc/transformer/ftBench.py b/trc/transformer/ftBench.py index a3cba44dd0cbea0b85c3591e61aa2c1caf038a62..cc0cfcebbb1a84e39f3683fa5b2211d455e8a1fd 100755 --- a/trc/transformer/ftBench.py +++ b/trc/transformer/ftBench.py @@ -13,13 +13,13 @@ system = f'{base}/trc/system_test/release/ubuntu_x86/mindspore-lite-{version}-li benchmark = f'{system}/tools/benchmark' work_dir=f'{base}/trc/transformer' image = "private_transformer:0.1" -server = "10.10.10.174" +server = "caspi" enable_fp16 = "false" suffix="fp32" usage='enter the correct parameters: app=ch\\trc, act=runtime\\be, loop count=int>=0, server=local\\num of server\nif app=trc and act=be loop count must be 1' app='ch' act='be' -cuda_visible_dev=3 +cuda_visible_dev=6 loop_count=1 if len(sys.argv)>2 or len(sys.argv)==1: parameters=sys.argv[1:] @@ -53,20 +53,26 @@ for i in range(len(parameters)) : print('loop count=',loop_count) inputs_file = open("models.txt") models_arg = inputs_file.readlines() -# import subprocess def find_output_name(ms_model, output_file): - output_name = os.popen(f"../readers/flatbuf/readfb {ms_model} -O").read() - print(output_name) - output_name = output_name[:-1] - print(output_name) + os.system(f"../readers/flatbuf/readfb {ms_model} > readmodel.txt") + file = open('readmodel.txt', 'r') + lines = file.readlines() + file.close() + line_of_output = [i for i,s in enumerate(lines) if "outputs:#" in s][0] + outputs = lines[line_of_output+1].split() + outputs_name=[] + for out in outputs: + output = [i for i,s in enumerate(lines) if "tensor #"+out in s][0] + output_name = lines[output+2].split()[2] + outputs_name.append(output_name) with open(output_file, 'r') as file: data = file.read() - for i,out in enumerate(output_name.split()): - print(out) - data = data.replace('output'+str(i+1), out) + for i,out1 in enumerate(outputs_name): + data = data.replace('output'+str(i+1), out1) with open(output_file, 'w') as 
file: file.write(data) - print(output_name) + print(outputs_name) +numcount=0 for line_model_arg in models_arg: if line_model_arg[0] == '#' or line_model_arg == '\n': continue line_model_arg=line_model_arg[:-1] @@ -92,13 +98,12 @@ for line_model_arg in models_arg: if batch_size!='1': model_name+=batch_size os.system(f"rm -f {base}/trc/transformer/{model_name}* {base}/trc/transformer/convv_{model_name}*") + os.system(f"cp /home/batya/git-proj/transformer_repo/transformer/models/t5/T5Transformer.py .") ret = os.system(f"docker run --user \"$(id -u):$(id -g)\" -w {base}/trc/transformer --runtime=nvidia -v {base}/../:{base}/../ -v /opt/share:/opt/share --privileged=true {image} python {base}/trc/transformer/train_transformer_export.py {line_model_arg} " ) ret=0 if ret != 0: exit() input_files='' output_file='' - # os.system(f"./convert_fp32.sh {model_name}_fwd.mindir") - # find_output_name(f'convv_{model_name}_fwd.mindir', f'{model_name}_output.txt') if app=='ch': ret=0 if act == 'be': @@ -113,19 +118,24 @@ for line_model_arg in models_arg: os.system('./trc/release.sh x86') os.system(f"cd {benchmark} && CUDA_VISIBLE_DEVICES={cuda_visible_dev} LD_LIBRARY_PATH={system}/runtime/lib:{system}/tools/converter/lib ./benchmark {benchmark_args}" ) else: - - with open(f'cfg_{model_name}.config','w') as f: - if model_name == 'bert': - f.write(f"[gpu_context]\ninput_shape=input_ids:[{batch_size},{seq}];token_type_ids:[{batch_size},{seq}];input_mask:[{batch_size},{seq}]") - elif model_name == 'transformer_encoder_layer': - f.write(f"[gpu_context]\ninput_shape=x:[{batch_size},{seq},{hidden_size}];input_mask:[{batch_size},{seq},{seq}]") + if model_name in ['bert','transformer_encoder_layer']: + with open(f'cfg_{model_name}.config','w') as f: + if model_name == 'bert': + f.write(f"[gpu_context]\ninput_shape=input_ids:[{batch_size},{seq}];token_type_ids:[{batch_size},{seq}];input_mask:[{batch_size},{seq}]") + elif model_name == 'transformer_encoder_layer': + f.write(f"[gpu_context]\ninput_shape=x:[{batch_size},{seq},{hidden_size}];input_mask:[{batch_size},{seq},{seq}]") os.system(f"ssh {server} 'rm -f {system}/../mindspore-lite-{version}-linux-x64.tar.gz {work_dir}/*{model_name}*'") os.system(f"ssh {server} 'mkdir -p {benchmark}'") os.system(f"rsync -v {system}/../mindspore-lite-{version}-linux-x64.tar.gz {server}:{system}/..") os.system(f"ssh {server} 'cd {system}/.. 
&& tar -xzf {system}/../mindspore-lite-{version}-linux-x64.tar.gz'") os.system(f"rsync -v {base}/trc/transformer/*{model_name}* {server}:{base}/trc/transformer/") os.system(f"./deploy.sh convv_{model_name}_fwd.mindir") - #os.system(f"ssh {server} 'cd {benchmark} && CUDA_VISIBLE_DEVICES={cuda_visible_dev} LD_LIBRARY_PATH={system}/runtime/lib:{system}/tools/converter/lib ./benchmark {benchmark_args}'" ) + os.system(f"./deploy.sh convv_{model_name}_fwd.mindir run") + # os.system(f"ssh {server} 'cd {benchmark} && CUDA_VISIBLE_DEVICES={cuda_visible_dev} LD_LIBRARY_PATH={system}/runtime/lib:{system}/tools/converter/lib ./benchmark {benchmark_args}'" ) + # os.system(f"mkdir {base}/trc/transformer/{model_name}{numcount}") + # os.system(f"cp {base}/trc/transformer/{model_name}* {base}/trc/transformer/{model_name}{numcount}/") + numcount+=1 + elif app=='trc': #if loop count =1 app=be else app = runtime @@ -140,9 +150,9 @@ for line_model_arg in models_arg: else: print("run trc caspi") print("line model arg=", line_model_arg) - os.system(f"ssh {server} 'rm -f {base}/../FasterTransformer/build/bin/ms_benchmark {base}/../FasterTransformer/build/bin/{model_name}*'") + os.system(f"ssh {server} 'rm -f {base}/../FasterTransformer/build/bin/ms_benchmark {base}/../FasterTransformer/build/bin/*{model_name}*'") os.system(f"rsync -v {base}/../FasterTransformer/build/bin/ms_benchmark {server}:{base}/../FasterTransformer/build/bin/ms_benchmark" ) - os.system(f"rsync -v {base}/trc/transformer/{model_name}* {server}:{base}/../FasterTransformer/build/bin" ) + os.system(f"rsync -v {base}/trc/transformer/*{model_name}* {server}:{base}/../FasterTransformer/build/bin" ) os.system(f'rsync -v {base}/../FasterTransformer/build/lib/libtransformer-shared.so caspi:{base}/../FasterTransformer/build/lib/.') os.system(f"ssh {server} 'cd {base}/../FasterTransformer/build/bin && CUDA_VISIBLE_DEVICES={cuda_visible_dev} LD_LIBRARY_PATH={base}/../FasterTransformer:/usr/local/cuda-11.7/lib64 ./ms_benchmark {line_model_arg}' " ) diff --git a/trc/transformer/get_output_by_mindir.py b/trc/transformer/get_output_by_mindir.py old mode 100644 new mode 100755 diff --git a/trc/transformer/models.txt b/trc/transformer/models.txt index 41bbc1106f85786ba64d14b0e891aa08e625b9ca..b69028b155b878826aae6f5042efd36ed0908c41 100755 --- a/trc/transformer/models.txt +++ b/trc/transformer/models.txt @@ -1,28 +1,77 @@ +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer_t5 +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer_t5 + +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5 +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer_t5 + +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_encoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer_t5 + +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5 + + +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer #run the following tests before push -#-b 1 -l 66 -s 128 -H 12 -S 768 -p 0 -m mha_x1 + +-b 1 -l 66 -s 128 -H 12 -S 768 -p 0 -m mha_x1 #-b 1 -l 66 -s 128 -t 256 -H 12 -S 768 -p 0 -m mha_cross -#-b 1 -l 66 -s 20 -H 4 -S 768 -p 0 -m mha_T5 -#-b 1 -l 66 -s 20 -t 40 -H 4 -S 768 -p 0 -m mha_T5_cross +-b 1 -l 66 -s 20 -t 20 -H 3 -S 15 -p 0 -m mha_cross +-b 1 -l 66 -s 20 -H 4 -S 768 -p 0 -m mha_T5 +-b 1 -l 66 -s 20 -t 40 -H 4 -S 768 -p 0 -m mha_T5_cross + #-b 1 -l 12 
-H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer #-b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer -#-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer #-b 8 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer #-b 32 -l 12 -H 12 -S 768 -s 128 -P 0 -f 3072 -m bert -#-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -f 3072 -m bert + +#-b 1 -l 3 -H 12 -S 768 -s 128 -m T5 + +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -f 3072 -x 0 -m transformer_encoder_layer_t5 + +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5 +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer_t5 + +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer + #-b 1 -l 66 -s 20 -H 3 -S 15 -p 0 -m mha_x1 +#-b 1 -l 24 -H 16 -S 1024 -s 128 -P 1 -m bert +#-b 8 -l 24 -H 16 -S 1024 -s 128 -P 1 -m bert +#-b 16 -l 24 -H 16 -S 1024 -s 128 -P 0 -m bert +#-b 32 -l 24 -H 16 -S 1024 -s 128 -P 1 -m bert +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_encoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_encoder_layer #-b 1 -l 66 -s 1 -H 8 -S 512 -p 0 -m mha_x1 #-b 3 -l 66 -s 20 -H 3 -S 15 -p -m mha_x2 #-b 3 -l 66 -s 20 -t 40 -H 3 -S 15 -p 0 -m mha_x1 #-b 1 -l 66 -s 128 -H 4 -S 1024 -p 0 -m mha_x1 -#-b 1 -l 6 -s 128 -H 8 -S 1024 -m T5 +#-b 1 -l 2 -s 12 -t 12 -H 2 -S 4 -m transformer #-b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -m transformer_encoder_layer #-b 1 -l 12 -H 4 -S 512 -s 128 -f 3072 -P 1 -m transformer_encoder_layer -#-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer -#-b 4 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer -#-b 8 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer +#-b 4 -l 12 -H 12 -S 768 -s 128 -P 1 -x 1 -m transformer_encoder_layer +#-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -x 0 -m transformer_encoder_layer +#-b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -x 0 -m transformer_encoder_layer +#-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -x 1 -m transformer_encoder_layer_t5 +#-b 4 -l 12 -H 12 -S 768 -s 128 -P 0 -x 0 -m transformer_encoder_layer_t5 +# +#-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -x 0 -m transformer_decoder_layer +#-b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -x 0 -m transformer_decoder_layer +#-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -x 1 -m transformer_decoder_layer_t5 +#-b 4 -l 12 -H 12 -S 768 -s 128 -P 0 -x 0 -m transformer_decoder_layer_t5 +#-b 8 -l 12 -H 12 -S 768 -s 128 -P 1 -x 1 -m transformer_encoder_layer_t5 #-b 1 -l 12 -H 4 -S 512 -s 128 -P 1 -m transformer_encoder_layer #-b 1 -l 12 -H 4 -S 512 -s 128 -P 1 -f 3072 -m transformer_encoder_layer #-b 4 -l 12 -H 4 -S 512 -s 128 -P 1 -m transformer_encoder_layer @@ -39,7 +88,7 @@ #-b 1 -l 66 -s 20 -t 30 -H 3 -S 15 -p 0 -m mha_cross #-b 1 -l 66 -s 
20 -H 4 -S 768 -p 0 -m mha_T5 #-b 1 -l 66 -s 20 -t 40 -H 4 -S 768 -p 0 -m mha_T5_cross --b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer +#-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer #-b 8 -l 12 -H 4 -S 512 -s 128 -P 0 -f 3072 -m transformer_encoder_layer #-b 16 -l 16 -H 8 -S 1024 -s 64 -P 1 -f 1024 -m transformer_encoder_layer #-b 32 -l 12 -H 4 -S 512 -s 128 -P 0 -f 3072 -m transformer_encoder_layer @@ -49,10 +98,10 @@ #-b 8 -l 12 -H 4 -S 512 -s 64 -m bert # ----------------------------------------------------------------------------- -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P False -m transformer_encoder_layer -#-b 1 -l 2 -H 2 -S 8 -s 20 -f 1024 -P True -m bert -#-b 1 -l 12 -H 2 -S 8 -s 20 -m T5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -m transformer_decoder_layer #-b 1 -l 2 -H 2 -S 8 -s 20 -f 1024 -P True -m bert +#-b 1 -l 2 -H 2 -S 2 -s 128 -m T5 +#-b 1 -l 2 -H 2 -S 8 -s 20 -f 1024 -P 1 -m bert #-b 1 -l 12 -H 12 -S 768 -s 128 -m bert #-b 8 -l 12 -H 12 -S 768 -s 128 -m bert @@ -60,7 +109,7 @@ #-b 32 -l 12 -H 12 -S 768 -s 128 -m bert #num=12 head_size=64 -#-b 16 -l 12 -H 16 -S 512 -s 128 -m bert +#-b 16 -l 12 -H 12 -S 768 -s 128 -m bert #-b 8 -l 12 -H 12 -S 768 -s 128 -m bert #-b 1 -l 12 -H 12 -S 768 -s 64 -m bert #-b 1 -l 24 -H 16 -S 1024 -s 128 -m bert @@ -68,7 +117,7 @@ #-b 32 -l 24 -H 16 -S 1024 -s 128 -m bert #-b 8 -l 24 -H 16 -S 1024 -s 128 -m bert # -s 64 128 512 1024 -# -b 1 16 32 +#-b 1 16 32 ##-s 128 -H 4 8 -S 1024 2048 #-b 1 -l 66 -s 20 -H 3 -S 15 -p 0 -m test #-b 1 -l 66 -s 128 -H 12 -S 768 -p 0 -m test -T fp32 -W fp32 -F fp32 @@ -81,24 +130,17 @@ #-b 1 -l 66 -s 256 -H 12 -S 768 -p 0 -m mha_x1 -T fp16 #-b 8 -l 66 -s 256 -H 12 -S 768 -t 512 -p 0 -m mha_cross -T fp16 #-b 8 -l 66 -s 9 -H 12 -S 768 -t 21 -p 0 -m mha_cross -# -T fp16 -#-b 1 -l 2 -s 128 -H 8 -S 1024 -m T5 -#-b 1 -l 66 -s 20 -H 4 -S 768 -p 0 -m mha_T5 -T fp16 -W fp32 -F fp32 -#-b 1 -l 66 -s 20 -H 4 -S 768 -p 0 -m mha_T5 -#-b 1 -l 66 -s 20 -t 40 -H 4 -S 768 -p 0 -m mha_T5_cross - -#-b 1 -l 66 -s 128 -H 12 -S 768 -p 0 -m mha_x1 -T fp32 -W fp32 -F fp32 -#-b 1 -l 66 -s 128 -H 12 -S 768 -p 0 -m mha_x1 -T fp16 - -#-b 1 -l 66 -s 2 -H 96 -S 12288 -p 0 -m mha_x1 -T fp32 -W fp32 -F fp32 -#-b 1 -l 66 -s 128 -H 12 -S 768 -p 0 -m mha_x1 -T fp32 -W fp32 -F fp32 -#-b 1 -l 66 -s 128 -t 64 -H 12 -S 768 -p 0 -m mha_x1 -T fp32 -W fp32 -F fp32 +#-b 1 -l 1 -s 20 -H 2 -S 4 -m bert +# T5 tests +#-b 1 -l 6 -s 128 -t 128 -H 8 -S 512 -f 2048 -m T5 +#-b 1 -l 6 -s 512 -t 512 -H 8 -S 512 -f 2048 -m T5 +# +#-b 1 -l 6 -s 128 -t 128 -H 12 -S 768 -f 3072 -m T5 +#-b 1 -l 6 -s 512 -t 512 -H 12 -S 768 -f 3072 -m T5 -#-b 1 -l 66 -s 128 -t 64 -H 12 -S 768 -p 0 -m mha_cross -#-b 5 -l 66 -s 256 -H 4 -S 1024 -p 0 -m mha_x1 -T fp32 -W fp32 -F fp32 -#-b 8 -l 66 -s 128 -H 12 -S 768 -p 0 -m mha_x1 -T fp16 -#-b 8 -l 66 -s 256 -H 12 -S 768 -p 0 -m mha_x1 -T fp16 -#-b 8 -l 66 -s 256 -H 8 -S 512 -p 0 -m mha_x1 -T fp16 -#-b 8 -l 66 -s 64 -H 8 -S 512 -p 0 -m mha_x1 -T fp16 -#-b 1 -l 66 -s 256 -H 12 -S 768 -p 0 -m mha_x1 -T fp16 +# transformer tests +#-b 1 -l 6 -s 128 -t 128 -H 8 -S 512 -f 2048 -m transformer +#-b 1 -l 6 -s 512 -t 512 -H 8 -S 512 -f 2048 -m transformer +#-b 1 -l 6 -s 128 -t 128 -H 12 -S 768 -f 3072 -m transformer +#-b 1 -l 6 -s 512 -t 512 -H 12 -S 768 -f 3072 -m transformer diff --git a/trc/transformer/t.config b/trc/transformer/t.config index d4450ed345c54537a2c402e91c33a785af8ab5f1..0fad133d432b210e3d49d70c6a36f480ff877951 100755 --- a/trc/transformer/t.config +++ b/trc/transformer/t.config @@ -1,3 +1,5 @@ [registry] 
-#fusion_blacklists="MultiHeadAttentionFusion","EncoderLayerFusion" - +#fusion_blacklists="MultiHeadAttentionFusion" +#fusion_blacklists="EncoderLayerFusion","DecoderLayerFusion" +#fusion_blacklists="DecoderLayerFusion" +#fusion_blacklists="EncoderLayerFusion" diff --git a/trc/transformer/test_tr.py b/trc/transformer/test_tr.py old mode 100644 new mode 100755 diff --git a/trc/transformer/train_transformer_export.py b/trc/transformer/train_transformer_export.py index af10b53c392a146b936ea7ce71de2b2c27ad596b..275aeaf906b1813febedeab1edcbf3a60bc7a342 100755 --- a/trc/transformer/train_transformer_export.py +++ b/trc/transformer/train_transformer_export.py @@ -11,7 +11,8 @@ model_zoo_path=os.environ['CLOUD_MODEL_ZOO'] sys.path.append(model_zoo_path) sys.path.append("../../../transformer/transformer/models") sys.path.append("./T5") -from MultiHeadTester import MultiHeadAttentionX,TransformerEncoderLayerX,FeedForwardX +from MultiHeadTester import MultiHeadAttentionX, TransformerDecoderLayerX,TransformerEncoderLayerX,FeedForwardX +import T5Transformer as T from mindspore.common.parameter import Parameter from mindspore.common.initializer import Tensor import mindspore as M @@ -53,8 +54,6 @@ M.context.set_context(mode=M.context.GRAPH_MODE,device_target="GPU", save_graphs # # y = model(encoder_input_value, encoder_input_mask)# _cell_graph_executor.compile(model, encoder_input_value, encoder_input_mask) # for i in range (2): # y = model(encoder_input_value, encoder_input_mask, decoder_input_value, decoder_input_mask, memory_mask) - -# # print("y=", y) # export(model, encoder_input_value, encoder_input_mask, decoder_input_value, decoder_input_mask, memory_mask, file_name= name + "_fwd", file_format='MINDIR') def get_gpu_memory(): @@ -106,6 +105,7 @@ eps2=1e-6 post_layernorm=True ffn_hidden_size=-1 app="ch" +ffn_fp16 = False compress = False def read_args(): global batch @@ -124,6 +124,7 @@ def read_args(): global in_type global w_type global app + global ffn_fp16 global compress print("sys argv = ", sys.argv) for i in range(len(sys.argv)) : @@ -213,7 +214,14 @@ def read_args(): print("error: illegal compute type {}".format(sys.argv[i + 1]) ) else: app = sys.argv[i + 1] - print("app=",app) + elif sys.argv[i] == '-x': + if sys.argv[i + 1] not in ["0", "1"]: + print("error: illegal compute type {}".format(sys.argv[i + 1]) ) + else: + if sys.argv[i + 1]=='0': + ffn_fp16 = False + elif sys.argv[i + 1]=='1': + ffn_fp16 = True elif sys.argv[i] == '-c': if sys.argv[i+1] == 'true': compress = True @@ -259,7 +267,6 @@ def calc_seq_lengths(batch, max_seq_length,th): return seq_len def transformer_encoder_layer_create(): - post_layernorm=False name = "transformer_encoder_layer" if (post_layernorm): model = TransformerEncoderLayerX(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, seq_length=seq, @@ -320,11 +327,11 @@ def transformer_encoder_layer_create(): saveT(bp, name + "_weight6.fp" + suffix) saveT(gl2, name + "_weight7.fp" + suffix) saveT(bl2, name + "_weight8.fp" + suffix) - if app == 'trc': + if ffn_fp16 == True: saveTensorToHalf(omw, name + "_weight9.fp" + "16") saveTensorToHalf(omb, name + "_weight10.fp" + "16") saveTensorToHalf(opw, name + "_weight11.fp" + "16") - elif app == 'ch': + else: saveT(omw, name + "_weight9.fp" + suffix) saveT(omb, name + "_weight10.fp" + suffix) saveT(opw, name + "_weight11.fp" + suffix) @@ -343,11 +350,260 @@ def transformer_encoder_layer_create(): y = pruneTensor(y,seq_len,1) saveCalib(out_name, np.array(y), f_y) print("y.shape",np.array(y).shape) + 
f_y.close() # saveCalib('Default/Add-op267', np.array(y), f_y)#2 dims elif app=="trc": saveT(y, name + "_output1.fp" + suffix) +def transformer_encoder_layer_t5_create(): + name = "transformer_encoder_layer_t5" + if (post_layernorm): + print("post_layernorm") + model = T5_TF.TransformerEncoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, seq_length=seq, + num_heads=head_num, post_layernorm_residual=True, has_bias=False, hidden_act='relu') + else: + model = T5_TF.TransformerEncoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, seq_length=seq, + num_heads=head_num, has_bias=False, hidden_act='relu') + encoder_input_value = M.Tensor(np.random.normal(0., 0.5, (batch, seq, hid_size)), M.float32) + encoder_input_mask = M.Tensor(np.random.normal(0., 0.5, (batch, seq, seq)), M.float32) + pos = M.Tensor(np.random.normal(0., 0.5, (batch, head_num, seq, tgt_seq_len)), M.float32) + # encoder_input_value = M.Tensor(np.zeros((batch, seq, hid_size)), M.float32) + # encoder_input_mask = M.Tensor(np.zeros((batch, seq, seq)), M.float32) + q = model.attention.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size + k = model.attention.dense2.weight.asnumpy()#.transpose() + v = model.attention.dense3.weight.asnumpy()#.transpose() + + w = np.concatenate((q, k, v)) # 3xhid_size x hid_size + w = w.transpose() # hid_size x 3xhid_size + wt = M.Tensor(w, w_compute_type) + wp = model.attention.projection.weight + omw = model.output.mapping.weight + opw = model.output.projection.weight + gl1 = model.layernorm1.weight + gl2 = model.layernorm2.weight + + suffix = str(compute_type) + suffix = suffix[-2:] + saveT(encoder_input_value, name + "_input1.fp" + suffix) + saveT(encoder_input_mask, name + "_input2.fp" + suffix) + saveT(pos, name + "_input3.fp" + suffix) + saveT(gl1, name + "_weight1.fp" + suffix) + saveT(wt, name + "_weight2.fp" + suffix) + saveT(wp, name + "_weight3.fp" + suffix) + saveT(gl2, name + "_weight4.fp" + suffix) + if ffn_fp16 == True: + saveTensorToHalf(omw, name + "_weight5.fp" + "16") + saveTensorToHalf(opw, name + "_weight6.fp" + "16") + else: + saveT(omw, name + "_weight5.fp" + suffix) + saveT(opw, name + "_weight6.fp" + suffix) + _cell_graph_executor.compile(model, + encoder_input_value, + encoder_input_mask,pos) + y = model(encoder_input_value, encoder_input_mask,pos) + print('name=',name) + export(model, encoder_input_value, encoder_input_mask,pos, file_name= name + "_fwd", file_format='MINDIR') + # if app=="ch": + f_y=open(f'./{name}_output.txt','w') + out_name='output1' + print("name output:",out_name) + saveCalib(out_name, np.array(y), f_y) + print("y.shape",np.array(y).shape) + # saveCalib('Default/Add-op267', np.array(y), f_y)#2 dims + f_y.close() + # elif app=="trc": + saveT(y, name + "_output1.fp" + suffix) + + +def transformer_decoder_layer_t5_create(): + name = "transformer_decoder_layer_t5" + if (post_layernorm): + print("post_layernorm true") + model = T5_TF.TransformerDecoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, + tgt_seq_length=tgt_seq_len,num_heads=head_num, post_layernorm_residual=True, use_past=False, has_bias=False, hidden_act="relu") + else: + print("post_layernorm false") + model = T5_TF.TransformerDecoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, + tgt_seq_length=tgt_seq_len,num_heads=head_num,use_past=False, has_bias=False, hidden_act="relu") + hidden_stats = M.Tensor(np.random.normal(0., 
0.5, (batch, tgt_seq_len, hid_size)), M.float32) + decoder_mask = M.Tensor(np.random.normal(0., 0.5, (batch, seq, seq)), M.float32) + encoder_output = M.Tensor(np.random.normal(0., 0.5, (batch, seq, hid_size)), M.float32) + memory_mask = M.Tensor(np.random.normal(0., 0.5, (batch, tgt_seq_len,seq)), M.float32) + pos = M.Tensor(np.random.normal(0., 0.5, (batch, head_num, seq, tgt_seq_len)), M.float32) + encoder_pos = M.Tensor(np.random.normal(0., 0.5, (batch, head_num, seq, tgt_seq_len)), M.float32) + actual_seq = seq // 2 + if compress: + input_value = hidden_stats.asnumpy() + input_value[:,actual_seq:,:] = 0 + hidden_stats = M.Tensor.from_numpy(input_value) + decoder_input_mask_value = decoder_mask.asnumpy() + decoder_input_mask_value[:,:,actual_seq:] = 0 + decoder_mask = M.Tensor.from_numpy(decoder_input_mask_value) + encoder_output_value = encoder_output.asnumpy() + encoder_output_value[:,:,actual_seq:] = 0 + encoder_output = M.Tensor.from_numpy(encoder_output_value) + memory_mask_value = memory_mask.asnumpy() + memory_mask_value[:,:,actual_seq:] = 0 + memory_mask = M.Tensor.from_numpy(memory_mask_value) + pos_value = pos.asnumpy() + pos_value[:,:,actual_seq:] = 0 + pos = M.Tensor.from_numpy(pos_value) + encoder_pos_value = encoder_pos.asnumpy() + encoder_pos_value[:,:,actual_seq:] = 0 + encoder_pos = M.Tensor.from_numpy(encoder_pos_value) + q = model.attention.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size + k = model.attention.dense2.weight.asnumpy()#.transpose() + v = model.attention.dense3.weight.asnumpy()#.transpose() + + w = np.concatenate((q, k, v)) # 3xhid_size x hid_size + w = w.transpose() # hid_size x 3xhid_size + wt = M.Tensor(w, w_compute_type) + wp = model.attention.projection.weight + + qt2 = model.cross_attention.dense1.weight#.transpose() # hid_size x hid_size + k2 = model.cross_attention.dense2.weight.asnumpy()#.transpose() + v2 = model.cross_attention.dense3.weight.asnumpy()#.transpose() + + w2 = np.concatenate((k2, v2)) # 2xhid_size x hid_size + w2 = w2.transpose() # hid_size x 2xhid_size + wt2 = M.Tensor(w2, w_compute_type) + wp2 = model.cross_attention.projection.weight + omw = model.output.mapping.weight + print('omw.asnumpy().shape',omw.asnumpy().shape) + opw = model.output.projection.weight + + gl1 = model.layernorm1.weight + gl2 = model.layernorm2.weight + gl3 = model.cross_attention_layernorm.weight + + suffix = str(compute_type) + suffix = suffix[-2:] + saveT(gl1, name + "_weight1.fp" + suffix) + saveT(wt, name + "_weight2.fp" + suffix) + saveT(wp, name + "_weight3.fp" + suffix) + saveT(gl2, name + "_weight4.fp" + suffix) + saveT(qt2, name + "_weight5.fp" + suffix) + saveT(wt2, name + "_weight6.fp" + suffix) + saveT(wp2, name + "_weight7.fp" + suffix) + saveT(gl3, name + "_weight8.fp" + suffix) + if(ffn_fp16): + saveTensorToHalf(omw, name + "_weight9.fp" + "16") + saveTensorToHalf(opw, name + "_weight10.fp" + "16") + else: + saveT(omw, name + "_weight9.fp" + suffix) + saveT(opw, name + "_weight10.fp" + suffix) + saveT(hidden_stats, name + "_input1.fp" + suffix) + saveT(decoder_mask, name + "_input2.fp" + suffix) + saveT(encoder_output, name + "_input3.fp" + suffix) + saveT(memory_mask, name + "_input4.fp" + suffix) + saveT(pos, name + "_input5.fp" + suffix) + saveT(encoder_pos, name + "_input6.fp" + suffix) + _cell_graph_executor.compile(model, hidden_stats, decoder_mask, encoder_output, memory_mask, pos, encoder_pos) + y = model(hidden_stats, decoder_mask, encoder_output, memory_mask , pos, encoder_pos) + export(model, hidden_stats, decoder_mask, 
encoder_output, memory_mask, pos, encoder_pos, file_name= name + "_fwd", file_format='MINDIR') + if compress: + y_num = y.asnumpy() + y_num[:,actual_seq:,:] = 0 + y = M.Tensor.from_numpy(y_num) + f_y=open(f'./{name}_output.txt','w') + saveCalib("output1", np.array(y), f_y)#2 dims + f_y.close() + saveT(y, name + "_output1.fp" + suffix) + +def transformer_decoder_layer_create(): + name = "transformer_decoder_layer" + if (post_layernorm): + print("post_layernorm true") + model = TransformerDecoderLayerX(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, + tgt_seq_length=tgt_seq_len,num_heads=head_num, post_layernorm_residual=True) + else: + print("post_layernorm false") + model = TransformerDecoderLayerX(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, + tgt_seq_length=tgt_seq_len,num_heads=head_num) + hidden_stats = M.Tensor(np.random.normal(0., 0.5, (batch, tgt_seq_len, hid_size)), M.float32) + decoder_mask = M.Tensor(np.random.normal(0., 0.5, (batch, seq, seq)), M.float32) + encoder_output = M.Tensor(np.random.normal(0., 0.5, (batch, seq, hid_size)), M.float32) + memory_mask = M.Tensor(np.random.normal(0., 0.5, (batch, tgt_seq_len,seq)), M.float32) + q = model.attention.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size + k = model.attention.dense2.weight.asnumpy()#.transpose() + v = model.attention.dense3.weight.asnumpy()#.transpose() + + w = np.concatenate((q, k, v)) # 3xhid_size x hid_size + w = w.transpose() # hid_size x 3xhid_size + wt = M.Tensor(w, w_compute_type) + bq = model.attention.dense1.bias.asnumpy() + bk = model.attention.dense2.bias.asnumpy() + bv = model.attention.dense3.bias.asnumpy() + bw = np.concatenate((bq, bk, bv)) #(3xhid) X 1 + bt =M.Tensor(bw, w_compute_type) + wp = model.attention.projection.weight + bp = model.attention.projection.bias + + qt2 = model.cross_attention.dense1.weight#.transpose() # hid_size x hid_size + k2 = model.cross_attention.dense2.weight.asnumpy()#.transpose() + v2 = model.cross_attention.dense3.weight.asnumpy()#.transpose() + + w2 = np.concatenate((k2, v2)) # 3xhid_size x hid_size + w2 = w2.transpose() # hid_size x 3xhid_size + wt2 = M.Tensor(w2, w_compute_type) + bq2 = model.cross_attention.dense1.bias.asnumpy() + bk2 = model.cross_attention.dense2.bias.asnumpy() + bv2 = model.cross_attention.dense3.bias.asnumpy() + bw2 = np.concatenate((bq2, bk2, bv2)) #(3xhid) X 1 + bt2 =M.Tensor(bw2, w_compute_type) + wp2 = model.cross_attention.projection.weight + bp2 = model.cross_attention.projection.bias + omw = model.output.mapping.weight + opw = model.output.projection.weight + omb = model.output.mapping.bias + opb = model.output.projection.bias + + gl1 = model.layernorm1.gamma + bl1 = model.layernorm1.beta + gl2 = model.layernorm2.gamma + bl2 = model.layernorm2.beta + gl3 = model.cross_attention_layernorm.gamma + bl3 = model.cross_attention_layernorm.beta + suffix = str(compute_type) + suffix = suffix[-2:] + saveT(hidden_stats, name + "_input1.fp" + suffix) + saveT(decoder_mask, name + "_input2.fp" + suffix) + saveT(encoder_output, name + "_input3.fp" + suffix) + saveT(memory_mask, name + "_input4.fp" + suffix) + + saveT(gl1, name + "_weight1.fp" + suffix) + saveT(bl1, name + "_weight2.fp" + suffix) + saveT(wt, name + "_weight3.fp" + suffix) + saveT(bt, name + "_weight4.fp" + suffix) + saveT(wp, name + "_weight5.fp" + suffix) + saveT(bp, name + "_weight6.fp" + suffix) + saveT(gl2, name + "_weight7.fp" + suffix) + saveT(bl2, name + "_weight8.fp" + suffix) 
+ saveT(qt2, name + "_weight9.fp" + suffix) + saveT(wt2, name + "_weight10.fp" + suffix) + saveT(bt2, name + "_weight11.fp" + suffix) + saveT(wp2, name + "_weight12.fp" + suffix) + saveT(bp2, name + "_weight13.fp" + suffix) + saveT(gl3, name + "_weight14.fp" + suffix) + saveT(bl3, name + "_weight15.fp" + suffix) + if(ffn_fp16): + saveTensorToHalf(omw, name + "_weight16.fp" + "16") + saveTensorToHalf(omb, name + "_weight17.fp" + "16") + saveTensorToHalf(opw, name + "_weight18.fp" + "16") + else: + saveT(omw, name + "_weight16.fp" + suffix) + saveT(omb, name + "_weight17.fp" + suffix) + saveT(opw, name + "_weight18.fp" + suffix) + saveT(opb, name + "_weight19.fp" + suffix) + _cell_graph_executor.compile(model, hidden_stats, decoder_mask, encoder_output, memory_mask) + y = model(hidden_stats, decoder_mask, encoder_output, memory_mask) + export(model, hidden_stats, decoder_mask, encoder_output, memory_mask, file_name= name + "_fwd", file_format='MINDIR') + f_y=open(f'./{name}_output.txt','w') + saveCalib("output1", np.array(y), f_y)#2 dims + f_y.close() + saveT(y, name + "_output1.fp" + suffix) + def build_transformer_encoder_layer_post_ture(): model = TransformerEncoderLayer(batch_size=2, seq_length=16, @@ -424,8 +680,8 @@ def mha_x1_create(): softmax_compute_type=s_compute_type, app=app ) - q = model.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size k = model.dense2.weight.asnumpy()#.transpose() + q = model.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size v = model.dense3.weight.asnumpy()#.transpose() w = np.concatenate((q, k, v)) # 3xhid_size x hid_size w = w.transpose() # hid_size x 3xhid_size @@ -678,6 +934,19 @@ def T5_create(): name = "T5" str=" " os.system(f"python {base}/../transformer_repo/pretrain_{name}.py {str.join(sys.argv[1:-4])} " ) +def vit_create(): + repo = git.Repo('.', search_parent_directories=True) + base = repo.working_tree_dir + name = "vit" + str=" " + os.system(f"python {base}/../transformer_repo/pretrain_{name}.py {str.join(sys.argv[1:-4])} " ) +def transformer_create(): + repo = git.Repo('.', search_parent_directories=True) + base = repo.working_tree_dir + name = "transformer" + str=" " + os.system(f"python {base}/../transformer_repo/pretrain_{name}.py {str.join(sys.argv[1:-4])} " ) + def mha_T5_create(): # M.context.set_auto_parallel_context(parallel_mode=M.ParallelMode.SEMI_AUTO_PARALLEL) M.context.set_context(mode=M.context.PYNATIVE_MODE) @@ -692,8 +961,7 @@ def mha_T5_create(): compute_dtype=compute_type, param_init_type=w_compute_type, softmax_compute_type=s_compute_type, - has_bias=False, - app=app + has_bias=False ) print('compute_type',compute_type) q = model.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size @@ -736,7 +1004,7 @@ def mha_T5_create(): # print(y.shape) # if app=="ch": f_y=open(f'./{name}_output.txt','w') - saveCalib('Default/projection-_Linear/MatMul-op58', np.array(y), f_y)#2 dims + saveCalib('output1', np.array(y), f_y)#2 dims # elif app=="trc": saveT(y, name + "_output1.fp" + suffix) # tmp = y[1][0].asnumpy().transpose(0, 1, 3, 2) @@ -759,8 +1027,7 @@ def mha_T5_cross_create(): compute_dtype=compute_type, param_init_type=w_compute_type, softmax_compute_type=s_compute_type, - has_bias=False, - app=app + has_bias=False ) qt = model.dense1.weight @@ -830,6 +1097,7 @@ def main(): for i in range(len(sys.argv)): if sys.argv[i]=='-m': model_name=sys.argv[i+1] + print("%s_create()" % model_name) eval("%s_create()" % model_name) if __name__ == "__main__":
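For reference, the naming convention that the new builders above rely on can be summarized in a small stand-alone sketch: a models.txt entry such as "-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5" is forwarded to train_transformer_export.py, and the value of "-m" selects the builder named "<model>_create()" via eval. The sketch below is illustrative only and is not part of the patch; it assumes nothing beyond the argv handling visible in the diff.

# Illustrative sketch (not part of the patch): how the -m <name> flag is mapped to a
# builder called <name>_create(), mirroring main() in train_transformer_export.py.
import sys

def transformer_decoder_layer_t5_create():
    # Hypothetical stand-in for the real builder added by this patch.
    print("would build and export transformer_decoder_layer_t5")

def main():
    model_name = None
    for i in range(len(sys.argv)):
        if sys.argv[i] == '-m':
            model_name = sys.argv[i + 1]
    if model_name is not None:
        # Same dispatch convention as the patch: eval("%s_create()" % model_name).
        eval("%s_create()" % model_name)

if __name__ == "__main__":
    # Example invocation: python sketch.py -m transformer_decoder_layer_t5
    main()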