From 66b9ac59b986fc3d332dd0a9106d92a28d00b3bf Mon Sep 17 00:00:00 2001 From: batya kroizer Date: Mon, 26 Dec 2022 10:07:51 +0200 Subject: [PATCH 01/39] add decoder op --- .../kernel/nnacl/encoder_layer_parameter.h | 6 +- .../cpu/kernel/nnacl/infer/infer_register.c | 4 +- .../plugin/device/cpu/kernel/nnacl/op_base.h | 1 + mindspore/core/ops/encoder_layer.cc | 9 +- mindspore/core/ops/encoder_layer.h | 4 +- mindspore/core/ops/op_name.h | 8 + mindspore/lite/schema/ops.fbs | 20 +- mindspore/lite/src/common/ops/ops_def.cc | 22 +- .../lite/src/common/ops/ops_func_declare.h | 3 + mindspore/lite/src/common/ops/ops_utils.cc | 1 + .../ops/populate/encoder_layer_populate.cc | 9 +- .../delegate/tensorrt/op/encoder_tensorrt.cc | 80 ++---- .../delegate/tensorrt/op/encoder_tensorrt.h | 51 +--- .../delegate/tensorrt/op/mha_tensorrt.cc | 64 +++-- .../delegate/tensorrt/op/mha_tensorrt.h | 51 +--- .../delegate/tensorrt/tensorrt_subgraph.cc | 47 ++-- .../optimizer/fusion/encoder_layer_fusion.cc | 228 ++---------------- .../optimizer/fusion/encoder_layer_fusion.h | 10 +- trc/transformer/cfg_bert.config | 2 +- trc/transformer/deploy.sh | 10 +- trc/transformer/ftBench.py | 8 +- trc/transformer/models.txt | 7 +- trc/transformer/t.config | 2 +- trc/transformer/train_transformer_export.py | 80 ++++++ 24 files changed, 278 insertions(+), 449 deletions(-) diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/encoder_layer_parameter.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/encoder_layer_parameter.h index 80011172bbf..72bec5345fc 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/encoder_layer_parameter.h +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/encoder_layer_parameter.h @@ -24,7 +24,11 @@ typedef struct EncoderLayerParameter { int head_num_; int head_size_; bool cross_; - bool layernorm_post; + bool post_layernorm_; + float eps_layernorm1_; + float eps_layernorm2_; + int ffn_hidden_size_; + bool position_bias_; } EncoderLayerParameter; #endif // MINDSPORE_NNACL_ENCODERLAYER_PARAMETER_H_ diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/infer_register.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/infer_register.c index 78c778f0daf..60bdfea1a9b 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/infer_register.c +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/infer_register.c @@ -43,7 +43,7 @@ #include "nnacl/infer/common_infer.h" #include "nnacl/infer/concat_infer.h" #include "nnacl/infer/constant_of_shape_infer.h" -#include "nnacl/infer/encoder_layer_infer.h" +#include "nnacl/infer/decoder_layer_infer.h" #ifdef MSLITE_ENABLE_CONTROLFLOW #include "nnacl/infer/control/tensor_array_infer.h" #include "nnacl/infer/control/tensor_array_read_infer.h" @@ -186,6 +186,7 @@ void RegAllInferFunc1() { g_infer_func[PrimType_AssignAdd] = AssignAddInferShape; g_infer_func[PrimType_Attention] = AttentionInferShape; g_infer_func[PrimType_EncoderLayer] = EncoderLayerInferShape; + g_infer_func[PrimType_DecoderLayer] = DecoderLayerInferShape; g_infer_func[PrimType_AudioSpectrogram] = AudioSpectrogramInferShape; g_infer_func[PrimType_AvgPoolFusion] = PoolingInferShape; g_infer_func[PrimType_AvgPoolGrad] = PoolingGradInferShape; @@ -402,6 +403,7 @@ void RegAllInferFunc5() { g_infer_func[PrimType_Where] = WhereInferShape; g_infer_func[PrimType_ZerosLike] = CommonInferShape; g_infer_func[PrimType_EncoderLayer] = EncoderLayerInferShape; + g_infer_func[PrimType_DecoderLayer] = DecoderLayerInferShape; // fused operators. 
g_inner_op_infer_func[PrimType_Inner_GltextureToOpencl - PrimType_InnerOpMin] = NULL; diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h index e5181b1e851..95acb50e885 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h @@ -37,6 +37,7 @@ #define C8NUM 8 #define C9NUM 9 #define C10NUM 10 +#define C11NUM 11 #define C12NUM 12 #define C13NUM 13 #define C14NUM 14 diff --git a/mindspore/core/ops/encoder_layer.cc b/mindspore/core/ops/encoder_layer.cc index 4cf3fb8990d..0cd1dd8fd2d 100644 --- a/mindspore/core/ops/encoder_layer.cc +++ b/mindspore/core/ops/encoder_layer.cc @@ -29,8 +29,6 @@ void EncoderLayer::set_head_size(int64_t head_size) { (void)this->AddAttr(kEncoderLayerSizePerHead, api::MakeValue(head_size)); } -void EncoderLayer::set_cross(bool cross) { (void)this->AddAttr(kCross, api::MakeValue(cross)); } - void EncoderLayer::set_post_layernorm(bool post_layernorm) { (void)this->AddAttr(kEncoderLayerPostLayernorm, api::MakeValue(post_layernorm)); } @@ -55,10 +53,6 @@ int64_t EncoderLayer::get_head_size() const { return GetValue(value_ptr); } -bool EncoderLayer::get_cross() const { - auto value_ptr = this->GetAttr(kCross); - return GetValue(value_ptr); -} bool EncoderLayer::get_post_layernorm() const { auto value_ptr = this->GetAttr(kEncoderLayerPostLayernorm); @@ -83,10 +77,9 @@ bool EncoderLayer::get_position_bias() const { void EncoderLayer::Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, int64_t ffn_hidden_size, - bool position_bias, bool cross = false, bool post_layernorm = false) { + bool position_bias, bool post_layernorm = false) { this->set_head_num(head_num); this->set_head_size(head_size); - this->set_cross(cross); this->set_post_layernorm(post_layernorm); this->set_eps_layernorm1(eps_layernorm1); this->set_eps_layernorm2(eps_layernorm2); diff --git a/mindspore/core/ops/encoder_layer.h b/mindspore/core/ops/encoder_layer.h index 7c3a02184b8..71cfb70c59e 100644 --- a/mindspore/core/ops/encoder_layer.h +++ b/mindspore/core/ops/encoder_layer.h @@ -44,10 +44,9 @@ class MIND_API EncoderLayer : public BaseOperator { /// \param[in] ffn_hidden_size Define ffn hidden size. /// \param[in] position_bias Define ffn position_bias. 
void Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, int64_t ffn_hidden_size, - bool position_bias, bool cross, bool post_layernorm); + bool position_bias, bool post_layernorm); void set_head_num(int64_t head_num); void set_head_size(int64_t head_size); - void set_cross(bool cross); void set_post_layernorm(bool post_layernorm); void set_eps_layernorm1(float eps_layernorm1); void set_eps_layernorm2(float eps_layernorm2); @@ -55,7 +54,6 @@ class MIND_API EncoderLayer : public BaseOperator { void set_position_bias(bool position_bias); int64_t get_head_num() const; int64_t get_head_size() const; - bool get_cross() const; bool get_post_layernorm() const; float get_eps_layernorm1() const; float get_eps_layernorm2() const; diff --git a/mindspore/core/ops/op_name.h b/mindspore/core/ops/op_name.h index 41bfb133198..bbee92a21d3 100644 --- a/mindspore/core/ops/op_name.h +++ b/mindspore/core/ops/op_name.h @@ -378,6 +378,14 @@ constexpr auto kEncoderLayerPostLayernorm = "post_layernorm"; constexpr auto kEncoderLayerFfnHiddenSize = "ffn_hidden_size"; constexpr auto kEncoderLayerEpsLayerNorm1 = "eps_layernorm1"; constexpr auto kEncoderLayerEpsLayerNorm2 = "eps_layernorm2"; +constexpr auto kDecoderLayerNumHeads = "head_num"; +constexpr auto kDecoderLayerSizePerHead = "head_size"; +constexpr auto kDecoderLayerPostLayernorm = "post_layernorm"; +constexpr auto kDecoderLayerFfnHiddenSize = "ffn_hidden_size"; +constexpr auto kDecoderLayerEpsLayerNorm1 = "eps_layernorm1"; +constexpr auto kDecoderLayerEpsLayerNorm2 = "eps_layernorm2"; +constexpr auto kDecoderLayerPositionBias1 = "position_bias1"; +constexpr auto kDecoderLayerPositionBias2 = "position_bias2"; constexpr auto kPositionBias = "position_bias"; constexpr size_t kInputIndex0 = 0; diff --git a/mindspore/lite/schema/ops.fbs b/mindspore/lite/schema/ops.fbs index ee69ddee851..8616976c161 100644 --- a/mindspore/lite/schema/ops.fbs +++ b/mindspore/lite/schema/ops.fbs @@ -232,6 +232,7 @@ union PrimitiveType { Log1p, TensorScatterAdd, EncoderLayer, + DecoderLayer, } table Abs { @@ -395,7 +396,6 @@ table Attention { head_num: long; head_size: long; cross: bool; - position_bias: bool; } table Conv2DBackpropFilterFusion { @@ -1302,10 +1302,20 @@ table TensorScatterAdd { table EncoderLayer { head_num: long; head_size: long; - cross: bool; post_layernorm: bool; - eps_layernorm1: bool; - eps_layernorm2: bool; - ffn_hidden_size: bool; + eps_layernorm1: float; + eps_layernorm2: float; + ffn_hidden_size: long; position_bias: bool; } + +table DecoderLayer { + head_num: long; + head_size: long; + post_layernorm: bool; + eps_layernorm1: float; + eps_layernorm2: float; + ffn_hidden_size: long; + position_bias1: bool; + position_bias2: bool; +} diff --git a/mindspore/lite/src/common/ops/ops_def.cc b/mindspore/lite/src/common/ops/ops_def.cc index a1af85f4dcb..79905910bc8 100644 --- a/mindspore/lite/src/common/ops/ops_def.cc +++ b/mindspore/lite/src/common/ops/ops_def.cc @@ -232,6 +232,7 @@ OP_TYPE(GroupNormFusion) OP_TYPE(Log1p) OP_TYPE(TensorScatterAdd) OP_TYPE(EncoderLayer) +OP_TYPE(DecoderLayer) OP_TYPE_DEF_END(PrimitiveType) OP_SCHEMA_DEF(Abs) @@ -1301,9 +1302,20 @@ OP_SCHEMA_DEF_END(TensorScatterAdd) OP_SCHEMA_DEF(EncoderLayer) OP_ATTR(head_num, long) OP_ATTR(head_size, long); -OP_ATTR(cross, bool) OP_ATTR(post_layernorm, bool) -OP_ATTR(eps_layernorm1, bool) -OP_ATTR(eps_layernorm2, bool) -OP_ATTR(ffn_hidden_size, bool) -OP_SCHEMA_DEF_END(EncoderLayer) \ No newline at end of file +OP_ATTR(eps_layernorm1, float) 
+OP_ATTR(eps_layernorm2, float) +OP_ATTR(ffn_hidden_size, long) +OP_ATTR(position_bias, bool) +OP_SCHEMA_DEF_END(EncoderLayer) + +OP_SCHEMA_DEF(DecoderLayer) +OP_ATTR(head_num, long) +OP_ATTR(head_size, long); +OP_ATTR(post_layernorm, bool) +OP_ATTR(eps_layernorm1, float) +OP_ATTR(eps_layernorm2, float) +OP_ATTR(ffn_hidden_size, long) +OP_ATTR(position_bias1, bool) +OP_ATTR(position_bias2, bool) +OP_SCHEMA_DEF_END(DecoderLayer) \ No newline at end of file diff --git a/mindspore/lite/src/common/ops/ops_func_declare.h b/mindspore/lite/src/common/ops/ops_func_declare.h index 799e865c7fc..9efc34da884 100644 --- a/mindspore/lite/src/common/ops/ops_func_declare.h +++ b/mindspore/lite/src/common/ops/ops_func_declare.h @@ -261,6 +261,7 @@ #include "ops/format_transpose.h" #include "ops/gather_d.h" #include "ops/tensor_scatter_add.h" +#include "ops/decoder_layer.h" namespace mindspore::lite::ops { #define FUNC_MSOP2SCHEMAOP_DECLARE(OP) std::unique_ptr MSOp2SchemaOp(const mindspore::ops::OP *op); @@ -490,6 +491,8 @@ FUNC_MSOP2SCHEMAOP_DECLARE(GroupNormFusion) FUNC_MSOP2SCHEMAOP_DECLARE(Log1p) FUNC_MSOP2SCHEMAOP_DECLARE(TensorScatterAdd) FUNC_MSOP2SCHEMAOP_DECLARE(EncoderLayer) +FUNC_MSOP2SCHEMAOP_DECLARE(DecoderLayer) + #endif } // namespace mindspore::lite::ops #else diff --git a/mindspore/lite/src/common/ops/ops_utils.cc b/mindspore/lite/src/common/ops/ops_utils.cc index d1baadcee51..55e48e90285 100644 --- a/mindspore/lite/src/common/ops/ops_utils.cc +++ b/mindspore/lite/src/common/ops/ops_utils.cc @@ -273,6 +273,7 @@ REG_MINDSPORE_OPERATOR(GroupNormFusion) REG_MINDSPORE_OPERATOR(Log1p) REG_MINDSPORE_OPERATOR(TensorScatterAdd) REG_MINDSPORE_OPERATOR(EncoderLayer) +REG_MINDSPORE_OPERATOR(DecoderLayer) } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/common/ops/populate/encoder_layer_populate.cc b/mindspore/lite/src/common/ops/populate/encoder_layer_populate.cc index 4a478f4e232..64b46708dc5 100644 --- a/mindspore/lite/src/common/ops/populate/encoder_layer_populate.cc +++ b/mindspore/lite/src/common/ops/populate/encoder_layer_populate.cc @@ -34,10 +34,17 @@ OpParameter *PopulateEncoderLayerParameter(const void *prim) { param->op_parameter_.type_ = primitive->value_type(); param->head_num_ = value->head_num(); param->head_size_ = value->head_size(); - param->cross_ = value->cross(); + param->post_layernorm_ = value->post_layernorm(); + param->eps_layernorm1_ = value->eps_layernorm1(); + param->eps_layernorm2_ = value->eps_layernorm2(); + param->position_bias_ = value->position_bias(); + + + return reinterpret_cast(param); } REG_POPULATE(PrimitiveType_EncoderLayer, PopulateEncoderLayerParameter, SCHEMA_CUR) } // namespace lite } // namespace mindspore + diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc index a1df34d9f3d..a2d5f410945 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc @@ -116,19 +116,27 @@ int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { MS_LOG(ERROR) << "op action convert failed"; return RET_ERROR; } - int head_number = encoder_op->get_head_num(); - int head_size = encoder_op->get_head_size(); - bool post_layernorm = encoder_op->get_post_layernorm(); - float eps1 = encoder_op->get_eps_layernorm1(); - float eps2 = encoder_op->get_eps_layernorm2(); - int ffn_hidden_size = encoder_op->get_ffn_hidden_size(); + fastertransformer::encoderParamT params; 
+ memset_s(¶ms, sizeof(params), 0, sizeof(params)); + params.head_num = encoder_op->get_head_num(); + params.head_size = encoder_op->get_head_size(); + params.layernorm_post = encoder_op->get_post_layernorm(); + params.eps1 = encoder_op->get_eps_layernorm1(); + params.eps2 = encoder_op->get_eps_layernorm2(); + params.ffn_hidden_size = encoder_op->get_ffn_hidden_size(); + params.is_cross = false; + params.ffn_fp16 = is_ffn_fp16_; + params.position_bias = encoder_op->get_position_bias(); + params.cublas_handle=GetCublasHandle(); + params.qkv_bias = true; + params.projection_bias = true; + params.hidden_size = params.head_num * params.head_size; + params.in_idx = 0; auto compute_type = runtime_->GetRuntimePrecisionMode(); - bool is_cross = encoder_op->get_cross(); - bool ffn_fp16 = is_ffn_fp16_; - bool is_position_bias = encoder_op->get_position_bias(); + size_t start_fp16=0, end_fp16=0; if(is_ffn_fp16_){ - if (post_layernorm){ + if (params.layernorm_post){ start_fp16=7; end_fp16=11; } else { @@ -140,21 +148,16 @@ if(is_ffn_fp16_){ if (in_tensors_[i].IsConst() || in_tensor.trt_tensor_ == nullptr) { if (i > start_fp16 && i < end_fp16) { in_tensor.trt_tensor_ = castTensor(ctx, in_tensors_[i], op_name_); - // in_tensor.format_ = Format::NCHW; ctx->RegisterTensor(in_tensor, in_tensors_[i].Name()); continue; } in_tensor.trt_tensor_ = lite::ConvertConstantTensor(ctx, in_tensors_[i], op_name_); - // in_tensor.format_ = Format::NCHW; ctx->RegisterTensor(in_tensor, in_tensors_[i].Name()); } } } nvinfer1::ITensor *input_tensor = input(ctx, 0).trt_tensor_; - auto plugin = std::make_shared(input_tensor->getName(), compute_type, head_number, head_size, - ffn_hidden_size, is_cross, post_layernorm, is_position_bias, eps1, eps2, ffn_fp16, - - GetCublasHandle(), GetCublasLtHandle(), device_id_); + auto plugin = std::make_shared(input_tensor->getName(), compute_type, params, GetCublasLtHandle(), device_id_); const int input_number = inputs().size(); nvinfer1::ITensor *inputTensors[input_number]; for (int i = 0; i < input_number; i++) { @@ -195,26 +198,9 @@ template int EncoderPlugin::RunCudaEncoder(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream, cublasGemmAlgo_t algoId) { - // for (int i = 0; i < 14; i++) { - // std::cout << i << ": " << inputDesc[i].type << std::endl; - // printTensor((char *)"tensor", (T *)(inputs[i]), 8); - // } - params_.in_idx = 0; params_.stream = stream; params_.in_idx=0; params_.algo = algoId; -// void **inputs_encdoer=nullptr; -// for (int i=0; i < 14; i++) { -// inputs_encdoer[i]=(void*)(inputs[i]); -// } -// for (int i = 0; i < 14; i++) { -// // std::cout << i << ": " << inputDesc[i].type << std::endl; -// if (i >7 && i < 11) { -// printTensor((char *)"tensor", (half *)(inputs[i]), 8); -// } else -// printTensor((char *)"tensor", (T *)(inputs[i]), 8); -// // } -// } fastertransformer::forwardEncoder((void**)inputs, C14NUM, (void **)outputs, 1, ¶ms_, workspace); return RET_OK; @@ -235,25 +221,9 @@ size_t EncoderPlugin::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, const size_t request_batch_size = static_cast(inputs[0].dims.d[0]); const size_t request_src_seq_len = static_cast(inputs[0].dims.d[1]); const size_t request_tgt_seq_len = request_src_seq_len; - params_.ffn_fp16 = ffn_fp16_; params_.batch_size = request_batch_size; params_.src_seq_len = request_src_seq_len; params_.tgt_seq_len = request_tgt_seq_len; - params_.head_num = head_number_; 
- params_.head_size = head_size_; - params_.hidden_size = head_number_ * head_size_; - params_.ffn_hidden_size = ffn_hidden_size_; - params_.is_cross = is_cross_; - // handle - params_.cublas_handle = cublas_handle_; - // ctrls - params_.qkv_bias = true; - params_.projection_bias = true; - params_.position_bias = is_position_bias_; - params_.layernorm_post = post_layernrom_; - params_.eps1 = eps1_; - params_.eps2 = eps2_; - if (compute_type_ == RuntimePrecisionMode_FP16) { return fastertransformer::GetEncoderLayerWorkspaceSize(¶ms_); } else { @@ -289,19 +259,11 @@ nvinfer1::IPluginV2DynamicExt *EncoderPlugin::clone() const noexcept { return plugin; } -size_t EncoderPlugin::getSerializationSize() const noexcept { return INPUT_SIZE5 * sizeof(int); } +size_t EncoderPlugin::getSerializationSize() const noexcept { return sizeof(int) + sizeof(fastertransformer::encoderParamT); } void EncoderPlugin::serialize(void *buffer) const noexcept { SerializeValue(&buffer, &compute_type_, sizeof(int)); - SerializeValue(&buffer, &head_number_, sizeof(int)); - SerializeValue(&buffer, &head_size_, sizeof(int)); - SerializeValue(&buffer, &is_cross_, sizeof(bool)); - SerializeValue(&buffer, &ffn_hidden_size_, sizeof(int)); - SerializeValue(&buffer, &eps1_, sizeof(int)); - SerializeValue(&buffer, &eps2_, sizeof(int)); - SerializeValue(&buffer, &post_layernrom_, sizeof(bool)); - SerializeValue(&buffer, &ffn_fp16_, sizeof(bool)); - + SerializeValue(&buffer, ¶ms_, sizeof(fastertransformer::encoderParamT)); } REGISTER_TENSORRT_CREATOR(ops::kNameEncoderLayer, EncoderTensorRT) } // namespace mindspore::lite diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h index ec6f95f196e..66aed42355a 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h @@ -45,48 +45,25 @@ constexpr auto ENCODER_PLUGIN_NAME{"EncoderPlugin"}; class EncoderPlugin : public TensorRTPlugin { public: - EncoderPlugin(const std::string name, int compute_type, int head_number, int head_size, int ffn_hidden_size, bool is_cross, bool post_layernorm, bool is_position_bias, float eps1, float eps2, bool fp16, - cublasHandle_t cublas_handle, cublasLtHandle_t cublaslt_handle, uint32_t device_id) + EncoderPlugin(const std::string name, int compute_type, fastertransformer::encoderParamT params, cublasLtHandle_t cublaslt_handle, uint32_t device_id) : TensorRTPlugin(name, std::string(ENCODER_PLUGIN_NAME), device_id), compute_type_(compute_type), - head_number_(head_number), - head_size_(head_size), - ffn_hidden_size_(ffn_hidden_size), - is_cross_(is_cross), - is_position_bias_(is_position_bias), - eps1_(eps1), - eps2_(eps2), - post_layernrom_(post_layernorm), - ffn_fp16_(fp16), - cublas_handle_(cublas_handle), + params_(params), cublaslt_handle_(cublaslt_handle) - {} + {} EncoderPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) : TensorRTPlugin(std::string(name), std::string(ENCODER_PLUGIN_NAME)) { const nvinfer1::PluginField *fields = fc->fields; compute_type_ = static_cast(fields[0].data)[0]; - head_number_ = static_cast(fields[1].data)[0]; - head_size_ = static_cast(fields[2].data)[0]; - is_cross_ = static_cast(fields[3].data)[0]; - ffn_hidden_size_ = static_cast(fields[4].data)[0]; - eps1_ = static_cast(fields[5].data)[0]; - eps2_ = static_cast(fields[6].data)[0]; - post_layernrom_= static_cast(fields[7].data)[0]; - ffn_fp16_= 
static_cast(fields[8].data)[0]; + params_ = static_cast(fields[1].data)[0]; + cublaslt_handle_ = static_cast(fields[2].data)[0]; } EncoderPlugin(const char *name, const void *serialData, size_t serialLength) : TensorRTPlugin(std::string(name), std::string(ENCODER_PLUGIN_NAME)) { DeserializeValue(&serialData, &serialLength, &compute_type_, sizeof(int)); - DeserializeValue(&serialData, &serialLength, &head_number_, sizeof(int)); - DeserializeValue(&serialData, &serialLength, &head_size_, sizeof(int)); - DeserializeValue(&serialData, &serialLength, &is_cross_, sizeof(int)); - DeserializeValue(&serialData, &serialLength, &ffn_hidden_size_, sizeof(int)); - DeserializeValue(&serialData, &serialLength, &eps1_, sizeof(float)); - DeserializeValue(&serialData, &serialLength, &eps1_, sizeof(float)); - DeserializeValue(&serialData, &serialLength, &post_layernrom_, sizeof(bool)); - DeserializeValue(&serialData, &serialLength, &ffn_fp16_, sizeof(bool)); + DeserializeValue(&serialData, &serialLength, ¶ms_, sizeof(fastertransformer::encoderParamT)); } EncoderPlugin() = delete; @@ -107,11 +84,6 @@ class EncoderPlugin : public TensorRTPlugin { int nbOutputs) noexcept override; private: - mutable fastertransformer::encoderParamT params_; - template - - half* convertToHalf(T* input, int size); - template int RunCudaEncoder(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream, @@ -120,16 +92,7 @@ class EncoderPlugin : public TensorRTPlugin { const std::string layer_name_; std::string name_space_; int compute_type_; - int head_number_; - int head_size_; - int ffn_hidden_size_; - bool is_cross_; - bool is_position_bias_; - float eps1_; - float eps2_; - bool post_layernrom_; - bool ffn_fp16_ ; - cublasHandle_t cublas_handle_; + mutable fastertransformer::encoderParamT params_; cublasLtHandle_t cublaslt_handle_; }; class EncoderPluginCreater : public TensorRTPluginCreater { diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc index 3d266051a4f..f6416a3266e 100755 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc @@ -91,9 +91,18 @@ int MhaTensorRT::AddInnerOp(TensorRTContext *ctx) { bool is_cross = mha_op->get_cross(); bool is_position_bias = mha_op->get_position_bias(); nvinfer1::ITensor *input_tensor = input(ctx, 0).trt_tensor_; - - auto plugin = std::make_shared(input_tensor->getName(), compute_type, head_number, head_size,is_position_bias, is_cross, - GetCublasHandle(), GetCublasLtHandle(), device_id_); + fastertransformer::encoderParamT params; + memset_s(¶ms, sizeof(params), 0, sizeof(params)); + params.head_num = head_number; + params.head_size = head_size; + params.hidden_size = head_number * head_size; + params.cublas_handle = GetCublasHandle(); + params.in_idx = 0; + params.qkv_bias = !is_position_bias; + params.projection_bias = !is_position_bias; + params.is_cross = is_cross; + params.position_bias = is_position_bias; + auto plugin = std::make_shared(input_tensor->getName(), compute_type, params, GetCublasLtHandle(), device_id_); const int input_number = inputs().size(); nvinfer1::ITensor *inputTensors[input_number]; for (int i = 0; i < input_number; i++) { @@ -159,16 +168,16 @@ std::vector TensorRTPluginCreater::fields_; int MhaPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const 
nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept { if (compute_type_ == RuntimePrecisionMode_FP16) { - return RunCudaMha(inputDesc, outputDesc, inputs, outputs, workspace, stream, fast_algo_gemm); + return RunCudaMha(inputDesc, outputDesc, inputs, outputs, workspace, stream, CUBLAS_GEMM_DEFAULT_TENSOR_OP); } else { - return RunCudaMha(inputDesc, outputDesc, inputs, outputs, workspace, stream, fast_algo_gemm); + return RunCudaMha(inputDesc, outputDesc, inputs, outputs, workspace, stream, CUBLAS_GEMM_DEFAULT_TENSOR_OP); } } template int MhaPlugin::RunCudaMha(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream, - cublasGemmAlgo_t *algoId) { + cublasGemmAlgo_t algoId) { // inputs order: // 0] Q // 1] K @@ -189,7 +198,7 @@ int MhaPlugin::RunCudaMha(const nvinfer1::PluginTensorDesc *inputDesc, const nvi // 7] PB // 8] AttnMask int cross_tensor_offset = 0; - if (is_cross_) { + if (params_.is_cross) { cross_tensor_offset = 1; } const int weight_projection_tensor_idx = 4 + cross_tensor_offset; @@ -199,12 +208,12 @@ int MhaPlugin::RunCudaMha(const nvinfer1::PluginTensorDesc *inputDesc, const nvi const int weight_qkv_tensor_idx = 3 + cross_tensor_offset; const int position_bias_tensor_idx = 6+ cross_tensor_offset; params_.stream = stream; - params_.algo = algoId[0]; - params_.in_idx = 0; + params_.algo = algoId; + params_.in_idx = 0; int in_len; // TODO position_bias - if (is_cross_ && is_position_bias_) { + if (params_.is_cross && params_.position_bias) { void *inputs_attn[] = {(void *)(inputs[0]), (void *)(inputs[1]), (void *)(inputs[weight_qkv_tensor_idx - 1]), @@ -216,7 +225,7 @@ int MhaPlugin::RunCudaMha(const nvinfer1::PluginTensorDesc *inputDesc, const nvi in_len = 7; fastertransformer::forward_attn((T **)inputs_attn, in_len, (T **)outputs, 1, ¶ms_, workspace); } - else if (is_cross_) { + else if (params_.is_cross) { void *inputs_attn[] = {(void *)(inputs[0]), (void *)(inputs[1]), (void *)(inputs[weight_qkv_tensor_idx - 1]), @@ -228,7 +237,7 @@ int MhaPlugin::RunCudaMha(const nvinfer1::PluginTensorDesc *inputDesc, const nvi in_len = 8; fastertransformer::forward_attn((T **)inputs_attn, in_len, (T **)outputs, 1, ¶ms_, workspace); - } else if (is_position_bias_) { + } else if (params_.position_bias) { in_len = 5; void *inputs_attn[] = {(void *)(inputs[0]), (void *)(inputs[weight_qkv_tensor_idx]), @@ -268,8 +277,8 @@ size_t MhaPlugin::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const noexcept { int cross_tensor_offset = 0; int position_bias_tensor_offsets = 0; - if (is_cross_) cross_tensor_offset = 1; - if (is_position_bias_) position_bias_tensor_offsets =1; + if (params_.is_cross) cross_tensor_offset = 1; + if (params_.position_bias) position_bias_tensor_offsets =1; const int attn_mask_tensor_idx = 7 + cross_tensor_offset -position_bias_tensor_offsets; const int request_batch_size = static_cast(inputs[attn_mask_tensor_idx].dims.d[0]); const int request_src_seq_len = static_cast(inputs[attn_mask_tensor_idx].dims.d[1]); @@ -277,21 +286,9 @@ size_t MhaPlugin::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int params_.batch_size = request_batch_size; params_.src_seq_len = request_src_seq_len; params_.tgt_seq_len = request_tgt_seq_len; - params_.head_num = head_number_; - params_.head_size = head_size_; - 
params_.hidden_size = head_number_ * head_size_; - params_.cublas_handle = cublas_handle_; - params_.in_idx = 0; - params_.qkv_bias = !is_position_bias_; - params_.projection_bias = !is_position_bias_; - params_.is_cross = is_cross_; - params_.position_bias = is_position_bias_; if (compute_type_ == RuntimePrecisionMode_FP16) { - std::cout<<"fp16 GetAttnWorkspaceSize\n"; - return fastertransformer::GetAttnWorkspaceSize(¶ms_); } else { - std::cout<<"fp32 GetAttnWorkspaceSize\n"; return fastertransformer::GetAttnWorkspaceSize(¶ms_); } } @@ -316,21 +313,21 @@ nvinfer1::DimsExprs MhaPlugin::getOutputDimensions(int32_t index, const nvinfer1 if (num_dims == INPUT_SIZE2) { dims.d[0] = exprBuilder.constant(inputs[nbInputDims - 1].d[0]->getConstantValue() * inputs[nbInputDims - 1].d[1]->getConstantValue()); - auto hidden_size = exprBuilder.constant(head_size_ * head_number_); + auto hidden_size = exprBuilder.constant(params_.head_size * params_.head_num); dims.d[1] = hidden_size; } else if (num_dims == INPUT_SIZE3) { dims.d[0] = inputs[nbInputDims - 1].d[0]; // batch dims.d[1] = inputs[nbInputDims - 1].d[(inputs[nbInputDims - 1].nbDims) - 1]; - auto hidden_size = exprBuilder.constant(head_size_ * head_number_); + auto hidden_size = exprBuilder.constant(params_.head_size * params_.head_num); dims.d[kTwo] = hidden_size; } } else { // TODO(Haim) - Fix size in case of 2d input dims.nbDims = INPUT_SIZE4; dims.d[0] = inputs[nbInputDims - 1].d[0]; // batch - dims.d[1] = exprBuilder.constant(head_number_); + dims.d[1] = exprBuilder.constant(params_.head_num); dims.d[kTwo] = inputs[nbInputDims - 1].d[(inputs[nbInputDims - 1].nbDims) - 1]; - dims.d[kThree] = exprBuilder.constant(head_size_); + dims.d[kThree] = exprBuilder.constant(params_.head_size); } #else dims.nbDims = C2NUM; @@ -356,14 +353,11 @@ int MhaPlugin::initialize() noexcept { return 0; } void MhaPlugin::terminate() noexcept {} -size_t MhaPlugin::getSerializationSize() const noexcept { return INPUT_SIZE4 * sizeof(int); } +size_t MhaPlugin::getSerializationSize() const noexcept { return sizeof(int) + sizeof(fastertransformer::encoderParamT); } void MhaPlugin::serialize(void *buffer) const noexcept { SerializeValue(&buffer, &compute_type_, sizeof(int)); - SerializeValue(&buffer, &head_number_, sizeof(int)); - SerializeValue(&buffer, &head_size_, sizeof(int)); - SerializeValue(&buffer, &is_cross_, sizeof(bool)); - SerializeValue(&buffer, &is_position_bias_, sizeof(bool)); + SerializeValue(&buffer, ¶ms_, sizeof(fastertransformer::encoderParamT)); } REGISTER_TENSORRT_CREATOR(ops::kNameAttention, MhaTensorRT) } // namespace mindspore::lite diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.h index 8583a440119..73812de7a9e 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.h @@ -43,34 +43,25 @@ class MhaTensorRT : public TensorRTOp { constexpr auto MHA_PLUGIN_NAME{"AttentionPlugin"}; class MhaPlugin : public TensorRTPlugin { public: - MhaPlugin(const std::string name, int compute_type, int head_number, int head_size,bool is_position_bias, bool is_cross, - cublasHandle_t cublas_handle, cublasLtHandle_t cublaslt_handle, uint32_t device_id) + MhaPlugin(const std::string name, int compute_type, fastertransformer::encoderParamT params, cublasLtHandle_t cublaslt_handle, uint32_t device_id) : TensorRTPlugin(name, std::string(MHA_PLUGIN_NAME), device_id), 
compute_type_(compute_type), - head_number_(head_number), - head_size_(head_size), - is_cross_(is_cross), - is_position_bias_(is_position_bias), - cublas_handle_(cublas_handle), - cublaslt_handle_(cublaslt_handle) {} + params_(params), + cublaslt_handle_(cublaslt_handle) + {} MhaPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) : TensorRTPlugin(std::string(name), std::string(MHA_PLUGIN_NAME)) { const nvinfer1::PluginField *fields = fc->fields; compute_type_ = static_cast(fields[0].data)[0]; - head_number_ = static_cast(fields[1].data)[0]; - head_size_ = static_cast(fields[2].data)[0]; - is_cross_ = static_cast(fields[3].data)[0]; - is_position_bias_ = static_cast(fields[4].data)[0]; + params_ = static_cast(fields[1].data)[0]; + cublaslt_handle_ = static_cast(fields[2].data)[0]; } MhaPlugin(const char *name, const void *serialData, size_t serialLength) : TensorRTPlugin(std::string(name), std::string(MHA_PLUGIN_NAME)) { DeserializeValue(&serialData, &serialLength, &compute_type_, sizeof(int)); - DeserializeValue(&serialData, &serialLength, &head_number_, sizeof(int)); - DeserializeValue(&serialData, &serialLength, &head_size_, sizeof(int)); - DeserializeValue(&serialData, &serialLength, &is_cross_, sizeof(int)); - DeserializeValue(&serialData, &serialLength, &is_position_bias_, sizeof(int)); + DeserializeValue(&serialData, &serialLength, ¶ms_, sizeof(fastertransformer::encoderParamT)); } MhaPlugin() = delete; @@ -96,39 +87,15 @@ class MhaPlugin : public TensorRTPlugin { int initialize() noexcept override; private: - mutable fastertransformer::encoderParamT params_; - bool needResize(const int *current_dims, const int *last_dims); template int RunCudaMha(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream, - cublasGemmAlgo_t *algoId); - template - void SetInnerAddr(void *workspace, size_t size_q, size_t size_k, size_t qk_buf_len, size_t qkv_buf_2_len, - size_t extra_size); - template - void RunPhase1GEMM(const nvinfer1::PluginTensorDesc *inputDesc, const void *const *inputs, int *gemm_dims, - int *gemm_lds, cublasOperation_t *gemm_ops, cudaDataType *gemm_data_types, void *alpha, void *beta, - cublasGemmAlgo_t algoId, cudaStream_t stream); - + cublasGemmAlgo_t algoId); const std::string layer_name_; std::string name_space_; int compute_type_; - int head_number_; - int head_size_; - bool is_cross_; - bool is_position_bias_; - cublasGemmAlgo_t fast_algo_gemm[4] = {CUBLAS_GEMM_DEFAULT_TENSOR_OP, CUBLAS_GEMM_DEFAULT_TENSOR_OP, - CUBLAS_GEMM_DEFAULT_TENSOR_OP, CUBLAS_GEMM_DEFAULT_TENSOR_OP}; - - cublasHandle_t cublas_handle_; + mutable fastertransformer::encoderParamT params_; cublasLtHandle_t cublaslt_handle_; - void *qkv_buf_{nullptr}; - void *q_buf_2_{nullptr}; - void *qk_buf_{nullptr}; - void *qkv_buf_2_{nullptr}; - void *qkv_buf_3_{nullptr}; - void *output1_{nullptr}; - void *output2_{nullptr}; }; class MhaPluginCreater : public TensorRTPluginCreater { public: diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/tensorrt_subgraph.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/tensorrt_subgraph.cc index e2c792c9a01..27ff0a2ffff 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/tensorrt_subgraph.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/tensorrt_subgraph.cc @@ -934,32 +934,29 @@ bool TensorRTSubGraph::ValidInputResizeDims(const nvinfer1::Dims &construct_dims } return true; } -#define async -#ifdef async -int 
TensorRTSubGraph::Execute(const std::vector &inputs, std::vector *outputs) { - int ret = lite::SetCudaDevice(device_info_); - if (ret != RET_OK) { - return ret; - } - // std::cout<<"stream_1"<<trt_context_->enqueueV2(tensor_bindings_,stream_, nullptr)) { - MS_LOG(ERROR) << "TensorRT execute failed."; - return RET_ERROR; - } - ret = PostExecute(outputs, false); - if (ret != RET_OK) { - return ret; - } - // std::cout<<"stream_4"< &inputs, std::vector *outputs) { +// int ret = lite::SetCudaDevice(device_info_); +// if (ret != RET_OK) { +// return ret; +// } +// ret = PreExecute(inputs, *outputs, false); +// if (ret != RET_OK) { +// return ret; +// } +// if (!this->trt_context_->enqueueV2(tensor_bindings_,stream_, nullptr)) { +// MS_LOG(ERROR) << "TensorRT execute failed."; +// return RET_ERROR; +// } +// ret = PostExecute(outputs, false); +// if (ret != RET_OK) { +// return ret; +// } +// return cudaStreamSynchronize(stream_); -} -#endif +// } +// #endif #ifndef async int TensorRTSubGraph::Execute(const std::vector &inputs, std::vector *outputs) { int ret = lite::SetCudaDevice(device_info_); diff --git a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc index 3776af1b502..4da61439ed2 100755 --- a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc @@ -109,7 +109,7 @@ VectorRef EncoderLayerFusion::getTuple(bool post_layernorm, bool layernorm_fusio } VectorRef layer_norm, tuple; if (layernorm_fusion) { - return DefineLayerNorm(reshape1, gamma1_, beta1_); + return DefineLayerNorm(is_position_bias,reshape1, gamma1_, beta1_); } layer_norm = VectorRef({is_layernorm1_, reshape1, gamma1_, beta1_}); auto is_tuple = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_itme"); @@ -118,7 +118,7 @@ VectorRef EncoderLayerFusion::getTuple(bool post_layernorm, bool layernorm_fusio return tuple; } -VectorRef EncoderLayerFusion::DefineLayerNorm(VectorRef input, VarPtr gamma, VarPtr beta) const { +VectorRef EncoderLayerFusion::DefineLayerNorm(bool is_position_bias,VectorRef input, VarPtr gamma, VarPtr beta) const { auto var1 = std::make_shared("var1"); MS_CHECK_TRUE_RET(var1 != nullptr, {}); auto is_reduce = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReduceFusion), "reduce"); @@ -130,7 +130,7 @@ VectorRef EncoderLayerFusion::DefineLayerNorm(VectorRef input, VarPtr gamma, Var auto is_sqr = std::make_shared(std::bind(IsOpType, p1, prim::kPrimSquare), "sqr"); MS_CHECK_TRUE_RET(is_sqr != nullptr, {}); VectorRef sqr; - if (is_position_bias_) { + if (is_position_bias) { sqr = VectorRef({is_sqr, input}); } else { sqr = VectorRef({is_sqr, sub}); @@ -151,11 +151,11 @@ VectorRef EncoderLayerFusion::DefineLayerNorm(VectorRef input, VarPtr gamma, Var auto is_div = std::make_shared(std::bind(IsOpType, p1, prim::kPrimRealDiv), "real-div"); MS_CHECK_TRUE_RET(is_div != nullptr, {}); VectorRef real_div; - if (is_position_bias_) { + if (is_position_bias) { real_div = VectorRef({is_div, input, sqr2}); auto is_mul = std::make_shared(std::bind(IsOpType, p1, prim::kPrimMulFusion), "mul"); MS_CHECK_TRUE_RET(is_mul != nullptr, {}); - auto mul = VectorRef({is_mul, real_div, gamma1_}); + auto mul = VectorRef({is_mul, real_div, gamma}); return mul; } else { real_div = VectorRef({is_div, sub, sqr2}); @@ -204,7 +204,7 @@ VectorRef EncoderLayerFusion::DefinePatternEncoderLayer(bool post_layernorm = tr } VectorRef layer_norm2, tuple2, matmul2; if 
(layernorm_fusion) { - layer_norm2 = DefineLayerNorm(add, gamma2_, beta2_); + layer_norm2 = DefineLayerNorm(is_position_bias,add, gamma2_, beta2_); tuple2 = layer_norm2; } else { layer_norm2 = VectorRef({is_layernorm2_, add, gamma2_, beta2_}); @@ -255,7 +255,7 @@ VectorRef EncoderLayerFusion::DefinePatternEncoderLayer(bool post_layernorm = tr auto reshape4 = VectorRef({is_reshape4, add3, var4}); VectorRef layer_norm, tuple; if (layernorm_fusion) { - layer_norm = DefineLayerNorm(reshape4, gamma1_, beta1_); + layer_norm = DefineLayerNorm(is_position_bias,reshape4, gamma1_, beta1_); tuple = layer_norm; } else { layer_norm = VectorRef({is_layernorm1_, reshape4, gamma1_, beta1_}); @@ -271,9 +271,10 @@ VectorRef EncoderLayerFusion::DefinePatternEncoderLayer(bool post_layernorm = tr auto reshape5 = VectorRef({is_reshape5, tuple, var5}); return reshape5; } + + std::shared_ptr CastTensors(const std::shared_ptr tensor) { auto &shape = tensor->shape(); - // calculate shape mindspore::TypeId id = kNumberTypeFloat16; auto cast_tensor = std::make_shared(id, shape); @@ -288,84 +289,8 @@ std::shared_ptr CastTensors(const std::shared_ptr(cast_tensor->data_c()) + offset; -// memcpy_s(ptr, cast_tensor->Size() - offset, tensor->data_c(), tensor->Size()); -// offset += tensor->Size(); -// } -// if (transpose) { -// std::vector tshape = {new_shape[1], new_shape[0]}; -// auto transposed_tensor = std::make_shared(base_data_type, tshape); -// switch (base_data_type) { -// case kNumberTypeFloat32: { -// auto status = TransposeMatrix(cast_tensor, transposed_tensor); -// MS_CHECK_TRUE_RET(status == RET_OK, nullptr); -// break; -// } -// case kNumberTypeFloat16: { -// auto status = TransposeMatrix(cast_tensor, transposed_tensor); -// MS_CHECK_TRUE_RET(status == RET_OK, nullptr); -// break; -// } -// default: -// MS_LOG(ERROR) << "unsupported data type " << base_data_type << std::endl; -// } -// return transposed_tensor; -// } -// return cast_tensor; - -// VectorRef EncoderLayerFusion::DefinePatternEncoderLayerNorm(bool post_layernorm = true, -// bool layernorm_fusion = false) const { -// std::cout << "DefinePatternEncoderLayer post=" << post_layernorm << " layernorm_fusion=" << layernorm_fusion -// << std::endl; -// auto is_reshape1 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-encoder"); -// MS_CHECK_TRUE_RET(is_reshape1 != nullptr, {}); -// auto var1 = std::make_shared("var1"); -// MS_CHECK_TRUE_RET(var1 != nullptr, {}); -// auto reshape1 = VectorRef({is_reshape1, input_, var1}); -// auto attention = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion), -// getTuple(post_layernorm, layernorm_fusion), getTuple(post_layernorm, layernorm_fusion), -// weight_attn_qkv_, weight_attn_o_, bias_attn_qkv_, bias_attn_o_, mask_}); -// auto is_tuple3 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_itme3"); -// auto var_tuple3 = std::make_shared("var_tuple3"); -// auto tuple3 = VectorRef({is_tuple3, attention, var_tuple3}); -// auto is_add = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is_add"); -// VectorRef reshape2, matmul1, add; -// add = VectorRef({is_add, getTuple(post_layernorm, layernorm_fusion), tuple3}); -// VectorRef layer_norm2, tuple2; -// layer_norm2 = DefineLayerNorm(add, gamma2_, beta2_); -// tuple2 = layer_norm2; -// auto is_reshape2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-encoder2"); -// MS_CHECK_TRUE_RET(is_reshape2 != nullptr, {}); -// auto var2 = std::make_shared("var2"); -// 
MS_CHECK_TRUE_RET(var2 != nullptr, {}); -// auto is_matmul1 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimMatMulFusion), "is_matmul1"); -// MS_CHECK_TRUE_RET(is_matmul1 != nullptr, {}); -// if (is_position_bias_){ -// reshape2 = VectorRef({is_reshape2, add, var2}); -// } else{ -// reshape2 = VectorRef({is_reshape2, tuple2, var2}); -// } -// matmul1 = VectorRef({is_matmul1, tuple2, weight_m_, bias_m_}); -// auto is_act = std::make_shared(std::bind(IsOpType, p1, prim::kPrimActivation), "acrivation"); -// MS_CHECK_TRUE_RET(is_act != nullptr, {}); -// auto act = VectorRef({is_act, matmul1}); -// auto is_matmul2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimMatMulFusion), "is_matmul2"); -// MS_CHECK_TRUE_RET(is_matmul2 != nullptr, {}); -// auto matmul2 = VectorRef({is_matmul2, act, weight_p_, bias_p_}); -// auto is_reshape3 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-encoder3"); -// MS_CHECK_TRUE_RET(is_reshape3 != nullptr, {}); -// auto var3 = std::make_shared("var3"); -// MS_CHECK_TRUE_RET(var3 != nullptr, {}); -// auto reshape3 = VectorRef({is_reshape3, matmul2, var3}); -// auto is_add3 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is_add3"); -// auto add3 = VectorRef({is_add3, reshape2, reshape3}); -// return add3; -// } std::unordered_map EncoderLayerFusion::DefinePatterns() const { std::unordered_map patterns; if (!Init()) { @@ -385,14 +310,13 @@ AnfNodePtr EncoderLayerFusion::Process(const std::string &pattern_name, const mi if (func_graph == nullptr || node == nullptr || equiv == nullptr) { return nullptr; } - std::cout << "found pattern=" << pattern_name << std::endl; if (pattern_name == kPatternTEncoderLayerPost || pattern_name == kPatternTEncoderLayerPostNorm) { return CreateMaskedEncoderLayerFusionNode(func_graph, equiv, node, true); } else if (pattern_name == kPatternTEncoderLayerPre || pattern_name == kPatternTEncoderLayerPreNorm) { return CreateMaskedEncoderLayerFusionNode(func_graph, equiv, node, false); } else if (pattern_name == kPatternEncoderLayerT5) { is_position_bias_ = true; - return CreateMaskedEncoderLayerFusionNode(func_graph, equiv, node, true); + return CreateMaskedEncoderLayerFusionNode(func_graph, equiv, node, false); } return nullptr; } @@ -435,82 +359,6 @@ std::shared_ptr EncoderLayerFusion::BuildEncoderLayerFusionPr return enoder_layer_prim; } -// template -// STATUS EncoderLayerFusion::GetAttribute(const FuncGraphPtr &func_graph, const EquivPtr &equiv, VarPtr node_name, -// const char *attr_name, api::SharedPtr *attr)const { -// if ((*equiv)[node_name] == nullptr || !utils::isa((*equiv)[node_name])) { -// printf("node is not AnfNodePtr"); -// return RET_ERROR; -// } -// auto transpose_users = manager->node_users()[node]; -// auto user_node = transpose_users.front(); -// if (!CheckPrimitiveType(user_node.first, prim::kPrimTranspose)) { -// MS_LOG(ERROR) << " missing transpose node for branch " << index << std::endl; -// return RET_ERROR; -// } -// // connect get item to it -// transpose_users = manager->node_users()[user_node.first]; -// auto get_item = CreateOutputGetItem(func_graph, enoder_layer, index); -// MS_ASSERT(get_item != nullptr); -// if (transpose_users.size() == 1) { -// auto &snode = transpose_users.front(); -// manager->SetEdge(snode.first, snode.second, get_item); -// } else { -// for (auto &snode : transpose_users) { -// if (CheckPrimitiveType(snode.first, prim::kPrimMakeTuple)) { -// manager->SetEdge(snode.first, snode.second, get_item); -// break; -// } -// } -// } -// return 
RET_OK; -// } - -// CNodePtr EncoderLayerFusion::CreateOutputGetItem(const FuncGraphPtr &func_graph, const CNodePtr &node, -// const int item_index) const { -// MS_ASSERT(func_graph != nullptr); -// MS_ASSERT(node != nullptr); -// auto tuple_get_item_prim = std::make_shared(); -// auto get_item_value = NewValueNode(MakeValue(item_index)); -// if (tuple_get_item_prim == nullptr || get_item_value == nullptr) { -// MS_LOG(ERROR) << "NewValueNode is nullptr"; -// return nullptr; -// } -// auto tuple_get_item_prim_c = tuple_get_item_prim->GetPrim(); -// MS_ASSERT(tuple_get_item_prim_c != nullptr); -// CNodePtr get_item_cnode = func_graph->NewCNode(tuple_get_item_prim_c, {node, get_item_value}); -// MS_CHECK_TRUE_RET(get_item_cnode != nullptr, nullptr); -// auto abstract = lite::CreateTensorAbstract({}, kNumberTypeFloat32); -// if (abstract == nullptr) { -// MS_LOG(ERROR) << "Create tensor abstract failed"; -// return nullptr; -// } -// get_item_cnode->set_abstract(abstract); -// get_item_cnode->set_fullname_with_scope(node->fullname_with_scope() + "_output_getitem_" + -// std::to_string(item_index)); -// return get_item_cnode; -// } - -// STATUS EncoderLayerFusion::SetAbstractTuple(const CNodePtr &cnode, const int output_num) const { -// MS_ASSERT(cnode != nullptr); -// AbstractBasePtrList abstract_list; -// for (int i = 0; i < output_num; ++i) { -// auto abstract = lite::CreateTensorAbstract({}, kNumberTypeFloat32); -// if (abstract == nullptr) { -// MS_LOG(ERROR) << "Create tensor abstract failed"; -// return RET_ERROR; -// } -// abstract_list.emplace_back(abstract); -// } -// auto abstract_tuple = std::make_shared(abstract_list); -// if (abstract_tuple == nullptr) { -// MS_LOG(ERROR) << "create abstract_tuple failed"; -// return RET_ERROR; -// } -// cnode->set_abstract(abstract_tuple); -// return RET_OK; -// } - STATUS EncoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const EquivPtr &equiv, int *head_num, int *head_size, float *eps1, float *eps2) const { if ((*equiv)[is_attention_] == nullptr || !utils::isa((*equiv)[is_attention_])) { @@ -634,7 +482,7 @@ std::shared_ptr EncoderLayerFusion::CreatePrim(const FuncGrap if (CheckPattern(func_graph, equiv, &head_num, &head_size, &eps1, &eps2)) { return nullptr; } - encoder_layer_prim->Init(head_num, head_size, eps1, eps2, ffn_hidden_size, is_position_bias_, false, post_layernorm); + encoder_layer_prim->Init(head_num, head_size, eps1, eps2, ffn_hidden_size, is_position_bias_, post_layernorm); return encoder_layer_prim; } @@ -644,44 +492,34 @@ CNodePtr EncoderLayerFusion::CreateMaskedEncoderLayerFusionNode(const FuncGraphP MS_ASSERT(func_graph != nullptr); MS_ASSERT(equiv != nullptr); MS_ASSERT(node != nullptr); - // bool is_position_bias = false; auto input = utils::cast((*equiv)[input_]); - AnfNodePtr position_bias, input_mask, bias_attn_o, bias_attn_qkv, beta1, beta2; // bias_m, bias_p, - + AnfNodePtr position_bias, input_mask, bias_attn_o, bias_attn_qkv, beta1, beta2, bias_m, bias_p; auto weight_qkv = utils::cast((*equiv)[weight_attn_qkv_]); auto weight_attn_o = utils::cast((*equiv)[weight_attn_o_]); auto weight_m = utils::cast((*equiv)[weight_m_]); auto weight_p = utils::cast((*equiv)[weight_p_]); - // if (!is_position_bias_) { - bias_attn_qkv = utils::cast((*equiv)[bias_attn_qkv_]); - bias_attn_o = utils::cast((*equiv)[bias_attn_o_]); - auto bias_m = utils::cast((*equiv)[bias_m_]); - auto bias_p = utils::cast((*equiv)[bias_p_]); - // } - // if (beta1_) { - beta1 = utils::cast((*equiv)[beta1_]); - // } - // if (beta2_) { - beta2 
= utils::cast((*equiv)[beta2_]); - // } + if (!is_position_bias_) { + bias_attn_qkv = utils::cast((*equiv)[bias_attn_qkv_]); + bias_attn_o = utils::cast((*equiv)[bias_attn_o_]); + bias_m = utils::cast((*equiv)[bias_m_]); + bias_p = utils::cast((*equiv)[bias_p_]); + beta1 = utils::cast((*equiv)[beta1_]); + beta2 = utils::cast((*equiv)[beta2_]); + } else { + position_bias = utils::cast((*equiv)[position_bias_]); + } auto gamma1 = utils::cast((*equiv)[gamma1_]); auto gamma2 = utils::cast((*equiv)[gamma2_]); if (mask_) { input_mask = utils::cast((*equiv)[mask_]); } - - // auto get_item_node = CreateOutputGetItem(func_graph, new_node, 0); - // if (get_item_node == nullptr) { - // MS_LOG(ERROR) << "create enoder_layer output get_item node failed"; - // return nullptr; - // } auto base_shape_ptr = weight_m->Shape(); MS_EXCEPTION_IF_NULL(base_shape_ptr); auto input_shape_ptr = base_shape_ptr->cast(); MS_EXCEPTION_IF_NULL(input_shape_ptr); auto input_shape = input_shape_ptr->shape(); MS_ASSERT(input_shape != nullptr); - int ffn_hidden_size = (int64_t)input_shape[1]; // TODO + int ffn_hidden_size = (int64_t)input_shape[1]; auto encoder_layer_prim = CreatePrim(func_graph, equiv, post_layernorm, ffn_hidden_size); MS_CHECK_TRUE_RET(encoder_layer_prim != nullptr, nullptr); auto encoder_layer_prim_c = encoder_layer_prim->GetPrim(); @@ -729,28 +567,15 @@ CNodePtr EncoderLayerFusion::CreateMaskedEncoderLayerFusionNode(const FuncGraphP return nullptr; } c_weight_m_param->set_name(node->fullname_with_scope() + "/output.mapping.weight"); - - // std::shared_ptr bias_p_tensor = GetTensorInfo(bias_p); - - // auto c_bias_p = CastTensors(bias_p_tensor); - // c_bias_p_param = func_graph->add_parameter(); - // MS_CHECK_TRUE_RET(c_bias_p_param != nullptr, nullptr); - // if (lite::InitParameterFromTensorInfo(c_bias_p_param, c_bias_p) != lite::RET_OK) { - // MS_LOG(ERROR) << "Init parameter from tensor info failed."; - // return nullptr; - // } - // c_bias_p_param->set_name(node->fullname_with_scope() + "/output.projection.bias"); } - - // TODO cross & mask if (is_position_bias_) { position_bias = utils::cast((*equiv)[position_bias_]); if (!post_layernorm) new_node_inputs = {value_node, input, gamma1, weight_qkv, input_mask, - weight_attn_o, gamma2, c_weight_m_param, weight_p, position_bias}; + weight_attn_o, gamma2, weight_m, weight_p, position_bias}; else new_node_inputs = {value_node, input, weight_qkv, input_mask, weight_attn_o, gamma1, - c_weight_m_param, weight_p, gamma2, position_bias}; + weight_m, weight_p, gamma2, position_bias}; } else { if (!post_layernorm) { @@ -775,8 +600,7 @@ CNodePtr EncoderLayerFusion::CreateMaskedEncoderLayerFusionNode(const FuncGraphP new_node->set_abstract(old_node->abstract()->Clone()); new_node->set_fullname_with_scope(node->fullname_with_scope() + "/encoder_layer"); - RemoveRedundantInput(func_graph, redundant); - + // RemoveRedundantInput(func_graph, redundant); return new_node; } } // namespace mindspore::opt diff --git a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h index d0c782239d2..b488ed9167e 100755 --- a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h +++ b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h @@ -23,9 +23,9 @@ #include "tools/optimizer/common/multiple_pattern_process_pass.h" #include "include/common/utils/utils.h" #include "include/errorcode.h" -#include "ops/encoder_layer.h" // +#include "ops/encoder_layer.h" #include "multi_head_attention_fusion.h" 
-#include "ops/fusion/layer_norm_fusion.h" // +#include "ops/fusion/layer_norm_fusion.h" namespace mindspore { namespace opt { @@ -43,15 +43,16 @@ class EncoderLayerFusion : public MultiplePatternProcessPass { protected: virtual bool Init() const; - // create multi-head-attention without mask + // create encoder op virtual std::shared_ptr BuildEncoderLayerFusionPrim(const EquivPtr &equiv) const; private: VectorRef DefinePatternEncoderLayer(bool post_layernorm,bool layernorm_fusion, bool is_position_bias_) const; VectorRef getTuple(bool post_layernorm, bool layernorm_fusion, bool is_position_bias) const; - VectorRef DefineLayerNorm(VectorRef input, VarPtr gamma, VarPtr beta) const; + VectorRef DefineLayerNorm(bool is_position_bias,VectorRef input, VarPtr gamma, VarPtr beta) const; VectorRef DefinePatternEncoderLayerNorm(bool post_layernorm, bool layernorm_fusion) const; + VectorRef DefinePatternDecoderLayer(bool post_layernorm, bool layernorm_fusion) const; CNodePtr CreateMaskedEncoderLayerFusionNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv,const AnfNodePtr &node, bool post_layernorm ) const; //lite::STATUS GetAttribute(const FuncGraphPtr &func_graph, const EquivPtr &equiv, VarPtr node_name, const char *attr_name, api::SharedPtr *attr)const; lite::STATUS CheckPattern(const FuncGraphPtr &func_graph, const EquivPtr &equiv, int *head_num, @@ -75,6 +76,7 @@ class EncoderLayerFusion : public MultiplePatternProcessPass { mutable VarPtr beta2_{nullptr}; mutable VarPtr gamma2_{nullptr}; mutable VarPtr weight_attn_qkv_{nullptr}; + mutable VarPtr weight_attn_qkv_cross_{nullptr}; mutable VarPtr weight_attn_o_{nullptr}; mutable VarPtr weight_m_{nullptr}; mutable VarPtr weight_p_{nullptr}; diff --git a/trc/transformer/cfg_bert.config b/trc/transformer/cfg_bert.config index 0138e0d6bbb..99e4f5bd9ab 100644 --- a/trc/transformer/cfg_bert.config +++ b/trc/transformer/cfg_bert.config @@ -1,2 +1,2 @@ [gpu_context] -input_shape=input_ids:[1,128];token_type_ids:[1,128];input_mask:[1,128] \ No newline at end of file +input_shape=input_ids:[transformer_encoder_layer,128];token_type_ids:[transformer_encoder_layer,128];input_mask:[transformer_encoder_layer,128] diff --git a/trc/transformer/deploy.sh b/trc/transformer/deploy.sh index 5fe08c79245..3b125c8a6c9 100755 --- a/trc/transformer/deploy.sh +++ b/trc/transformer/deploy.sh @@ -4,16 +4,16 @@ version=$(cat ${base}/version.txt) system=${base}/trc/system_test/release/ubuntu_x86/mindspore-lite-${version}-linux-x64 benchmark=${system}/tools/benchmark/benchmark server=caspi -gpu_id=3 +gpu_id=2 # move files to caspi model=${1%.mindir} model=${model#convv_} model=$(echo ${model}| sed 's/_fwd//') batch_size=$(echo ${model}| sed 's/bert//') echo "model=${model}" - model_name=$(echo ${model}) -if [[ $model == bert* ]];then -model_name=$(echo ${model}| sed 's/[[:digit:]]//') +model_name=$(echo ${model}) +if [[ "$batch_size" != "${model}" ]];then + model_name=$(echo ${model}| sed 's/[[:digit:]]//') fi if [ "${batch_size}" == "" ] then @@ -34,7 +34,7 @@ rsync -v $1 ${server}:$(realpath $1) rsync -v ${benchmark} ${server}:${benchmark} rsync -vl ${system}/runtime/lib/* ${server}:${system}/runtime/lib/ rsync -vl ${system}/tools/converter/lib/* ${server}:${system}/tools/converter/lib/ -#echo -e "[gpu_context]\ninput_shape=input_ids:[${batch_size},128];token_type_ids:[${batch_size},128];input_mask:[${batch_size},128]" > ./cfg_bert.config +echo -e 
"[gpu_context]\ninput_shape=input_ids:[${batch_size},128];token_type_ids:[${batch_size},128];input_mask:[${batch_size},128]" > ./cfg_bert.config rsync -v cfg_${model_name}.config ${server}:$(realpath "cfg_${model_name}.config") # this should be more general ! diff --git a/trc/transformer/ftBench.py b/trc/transformer/ftBench.py index 064195292f4..2a43e409c25 100644 --- a/trc/transformer/ftBench.py +++ b/trc/transformer/ftBench.py @@ -100,8 +100,8 @@ for line_model_arg in models_arg: if ret != 0: exit() input_files='' output_file='' - os.system(f"./convert_fp32.sh {model_name}_fwd.mindir") - find_output_name(f'convv_{model_name}_fwd.ms', f'{model_name}_output.txt') + # os.system(f"./convert_fp32.sh {model_name}_fwd.mindir") + # find_output_name(f'convv_{model_name}_fwd.ms', f'{model_name}_output.txt') if app=='ch': ret=0 if act == 'be': @@ -123,8 +123,8 @@ for line_model_arg in models_arg: os.system(f"rsync -v {system}/../mindspore-lite-{version}-linux-x64.tar.gz {server}:{system}/..") os.system(f"ssh {server} 'cd {system}/.. && tar -xzf {system}/../mindspore-lite-{version}-linux-x64.tar.gz'") os.system(f"rsync -v {base}/trc/transformer/*{model_name}* {server}:{base}/trc/transformer/") - # os.system(f"./deploy.sh convv_{model_name}_fwd.mindir") - os.system(f"ssh {server} 'cd {benchmark} && CUDA_VISIBLE_DEVICES={cuda_visible_dev} LD_LIBRARY_PATH={system}/runtime/lib:{system}/tools/converter/lib ./benchmark {benchmark_args}'" ) + os.system(f"./deploy.sh convv_{model_name}_fwd.mindir") + # os.system(f"ssh {server} 'cd {benchmark} && CUDA_VISIBLE_DEVICES={cuda_visible_dev} LD_LIBRARY_PATH={system}/runtime/lib:{system}/tools/converter/lib ./benchmark {benchmark_args}'" ) elif app=='trc': #if loop count =1 app=be else app = runtime diff --git a/trc/transformer/models.txt b/trc/transformer/models.txt index d48938e396f..36a0873473d 100644 --- a/trc/transformer/models.txt +++ b/trc/transformer/models.txt @@ -5,8 +5,9 @@ #-b 3 -l 66 -s 20 -t 40 -H 3 -S 15 -p 0 -m mha_x1 #-b 1 -l 66 -s 128 -H 4 -S 1024 -p 0 -m mha_x1 #-b 1 -l 6 -s 8 -H 8 -S 1024 - 0 -m T5 --b 1 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -m transformer_encoder_layer +#-b 1 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -m transformer_encoder_layer +-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_decoder_layer #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P False -m transformer_encoder_layer #-b 1 -l 2 -H 2 -S 8 -s 20 -f 1024 -P True -m bert @@ -19,7 +20,7 @@ #-b 32 -l 12 -H 12 -S 768 -s 128 -m bert #num=12 head_size=64 -#-b 16 -l 12 -H 12 -S 768 -s 128 -m bert +#-b 1 -l 12 -H 12 -S 768 -s 128 -m bert #-b 8 -l 12 -H 12 -S 768 -s 128 -m bert #-b 1 -l 12 -H 12 -S 768 -s 64 -m bert #-b 1 -l 24 -H 16 -S 1024 -s 128 -m bert diff --git a/trc/transformer/t.config b/trc/transformer/t.config index 38149334e0f..508acd6ef23 100644 --- a/trc/transformer/t.config +++ b/trc/transformer/t.config @@ -1,3 +1,3 @@ [registry] #fusion_blacklists="MultiHeadAttentionFusion" -#fusion_blacklists="EncoderLayerFusion" \ No newline at end of file +#fusion_blacklists="EncoderLayerFusion" diff --git a/trc/transformer/train_transformer_export.py b/trc/transformer/train_transformer_export.py index 0de67bba74d..634da322fb5 100644 --- a/trc/transformer/train_transformer_export.py +++ b/trc/transformer/train_transformer_export.py @@ -310,6 +310,86 @@ def transformer_encoder_layer_create(): # elif app=="trc": saveT(y, name + "_output1.fp" + suffix) + + + +def 
transformer_decoder_layer_create(): + name = "transformer_decoder_layer" + if (post_layernorm): + model = TransformerDecoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, + tgt_seq_length=tgt_seq_len,num_heads=head_num, post_layernorm_residual=True) + else: + model = TransformerDecoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, + tgt_seq_length=tgt_seq_len,num_heads=head_num) + hidden_stats = M.Tensor(np.random.normal(0., 0.5, (batch, tgt_seq_len, hid_size)), M.float32) + decoder_mask = M.Tensor(np.random.normal(0., 0.5, (batch, seq, seq)), M.float32) + # q = model.attention.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size + # k = model.attention.dense2.weight.asnumpy()#.transpose() + # v = model.attention.dense3.weight.asnumpy()#.transpose() + + # w = np.concatenate((q, k, v)) # 3xhid_size x hid_size + # w = w.transpose() # hid_size x 3xhid_size + # wt = M.Tensor(w, w_compute_type) + # bq = model.attention.dense1.bias.asnumpy() + # bk = model.attention.dense2.bias.asnumpy() + # bv = model.attention.dense3.bias.asnumpy() + # bw = np.concatenate((bq, bk, bv)) #(3xhid) X 1 + # bt =M.Tensor(bw, w_compute_type) + # print('bt=',bt) + # wp = model.attention.projection.weight + # bp = model.attention.projection.bias + + # omw = model.output.mapping.weight + # opw = model.output.projection.weight + # omb = model.output.mapping.bias + # opb = model.output.projection.bias + + # gl1 = model.layernorm1.gamma + # bl1 = model.layernorm1.beta + # gl2 = model.layernorm2.gamma + # bl2 = model.layernorm2.beta + + suffix = str(compute_type) + suffix = suffix[-2:] + saveT(hidden_stats, name + "_input1.fp" + suffix) + saveT(decoder_mask, name + "_input2.fp" + suffix) + + # saveT(gl1, name + "_weight1.fp" + suffix) + # saveT(bl1, name + "_weight2.fp" + suffix) + # saveT(wt, name + "_weight3.fp" + suffix) + # saveT(bt, name + "_weight4.fp" + suffix) + # saveT(wp, name + "_weight5.fp" + suffix) + # saveT(bp, name + "_weight6.fp" + suffix) + # saveT(gl2, name + "_weight7.fp" + suffix) + # saveT(bl2, name + "_weight8.fp" + suffix) + # # if app == 'trc': + # # saveTensorToHalf(omw, name + "_weight9.fp" + "16") + # # saveTensorToHalf(omb, name + "_weight10.fp" + "16") + # # saveTensorToHalf(opw, name + "_weight11.fp" + "16") + # # elif app == 'ch': + # saveT(omw, name + "_weight9.fp" + suffix) + # saveT(omb, name + "_weight10.fp" + suffix) + # saveT(opw, name + "_weight11.fp" + suffix) + # saveT(opb, name + "_weight12.fp" + suffix) + _cell_graph_executor.compile(model, + hidden_stats, + decoder_mask) + y = model(hidden_stats, decoder_mask) + export(model, hidden_stats, decoder_mask, file_name= name + "_fwd", file_format='MINDIR') + # if app=="ch": + f_y=open(f'./{name}_output.txt','w') + # # out_name=get_output_encoder_layer(name + "_fwd.mindir") + # # print("name output:",out_name) + saveCalib("output1", np.array(y[0]), f_y)#2 dims + # # print("y.shpae",np.array(y).shape) + # # saveCalib('Default/Add-op267', y, f_y)#2 dims + f_y.close() + # # saveCalib('Default/Reshape-op296', np.array(y), f_y)#2 dims + # # elif app=="trc": + # saveT(y, name + "_output1.fp" + suffix) + + + def build_transformer_encoder_layer_post_ture(): model = TransformerEncoderLayer(batch_size=2, seq_length=16, -- Gitee From 91df5b53a520453657c4e3c37f960e4da6f4b3c0 Mon Sep 17 00:00:00 2001 From: batya kroizer Date: Mon, 26 Dec 2022 11:50:54 +0200 Subject: [PATCH 02/39] add decoder op --- trc/transformer/cfg_bert.config | 2 +- 
trc/transformer/deploy.sh | 2 +- trc/transformer/models.txt | 2 +- trc/transformer/train_transformer_export.py | 21 ++++++++++++--------- 4 files changed, 15 insertions(+), 12 deletions(-) diff --git a/trc/transformer/cfg_bert.config b/trc/transformer/cfg_bert.config index 99e4f5bd9ab..b91785a80ab 100755 --- a/trc/transformer/cfg_bert.config +++ b/trc/transformer/cfg_bert.config @@ -1,2 +1,2 @@ [gpu_context] -input_shape=input_ids:[transformer_encoder_layer,128];token_type_ids:[transformer_encoder_layer,128];input_mask:[transformer_encoder_layer,128] +input_shape=input_ids:[16,128];token_type_ids:[16,128];input_mask:[16,128] diff --git a/trc/transformer/deploy.sh b/trc/transformer/deploy.sh index 3b125c8a6c9..f4a21f02f57 100755 --- a/trc/transformer/deploy.sh +++ b/trc/transformer/deploy.sh @@ -13,7 +13,7 @@ batch_size=$(echo ${model}| sed 's/bert//') echo "model=${model}" model_name=$(echo ${model}) if [[ "$batch_size" != "${model}" ]];then - model_name=$(echo ${model}| sed 's/[[:digit:]]//') + model_name='bert' fi if [ "${batch_size}" == "" ] then diff --git a/trc/transformer/models.txt b/trc/transformer/models.txt index 19646ca3882..861ae588665 100755 --- a/trc/transformer/models.txt +++ b/trc/transformer/models.txt @@ -7,7 +7,7 @@ #-b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer #-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer #-b 8 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer --b 32 -l 12 -H 12 -S 768 -s 128 -P 0 -f 3072 -m bert +#-b 32 -l 12 -H 12 -S 768 -s 128 -P 0 -f 3072 -m bert #-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -f 3072 -m bert #-b 1 -l 66 -s 20 -H 3 -S 15 -p 0 -m mha_x1 diff --git a/trc/transformer/train_transformer_export.py b/trc/transformer/train_transformer_export.py index e30e6b1def3..60087f2d732 100755 --- a/trc/transformer/train_transformer_export.py +++ b/trc/transformer/train_transformer_export.py @@ -11,7 +11,7 @@ model_zoo_path=os.environ['CLOUD_MODEL_ZOO'] sys.path.append(model_zoo_path) sys.path.append("../../../transformer/transformer/models") sys.path.append("./T5") -from MultiHeadTester import MultiHeadAttentionX,TransformerEncoderLayerX,FeedForwardX +from MultiHeadTester import MultiHeadAttentionX, TransformerDecoderLayerX,TransformerEncoderLayerX,FeedForwardX from mindspore.common.parameter import Parameter from mindspore.common.initializer import Tensor import mindspore as M @@ -308,13 +308,16 @@ def transformer_encoder_layer_create(): def transformer_decoder_layer_create(): name = "transformer_decoder_layer" if (post_layernorm): - model = TransformerDecoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, + model = TransformerDecoderLayerX(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, tgt_seq_length=tgt_seq_len,num_heads=head_num, post_layernorm_residual=True) else: - model = TransformerDecoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, + model = TransformerDecoderLayerX(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, tgt_seq_length=tgt_seq_len,num_heads=head_num) hidden_stats = M.Tensor(np.random.normal(0., 0.5, (batch, tgt_seq_len, hid_size)), M.float32) decoder_mask = M.Tensor(np.random.normal(0., 0.5, (batch, seq, seq)), M.float32) + encoder_output = M.Tensor(np.random.normal(0., 0.5, (batch, seq, hid_size)), M.float32) + memory_mask = M.Tensor(np.random.normal(0., 0.5, (batch, tgt_seq_len,seq)), M.float32) + # q 
= model.attention.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size # k = model.attention.dense2.weight.asnumpy()#.transpose() # v = model.attention.dense3.weight.asnumpy()#.transpose() @@ -345,7 +348,9 @@ def transformer_decoder_layer_create(): suffix = suffix[-2:] saveT(hidden_stats, name + "_input1.fp" + suffix) saveT(decoder_mask, name + "_input2.fp" + suffix) - + saveT(encoder_output, name + "_input3.fp" + suffix) + saveT(memory_mask, name + "_input4.fp" + suffix) + # saveT(gl1, name + "_weight1.fp" + suffix) # saveT(bl1, name + "_weight2.fp" + suffix) # saveT(wt, name + "_weight3.fp" + suffix) @@ -363,11 +368,9 @@ def transformer_decoder_layer_create(): # saveT(omb, name + "_weight10.fp" + suffix) # saveT(opw, name + "_weight11.fp" + suffix) # saveT(opb, name + "_weight12.fp" + suffix) - _cell_graph_executor.compile(model, - hidden_stats, - decoder_mask) - y = model(hidden_stats, decoder_mask) - export(model, hidden_stats, decoder_mask, file_name= name + "_fwd", file_format='MINDIR') + _cell_graph_executor.compile(model, hidden_stats, decoder_mask, encoder_output, memory_mask) + y = model(hidden_stats, decoder_mask, encoder_output, memory_mask) + export(model, hidden_stats, decoder_mask, encoder_output, memory_mask, file_name= name + "_fwd", file_format='MINDIR') # if app=="ch": f_y=open(f'./{name}_output.txt','w') # # out_name=get_output_encoder_layer(name + "_fwd.mindir") -- Gitee From ba5f2dc6c5b3fd46f63ab92acdfb3fc926c0a1e1 Mon Sep 17 00:00:00 2001 From: batya kroizer Date: Mon, 26 Dec 2022 11:54:48 +0200 Subject: [PATCH 03/39] add decoder op --- .../kernel/nnacl/infer/decoder_layer_infer.h | 32 ++ mindspore/core/ops/decoder_layer.cc | 95 ++++ mindspore/core/ops/decoder_layer.h | 67 +++ .../optimizer/fusion/decoder_layer_fusion.cc | 499 ++++++++++++++++++ .../optimizer/fusion/decoder_layer_fusion.h | 83 +++ trc/transformer/cfg_bert.config | 2 +- 6 files changed, 777 insertions(+), 1 deletion(-) create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.h create mode 100644 mindspore/core/ops/decoder_layer.cc create mode 100644 mindspore/core/ops/decoder_layer.h create mode 100644 mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc create mode 100644 mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.h new file mode 100644 index 00000000000..fb93172b030 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.h @@ -0,0 +1,32 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_NNACL_DECODERLAYER_INFER_H +#define MINDSPORE_NNACL_DECODERLAYER_INFER_H + +#include "nnacl/infer/common_infer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +int DecoderLayerInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size, + OpParameter *parameter); + +#ifdef __cplusplus +} +#endif +#endif // MINDSPORE_NNACL_DECODERLAYER_INFER_H + diff --git a/mindspore/core/ops/decoder_layer.cc b/mindspore/core/ops/decoder_layer.cc new file mode 100644 index 00000000000..7a98395b84f --- /dev/null +++ b/mindspore/core/ops/decoder_layer.cc @@ -0,0 +1,95 @@ + +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "ops/decoder_layer.h" +#include "ops/primitive_c.h" +#include "ops/op_utils.h" +#include "mindapi/src/helper.h" + +namespace mindspore::ops { +MIND_API_OPERATOR_IMPL(DecoderLayer, BaseOperator); + +void DecoderLayer::set_head_num(int64_t head_num) { (void)this->AddAttr(kDecoderLayerNumHeads, api::MakeValue(head_num)); } + +void DecoderLayer::set_head_size(int64_t head_size) { + (void)this->AddAttr(kDecoderLayerSizePerHead, api::MakeValue(head_size)); +} + +void DecoderLayer::set_post_layernorm(bool post_layernorm) { + (void)this->AddAttr(kDecoderLayerPostLayernorm, api::MakeValue(post_layernorm)); +} + void DecoderLayer::set_eps_layernorm1(float eps_layernorm1) { + (void)this->AddAttr(kDecoderLayerEpsLayerNorm1, api::MakeValue(eps_layernorm1)); +} + void DecoderLayer::set_eps_layernorm2(float eps_layernorm2) { + (void)this->AddAttr(kDecoderLayerEpsLayerNorm2, api::MakeValue(eps_layernorm2)); + +} + void DecoderLayer::set_ffn_hidden_size(int64_t ffn_hidden_size){ + (void)this->AddAttr(kDecoderLayerFfnHiddenSize, api::MakeValue(ffn_hidden_size)); +} +void DecoderLayer::set_position_bias1(bool position_bias1) { (void)this->AddAttr(kDecoderLayerPositionBias1, api::MakeValue(position_bias1)); } +void DecoderLayer::set_position_bias2(bool position_bias2) { (void)this->AddAttr(kDecoderLayerPositionBias2, api::MakeValue(position_bias2)); } +int64_t DecoderLayer::get_head_num() const { + auto value_ptr = this->GetAttr(kDecoderLayerNumHeads); + return GetValue(value_ptr); +} + +int64_t DecoderLayer::get_head_size() const { + auto value_ptr = this->GetAttr(kDecoderLayerSizePerHead); + return GetValue(value_ptr); +} + + +bool DecoderLayer::get_post_layernorm() const { + auto value_ptr = this->GetAttr(kDecoderLayerPostLayernorm); + return GetValue(value_ptr); +} +float DecoderLayer::get_eps_layernorm1() const { + auto value_ptr = this->GetAttr(kDecoderLayerEpsLayerNorm1); + return GetValue(value_ptr); +} +float DecoderLayer::get_eps_layernorm2() const{ + auto value_ptr = this->GetAttr(kDecoderLayerEpsLayerNorm2); + return GetValue(value_ptr); +} +int64_t DecoderLayer::get_ffn_hidden_size() const { + auto value_ptr = this->GetAttr(kDecoderLayerFfnHiddenSize); + return GetValue(value_ptr); +} +bool DecoderLayer::get_position_bias1() const { + auto value_ptr = 
this->GetAttr(kDecoderLayerPositionBias1); + return GetValue(value_ptr); +} +bool DecoderLayer::get_position_bias2() const { + auto value_ptr = this->GetAttr(kDecoderLayerPositionBias2); + return GetValue(value_ptr); +} + +void DecoderLayer::Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, int64_t ffn_hidden_size, + bool position_bias1, bool position_bias2, bool post_layernorm = false) { + this->set_head_num(head_num); + this->set_head_size(head_size); + this->set_post_layernorm(post_layernorm); + this->set_eps_layernorm1(eps_layernorm1); + this->set_eps_layernorm2(eps_layernorm2); + this->set_ffn_hidden_size(ffn_hidden_size); + this->set_position_bias1(position_bias1); + this->set_position_bias2(position_bias2); +} +REGISTER_PRIMITIVE_C(kNameDecoderLayer, DecoderLayer); +} // namespace mindspore::ops diff --git a/mindspore/core/ops/decoder_layer.h b/mindspore/core/ops/decoder_layer.h new file mode 100644 index 00000000000..df843b31c7e --- /dev/null +++ b/mindspore/core/ops/decoder_layer.h @@ -0,0 +1,67 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef LITE_MINDSPORE_LITE_TOOLS_CONVERTER_OPS_DECODERLAYER_H_ +#define LITE_MINDSPORE_LITE_TOOLS_CONVERTER_OPS_DECODERLAYER_H_ +#include +#include +#include +#include + +#include "ops/base_operator.h" +#include "mindapi/base/types.h" + +namespace mindspore { +namespace ops { +constexpr auto kNameDecoderLayer = "DecoderLayer"; +/// \brief MultiHead-Attention op in MindIR. +class MIND_API DecoderLayer : public BaseOperator { + public: + MIND_API_BASE_MEMBER(DecoderLayer); + /// \brief Constructor. + DecoderLayer() : BaseOperator(kNameDecoderLayer) { + InitIOName({"input", "gamma1", "beta1", "weight_attn_qkv", "bias_attn_qkv", "mask", "weight_attn_o", "bias_attn_o", + "gamma2", "beta2", "weight_m", "bias_m", "weight_p", "bias_p"}, + {"output"}); + } + /// \brief Initialize DecoderLayer op. + /// \param[in] head_num Define head number. + /// \param[in] head_size Define size per head. + /// \param[in] eps_layernorm1 Define eps layernorm1. + /// \param[in] eps_layernorm2 Define eps layernorm2. + /// \param[in] ffn_hidden_size Define ffn hidden size. + /// \param[in] position_bias Define ffn position_bias. 
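+  /// \param[in] post_layernorm Define whether the post-layernorm residual structure is used (defaults to false).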
+ void Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, int64_t ffn_hidden_size, + bool position_bias1, bool position_bias2, bool post_layernorm); + void set_head_num(int64_t head_num); + void set_head_size(int64_t head_size); + void set_post_layernorm(bool post_layernorm); + void set_eps_layernorm1(float eps_layernorm1); + void set_eps_layernorm2(float eps_layernorm2); + void set_ffn_hidden_size(int64_t ffn_hidden_size); + void set_position_bias1(bool position_bias1); + void set_position_bias2(bool position_bias2); + int64_t get_head_num() const; + int64_t get_head_size() const; + bool get_post_layernorm() const; + float get_eps_layernorm1() const; + float get_eps_layernorm2() const; + int64_t get_ffn_hidden_size() const; + bool get_position_bias1() const; + bool get_position_bias2() const; + }; +} // namespace ops +} // namespace mindspore +#endif // LITE_MINDSPORE_LITE_TOOLS_CONVERTER_OPS_ATTENTION_H_ diff --git a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc new file mode 100644 index 00000000000..870cd3b0d9a --- /dev/null +++ b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc @@ -0,0 +1,499 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#define USE_DEPRECATED_API +#include "tools/optimizer/fusion/decoder_layer_fusion.h" +#include +#include +#include +#include +#include "tools/optimizer/common/gllo_utils.h" +#include "nnacl/op_base.h" +#include "ops/tuple_get_item.h" +#include "tools/common/tensor_util.h" +#include "ops/op_utils.h" + +namespace mindspore::opt { +namespace { +const auto &p1 = std::placeholders::_1; +const size_t kWeightShapeSize = 2; +const int kDecoderLayerOutputs = 1; +} // namespace +bool DecoderLayerFusion::Init() const { + input_ = std::make_shared("input"); + MS_CHECK_TRUE_RET(input_ != nullptr, false); + beta1_ = std::make_shared("beta1"); + MS_CHECK_TRUE_RET(beta1_ != nullptr, false); + gamma1_ = std::make_shared("gamma1"); + MS_CHECK_TRUE_RET(gamma1_ != nullptr, false); + beta2_ = std::make_shared("beta2"); + MS_CHECK_TRUE_RET(beta2_ != nullptr, false); + gamma2_ = std::make_shared("gamma2"); + MS_CHECK_TRUE_RET(gamma2_ != nullptr, false); + weight_attn_qkv_ = std::make_shared("weight_attn_qkv"); + MS_CHECK_TRUE_RET(weight_attn_qkv_ != nullptr, false); + weight_attn_o_ = std::make_shared(IsParamNode, "weight_attn_o"); + MS_CHECK_TRUE_RET(weight_attn_o_ != nullptr, false); + weight_m_ = std::make_shared(IsParamNode, "weight_m"); + MS_CHECK_TRUE_RET(weight_m_ != nullptr, false); + weight_p_ = std::make_shared(IsParamNode, "weight_p"); + MS_CHECK_TRUE_RET(weight_p_ != nullptr, false); + bias_attn_qkv_ = std::make_shared("bias_attn_qkv"); + MS_CHECK_TRUE_RET(bias_attn_qkv_ != nullptr, false); + bias_attn_o_ = std::make_shared(IsParamNode, "bias_attn_o"); + MS_CHECK_TRUE_RET(bias_attn_o_ != nullptr, false); + bias_m_ = std::make_shared(IsParamNode, "bias_m"); + MS_CHECK_TRUE_RET(bias_m_ != nullptr, false); + bias_p_ = std::make_shared(IsParamNode, "bias_p"); + MS_CHECK_TRUE_RET(bias_p_ != nullptr, false); + mask_ = std::make_shared("mask"); + MS_CHECK_TRUE_RET(mask_ != nullptr, false); + is_attention_ = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAttention), "is_attention"); + MS_CHECK_TRUE_RET(is_attention_ != nullptr, false); + is_layernorm1_ = std::make_shared(std::bind(IsOpType, p1, prim::kPrimLayerNormFusion), "layer_norm1"); + MS_CHECK_TRUE_RET(is_layernorm1_ != nullptr, false); + is_layernorm2_ = std::make_shared(std::bind(IsOpType, p1, prim::kPrimLayerNormFusion), "layer_norm2"); + MS_CHECK_TRUE_RET(is_layernorm2_ != nullptr, false); + position_bias_ = std::make_shared("position_bias"); + MS_CHECK_TRUE_RET(is_layernorm2_ != nullptr, false); + return true; +} + +// STATUS GetIntParameterData(const ParameterPtr ¶m_ptr, std::vector *result) { +// if (param_ptr == nullptr || !param_ptr->has_default()) { +// MS_LOG(DEBUG) << "param not have default"; +// return RET_ERROR; +// } +// auto default_param = param_ptr->default_param(); +// if (default_param == nullptr || !utils::isa(default_param)) { +// MS_LOG(DEBUG) << "tensor_info is not tensor::TensorPtr"; +// return RET_ERROR; +// } +// auto default_param_ptr = utils::cast(default_param); +// if (default_param_ptr->data_type() != kNumberTypeInt32 && default_param_ptr->data_type() != kNumberTypeInt) { +// MS_LOG(DEBUG) << "default param is not int"; +// return RET_ERROR; +// } +// auto ptr = reinterpret_cast(default_param_ptr->data_c()); +// int64_t shape_size = +// std::accumulate(default_param_ptr->shape().begin(), default_param_ptr->shape().end(), 1, std::multiplies<>()); +// for (int64_t i = 0; i < shape_size; i++) { +// result->emplace_back(ptr[i]); +// } +// return RET_OK; +// } + +VectorRef DecoderLayerFusion::getTuple(bool 
post_layernorm, bool layernorm_fusion = false) const { + auto is_reshape1 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-decoder"); + MS_CHECK_TRUE_RET(is_reshape1 != nullptr, {}); + auto var1 = std::make_shared("var1-reshape"); + MS_CHECK_TRUE_RET(var1 != nullptr, {}); + auto reshape1 = VectorRef({is_reshape1, input_, var1}); + if (post_layernorm) { + return reshape1; + } + VectorRef layer_norm, tuple; + if (layernorm_fusion) { + return DefineLayerNorm(reshape1, gamma1_, beta1_); + } + layer_norm = VectorRef({is_layernorm1_, reshape1, gamma1_, beta1_}); + auto is_tuple = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_itme"); + auto var_tuple = std::make_shared("var_tuple"); + tuple = VectorRef({is_tuple, layer_norm, var_tuple}); + return tuple; +} + +VectorRef DecoderLayerFusion::DefineLayerNorm(VectorRef input, VarPtr gamma, VarPtr beta) const { + auto var1 = std::make_shared("var1"); + MS_CHECK_TRUE_RET(var1 != nullptr, {}); + auto is_reduce = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReduceFusion), "reduce"); + MS_CHECK_TRUE_RET(is_reduce != nullptr, {}); + auto reduce1 = VectorRef({is_reduce, input, var1}); + auto is_sub = std::make_shared(std::bind(IsOpType, p1, prim::kPrimSubFusion), "sub-f"); + MS_CHECK_TRUE_RET(is_sub != nullptr, {}); + auto sub = VectorRef({is_sub, input, reduce1}); + auto is_sqr = std::make_shared(std::bind(IsOpType, p1, prim::kPrimSquare), "sqr"); + MS_CHECK_TRUE_RET(is_sqr != nullptr, {}); + auto sqr = VectorRef({is_sqr, input}); + auto var2 = std::make_shared("var2"); + MS_CHECK_TRUE_RET(var2 != nullptr, {}); + auto is_reduce2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReduceFusion), "reduce2"); + MS_CHECK_TRUE_RET(is_reduce2 != nullptr, {}); + auto reduce2 = VectorRef({is_reduce2, sqr, var2}); + auto var3 = std::make_shared("var3"); + MS_CHECK_TRUE_RET(var3 != nullptr, {}); + auto is_add = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is-add"); + MS_CHECK_TRUE_RET(is_add != nullptr, {}); + auto add = VectorRef({is_add, reduce2, var3}); + auto is_sqr2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimSqrt), "sqr2"); + MS_CHECK_TRUE_RET(is_sqr2 != nullptr, {}); + auto sqr2 = VectorRef({is_sqr2, add}); + auto is_div = std::make_shared(std::bind(IsOpType, p1, prim::kPrimRealDiv), "real-div"); + MS_CHECK_TRUE_RET(is_div != nullptr, {}); + auto real_div = VectorRef({is_div, input, sqr2}); + auto is_mul = std::make_shared(std::bind(IsOpType, p1, prim::kPrimMulFusion), "mul"); + MS_CHECK_TRUE_RET(is_mul != nullptr, {}); + auto mul = VectorRef({is_mul, real_div, gamma}); + return mul; +} + + +VectorRef DecoderLayerFusion::DefinePatternDecoderLayer(bool post_layernorm = true, bool layernorm_fusion = false) const { + std::cout << "DefinePatternDecoderLayer post=" << post_layernorm << " layernorm_fusion=" << layernorm_fusion + << std::endl; + bool is_position_bias =true; + std::cout << "attention no position bias" << std::endl; + auto attention = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion), + getTuple(post_layernorm, layernorm_fusion), + getTuple(post_layernorm, layernorm_fusion), weight_attn_qkv_, + weight_attn_qkv_cross_, weight_attn_o_,position_bias_ , mask_}); + return getTuple(post_layernorm, layernorm_fusion); + auto is_add = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is_add"); + VectorRef reshape2, matmul1, add; + add = VectorRef({is_add, getTuple(true), attention}); + // } else if (layernorm_fusion) { + // add = 
VectorRef({is_add, getTuple(post_layernorm, layernorm_fusion), tuple3}); + VectorRef layer_norm2, tuple2, matmul2; + if (layernorm_fusion) { + layer_norm2 = DefineLayerNorm(add, gamma2_, beta2_); + tuple2 = layer_norm2; + } else { + layer_norm2 = VectorRef({is_layernorm2_, add, gamma2_, beta2_}); + auto is_tuple2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_item2"); + auto var_tuple2 = std::make_shared("var_tuple2"); + tuple2 = VectorRef({is_tuple2, layer_norm2, var_tuple2}); + } + auto is_reshape2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-decoder2"); + MS_CHECK_TRUE_RET(is_reshape2 != nullptr, {}); + auto var2 = std::make_shared("var2"); + MS_CHECK_TRUE_RET(var2 != nullptr, {}); + auto is_matmul1 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimMatMulFusion), "is_matmul1"); + MS_CHECK_TRUE_RET(is_matmul1 != nullptr, {}); + if (is_position_bias) { + reshape2 = VectorRef({is_reshape2, add, var2}); + matmul1 = VectorRef({is_matmul1, tuple2, weight_m_}); + } else if (post_layernorm || layernorm_fusion) { + reshape2 =VectorRef({is_reshape2, tuple2, var2}); + matmul1 = VectorRef({is_matmul1, tuple2, weight_m_, bias_m_}); + } else { + reshape2 = VectorRef({is_reshape2, add, var2}); + matmul1 = VectorRef({is_matmul1, tuple2, weight_m_, bias_m_}); + } + auto is_act = std::make_shared(std::bind(IsOpType, p1, prim::kPrimActivation), "acrivation"); + MS_CHECK_TRUE_RET(is_act != nullptr, {}); + auto act = VectorRef({is_act, matmul1}); + auto is_matmul2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimMatMulFusion), "is_matmul2"); + MS_CHECK_TRUE_RET(is_matmul2 != nullptr, {}); + matmul2 = VectorRef({is_matmul2, matmul1, weight_p_}); //, bias_m_}); + + auto is_reshape3 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-decoder3"); + MS_CHECK_TRUE_RET(is_reshape3 != nullptr, {}); + auto var3 = std::make_shared("var3"); + MS_CHECK_TRUE_RET(var3 != nullptr, {}); + auto reshape3 = VectorRef({is_reshape3, matmul2, var3}); + auto is_add3 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is_add3"); + auto add3 = VectorRef({is_add3, reshape2, reshape3}); + if (!post_layernorm || layernorm_fusion) { + return add3; + } + auto is_reshape4 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-decoder"); + MS_CHECK_TRUE_RET(is_reshape4 != nullptr, {}); + auto var4 = std::make_shared("var4"); + MS_CHECK_TRUE_RET(var4 != nullptr, {}); + auto reshape4 = VectorRef({is_reshape4, add3, var4}); + VectorRef layer_norm, tuple; + if (layernorm_fusion) { + layer_norm = DefineLayerNorm(reshape4, gamma1_, beta1_); + tuple = layer_norm; + } else { + layer_norm = VectorRef({is_layernorm1_, reshape4, gamma1_, beta1_}); + + auto is_tuple = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_itme"); + auto var_tuple = std::make_shared("var_tuple"); + tuple = VectorRef({is_tuple, layer_norm, var_tuple}); + } + auto is_reshape5 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-decoder"); + MS_CHECK_TRUE_RET(is_reshape5 != nullptr, {}); + auto var5 = std::make_shared("var5"); + MS_CHECK_TRUE_RET(var5 != nullptr, {}); + auto reshape5 = VectorRef({is_reshape5, tuple, var5}); + return reshape5; +} + +std::unordered_map DecoderLayerFusion::DefinePatterns() const { + std::unordered_map patterns; + if (!Init()) { + MS_LOG(ERROR) << "initial member failed."; + return patterns; + } + patterns[kPatternDecoderLayer] = DefinePatternDecoderLayer(false,true); + return 
patterns; +} + +AnfNodePtr DecoderLayerFusion::Process(const std::string &pattern_name, const mindspore::FuncGraphPtr &func_graph, + const mindspore::AnfNodePtr &node, const mindspore::EquivPtr &equiv) const { + if (func_graph == nullptr || node == nullptr || equiv == nullptr) { + return nullptr; + } + if (pattern_name == kPatternDecoderLayer) { + return CreateMaskedDecoderLayerFusionNode(func_graph, equiv, node, true); + } + return nullptr; +} + +STATUS DecoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const EquivPtr &equiv, int *head_num, + int *head_size, float *eps1, float *eps2) const { +// if ((*equiv)[is_attention_] == nullptr || !utils::isa((*equiv)[is_attention_])) { +// printf("is_attention_ is not AnfNodePtr"); +// return RET_ERROR; +// } +// AnfNodePtr node = utils::cast((*equiv)[is_attention_]); +// MS_ASSERT(node != nullptr); +// if (node == nullptr || !utils::isa(node)) { +// auto manager = func_graph->manager(); +// if (manager == nullptr) { +// return RET_ERROR; +// } +// auto users = manager->node_users(); +// auto it = users.find(node); +// if (it != users.end()) { +// node = it->second.front().first; +// } +// if (node == nullptr || !utils::isa(node)) { +// return RET_ERROR; +// } +// } +// auto attn_node = utils::cast(node); +// MS_ASSERT(attn_node != nullptr); +// auto attn_input = attn_node->input(0); +// MS_ASSERT(attn_input != nullptr); +// auto attn_prim = ops::GetOperator(attn_input); +// auto attn_prim_c = attn_prim->GetPrim(); +// if (attn_prim->GetAttr(ops::kEncoderLayerNumHeads) != nullptr) { +// *head_num = attn_prim->get_head_num(); +// } +// if (attn_prim->GetAttr(ops::kAttentionSizePerHead) != nullptr) { +// *head_size = attn_prim->get_head_size(); +// } +// if (attn_prim->GetAttr(ops::kPositionBias) != nullptr) { +// is_position_bias_ = attn_prim->get_position_bias(); +// } +// if ((*equiv)[is_layernorm1_] != nullptr) { +// if ((*equiv)[is_layernorm1_] == nullptr || !utils::isa((*equiv)[is_layernorm1_])) { +// printf("is_layernorm1_ is not AnfNodePtr"); +// return RET_ERROR; +// } +// AnfNodePtr node_layrn1 = utils::cast((*equiv)[is_layernorm1_]); +// MS_ASSERT(node_layrn1 != nullptr); +// if (node_layrn1 == nullptr || !utils::isa(node_layrn1)) { +// auto manager = func_graph->manager(); +// if (manager == nullptr) { +// return RET_ERROR; +// } +// auto users = manager->node_users(); +// auto it = users.find(node_layrn1); +// if (it != users.end()) { +// node_layrn1 = it->second.front().first; +// } +// if (node_layrn1 == nullptr || !utils::isa(node_layrn1)) { +// return RET_ERROR; +// } +// } +// auto layrn1_node = utils::cast(node_layrn1); +// MS_ASSERT(layrn1_node != nullptr); +// auto layrn1_input = layrn1_node->input(0); +// MS_ASSERT(layrn1_input != nullptr); +// auto layrn1_prim = ops::GetOperator(layrn1_input); +// auto layrn1_prim_c = layrn1_prim->GetPrim(); +// if (layrn1_prim->GetAttr(ops::kEpsilon) != nullptr) { +// *eps1 = layrn1_prim->get_epsilon(); +// } +// } +// if ((*equiv)[is_layernorm2_] != nullptr) { +// if ((*equiv)[is_layernorm2_] == nullptr || !utils::isa((*equiv)[is_layernorm2_])) { +// printf("is_layernorm2_ is not AnfNodePtr"); +// return RET_ERROR; +// } +// AnfNodePtr node_layrn2 = utils::cast((*equiv)[is_layernorm2_]); +// MS_ASSERT(node_layrn2 != nullptr); +// if (node_layrn2 == nullptr || !utils::isa(node_layrn2)) { +// auto manager = func_graph->manager(); +// if (manager == nullptr) { +// return RET_ERROR; +// } +// auto users = manager->node_users(); +// auto it = users.find(node_layrn2); +// if (it != 
users.end()) { +// node_layrn2 = it->second.front().first; +// } +// if (node_layrn2 == nullptr || !utils::isa(node_layrn2)) { +// return RET_ERROR; +// } +// } +// auto layrn2_node = utils::cast(node_layrn2); +// MS_ASSERT(layrn2_node != nullptr); +// auto layrn2_input = layrn2_node->input(0); +// MS_ASSERT(layrn2_input != nullptr); +// auto layrn2_prim = ops::GetOperator(layrn2_input); +// auto layrn2_prim_c = layrn2_prim->GetPrim(); +// if (layrn2_prim->GetAttr(ops::kEpsilon) != nullptr) { +// *eps2 = layrn2_prim->get_epsilon(); +// } +// } + return RET_OK; +} + +std::shared_ptr DecoderLayerFusion::BuildDecoderLayerFusionPrim(const EquivPtr &equiv) const { + MS_ASSERT(equiv != nullptr); + auto decoder_layer_prim = std::make_shared(); + if (decoder_layer_prim == nullptr) { + MS_LOG(ERROR) << "Build decoder_layer primitive failed."; + return decoder_layer_prim; + } + if (!utils::isa((*equiv)[reshape_k_])) { + MS_LOG(ERROR) << "Reshape k is not a parameter"; + return nullptr; + } + + if (!utils::isa((*equiv)[reshape_v_])) { + MS_LOG(ERROR) << "Reshape v is not a parameter"; + return nullptr; + } + + auto reshape_k = utils::cast((*equiv)[reshape_k_]); + std::vector shape_k; +// if (RET_OK != GetIntParameterData(reshape_k, &shape_k)) { +// MS_LOG(ERROR) << "Get reshape k data failed"; +// return nullptr; +// } + + auto reshape_v = utils::cast((*equiv)[reshape_v_]); + std::vector shape_v; +// if (RET_OK != GetIntParameterData(reshape_v, &shape_v)) { +// MS_LOG(ERROR) << "Get reshape k data failed"; +// return nullptr; +// } + if (shape_k.size() < kWeightShapeSize || shape_v.size() < kWeightShapeSize || + shape_k.at(shape_k.size() - kWeightShapeSize) != shape_v.at(shape_v.size() - kWeightShapeSize)) { + MS_LOG(ERROR) << "Shape k or shape v is invalid."; + return nullptr; + } + return decoder_layer_prim; +} + +std::shared_ptr DecoderLayerFusion::CreatePrim(const FuncGraphPtr &func_graph, const EquivPtr &equiv, + bool post_layernorm, int64_t ffn_hidden_size) const { + auto encoder_layer_prim = std::make_shared(); + if (encoder_layer_prim == nullptr) { + MS_LOG(ERROR) << "Build decoder layer primitive failed."; + return nullptr; + } + int head_num = 0; + int head_size = 0; + float eps1 = 1e-6; + float eps2 = 1e-6; + if (CheckPattern(func_graph, equiv, &head_num, &head_size, &eps1, &eps2)) { + return nullptr; + } +// encoder_layer_prim->Init(head_num, head_size, eps1, eps2, ffn_hidden_size, post_layernorm); + return encoder_layer_prim; +} + +CNodePtr DecoderLayerFusion::CreateMaskedDecoderLayerFusionNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv, + const AnfNodePtr &node, + bool post_layernorm = true) const { + std::cout << "CreateMaskedDecoderLayerFusionNode" << std::endl; + MS_ASSERT(func_graph != nullptr); + MS_ASSERT(equiv != nullptr); + MS_ASSERT(node != nullptr); + // bool is_position_bias = false; + auto input = utils::cast((*equiv)[input_]); + std::cout << "input" << std::endl; + AnfNodePtr position_bias, input_mask, bias_attn_o, bias_attn_qkv, beta1, beta2, bias_m, bias_p; + + auto weight_qkv = utils::cast((*equiv)[weight_attn_qkv_]); + std::cout << "CreateMaskedDecoderLayerFusionNode" << std::endl; + auto weight_attn_o = utils::cast((*equiv)[weight_attn_o_]); + std::cout << "weight_attn_o" << std::endl; + auto weight_m = utils::cast((*equiv)[weight_m_]); + std::cout << "weight_m" << std::endl; + auto weight_p = utils::cast((*equiv)[weight_p_]); + std::cout << "weight_p" << std::endl; + bias_attn_qkv = utils::cast((*equiv)[bias_attn_qkv_]); + bias_attn_o = 
utils::cast((*equiv)[bias_attn_o_]); + bias_m = utils::cast((*equiv)[bias_m_]); + bias_p = utils::cast((*equiv)[bias_p_]); + beta1 = utils::cast((*equiv)[beta1_]); + std::cout << "beta1" << std::endl; + beta2 = utils::cast((*equiv)[beta2_]); + std::cout << "beta2" << std::endl; + + auto gamma1 = utils::cast((*equiv)[gamma1_]); + std::cout << "gamma1" << std::endl; + auto gamma2 = utils::cast((*equiv)[gamma2_]); + std::cout << "gamma2" << std::endl; + if (mask_) { + input_mask = utils::cast((*equiv)[mask_]); + } + std::cout << "input_mask" << std::endl; + // auto get_item_node = CreateOutputGetItem(func_graph, new_node, 0); + // if (get_item_node == nullptr) { + // MS_LOG(ERROR) << "create decoder_layer output get_item node failed"; + // return nullptr; + // } + auto base_shape_ptr = weight_m->Shape(); + MS_EXCEPTION_IF_NULL(base_shape_ptr); + auto input_shape_ptr = base_shape_ptr->cast(); + MS_EXCEPTION_IF_NULL(input_shape_ptr); + auto input_shape = input_shape_ptr->shape(); + MS_ASSERT(input_shape != nullptr); + int ffn_hidden_size = (int64_t)input_shape[1]; // TODO + auto decoder_layer_prim = CreatePrim(func_graph, equiv, post_layernorm, ffn_hidden_size); + MS_CHECK_TRUE_RET(decoder_layer_prim != nullptr, nullptr); + auto decoder_layer_prim_c = decoder_layer_prim->GetPrim(); + MS_CHECK_TRUE_RET(decoder_layer_prim_c != nullptr, nullptr); + auto value_node = NewValueNode(decoder_layer_prim_c); + MS_CHECK_TRUE_RET(value_node != nullptr, nullptr); + std::cout << "value_node" << std::endl; + std::vector new_node_inputs; + std::vector redundant; + + + // TODO cross & mask + if (!post_layernorm) { + + new_node_inputs = {value_node, input, gamma1, beta1, weight_qkv, bias_attn_qkv, + input_mask, weight_attn_o, bias_attn_o, gamma2, beta2, weight_m, + bias_m, weight_p, bias_p}; + } else { + + new_node_inputs = {value_node, input, weight_qkv, bias_attn_qkv, input_mask, + weight_attn_o, bias_attn_o, gamma1, beta1, weight_m, + bias_m, weight_p, bias_p, gamma2, beta2}; + } + + auto new_node = func_graph->NewCNode(new_node_inputs); + MS_CHECK_TRUE_RET(new_node != nullptr, nullptr); + auto old_node = node->cast(); + MS_CHECK_TRUE_RET(old_node->abstract() != nullptr, nullptr); + new_node->set_abstract(old_node->abstract()->Clone()); + new_node->set_fullname_with_scope(node->fullname_with_scope() + "/decoder_layer"); + + // RemoveRedundantInput(func_graph, redundant); + std::cout << "RemoveRedundantInput" << std::endl; + return new_node; +} +} // namespace mindspore::opt \ No newline at end of file diff --git a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h new file mode 100644 index 00000000000..7ccf5078050 --- /dev/null +++ b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h @@ -0,0 +1,83 @@ + // /** +// * Copyright 2021 Huawei Technologies Co., Ltd +// * +// * Licensed under the Apache License, Version 2.0 (the "License"); +// * you may not use this file except in compliance with the License. +// * You may obtain a copy of the License at +// * +// * http://www.apache.org/licenses/LICENSE-2.0 +// * +// * Unless required by applicable law or agreed to in writing, software +// * distributed under the License is distributed on an "AS IS" BASIS, +// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// * See the License for the specific language governing permissions and +// * limitations under the License. 
+// */ +#ifndef MINDSPORE_LITE_TOOLS_OPTIMIZER_FUSION_DECODERLAYER_FUSION_H_ +#define MINDSPORE_LITE_TOOLS_OPTIMIZER_FUSION_DECODERLAYER_FUSION_H_ + +#include +#include +#include +#include +#include "tools/optimizer/common/multiple_pattern_process_pass.h" +#include "include/common/utils/utils.h" +#include "include/errorcode.h" +#include "ops/decoder_layer.h" // +#include "multi_head_attention_fusion.h" +#include "ops/fusion/layer_norm_fusion.h" // + +namespace mindspore { +namespace opt { +class DecoderLayerFusion : public MultiplePatternProcessPass { + public: + explicit DecoderLayerFusion(const std::string &name = "DecoderLayerFusion", bool multigraph = true) + : MultiplePatternProcessPass(name, multigraph) {} + + ~DecoderLayerFusion() override = default; + + AnfNodePtr Process(const std::string &pattern_name, const FuncGraphPtr &, const AnfNodePtr &, + const EquivPtr &) const override; + std::unordered_map DefinePatterns() const override; +protected: + virtual bool Init() const; + + // create multi-head-attention without mask + virtual std::shared_ptr BuildDecoderLayerFusionPrim(const EquivPtr &equiv) const; + + private: + VectorRef DefinePatternDecoderLayer(bool post_layernorm,bool layernorm_fusion) const; + VectorRef getTuple(bool post_layernorm, bool layernorm_fusion) const; + VectorRef DefineLayerNorm(VectorRef input, VarPtr gamma, VarPtr beta) const; + CNodePtr CreateMaskedDecoderLayerFusionNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv,const AnfNodePtr &node, bool post_layernorm ) const; + std::shared_ptr CreatePrim(const FuncGraphPtr &func_graph, const EquivPtr &equiv, + bool post_layernorm, int64_t ffn_hidden_size) const; + lite::STATUS CheckPattern(const FuncGraphPtr &func_graph, const EquivPtr &equiv, int *head_num, + int *head_size, float *eps1, float *eps2) const; + protected: + const std::string kPatternDecoderLayer = "PatternDecoderLayer"; + mutable VarPtr input_{nullptr}; + mutable VarPtr position_bias_{nullptr}; + mutable VarPtr beta1_{nullptr}; + mutable VarPtr gamma1_{nullptr}; + mutable VarPtr beta2_{nullptr}; + mutable VarPtr gamma2_{nullptr}; + mutable VarPtr weight_attn_qkv_{nullptr}; + mutable VarPtr weight_attn_qkv_cross_{nullptr}; + mutable VarPtr weight_attn_o_{nullptr}; + mutable VarPtr weight_m_{nullptr}; + mutable VarPtr weight_p_{nullptr}; + mutable VarPtr bias_attn_qkv_{nullptr}; + mutable VarPtr bias_attn_o_{nullptr}; + mutable VarPtr bias_m_{nullptr}; + mutable VarPtr bias_p_{nullptr}; + mutable VarPtr mask_{nullptr}; + mutable VarPtr is_attention_{nullptr}; + mutable VarPtr reshape_k_{nullptr}; + mutable VarPtr reshape_v_{nullptr}; + mutable VarPtr is_layernorm1_{nullptr}; + mutable VarPtr is_layernorm2_{nullptr}; + }; +} // namespace opt +} // namespace mindspore +#endif // MINDSPORE_LITE_TOOLS_OPTIMIZER_FUSION_DECODERLAYER_FUSION_H_ \ No newline at end of file diff --git a/trc/transformer/cfg_bert.config b/trc/transformer/cfg_bert.config index b91785a80ab..46d7db91648 100755 --- a/trc/transformer/cfg_bert.config +++ b/trc/transformer/cfg_bert.config @@ -1,2 +1,2 @@ [gpu_context] -input_shape=input_ids:[16,128];token_type_ids:[16,128];input_mask:[16,128] +input_shape=input_ids:[mha_cross,128];token_type_ids:[mha_cross,128];input_mask:[mha_cross,128] -- Gitee From f01df8258407e1e65288feae7134d8ce7b7e9711 Mon Sep 17 00:00:00 2001 From: shira zaloshinki Date: Mon, 26 Dec 2022 12:58:36 +0200 Subject: [PATCH 04/39] check if act gelu --- .../optimizer/fusion/encoder_layer_fusion.cc | 45 +++++++++- .../optimizer/fusion/encoder_layer_fusion.h | 2 + 
trc/transformer/cfg_bert.config | 2 +- trc/transformer/deploy.sh | 10 +-- trc/transformer/models.txt | 11 +-- trc/transformer/train_transformer_export.py | 85 ++++++++++++++++++- 6 files changed, 140 insertions(+), 15 deletions(-) diff --git a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc index 6f260809450..8aa6e09f966 100755 --- a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc @@ -70,6 +70,8 @@ bool EncoderLayerFusion::Init() const { MS_CHECK_TRUE_RET(is_layernorm2_ != nullptr, false); position_bias_ = std::make_shared("position_bias"); MS_CHECK_TRUE_RET(is_layernorm2_ != nullptr, false); + is_act_ = std::make_shared(std::bind(IsOpType, p1, prim::kPrimActivation), "activation"); + MS_CHECK_TRUE_RET(is_act_ != nullptr, {}); return true; } @@ -228,9 +230,8 @@ VectorRef EncoderLayerFusion::DefinePatternEncoderLayer(bool post_layernorm = tr reshape2 = VectorRef({is_reshape2, add, var2}); matmul1 = VectorRef({is_matmul1, tuple2, weight_m_, bias_m_}); } - auto is_act = std::make_shared(std::bind(IsOpType, p1, prim::kPrimActivation), "acrivation"); - MS_CHECK_TRUE_RET(is_act != nullptr, {}); - auto act = VectorRef({is_act, matmul1}); + + auto act = VectorRef({is_act_, matmul1}); auto is_matmul2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimMatMulFusion), "is_matmul2"); MS_CHECK_TRUE_RET(is_matmul2 != nullptr, {}); if (is_position_bias) { @@ -321,6 +322,41 @@ AnfNodePtr EncoderLayerFusion::Process(const std::string &pattern_name, const mi return nullptr; } + +bool IsActGELU(const FuncGraphPtr &func_graph,const EquivPtr &equiv, const VarPtr &input_prim) { + if ((*equiv)[input_prim] == nullptr || !utils::isa((*equiv)[input_prim])) { + printf("is_attention_ is not AnfNodePtr"); + return RET_ERROR; + } + AnfNodePtr node = utils::cast((*equiv)[input_prim]); + MS_ASSERT(node != nullptr); + if (node == nullptr || !utils::isa(node)) { + auto manager = func_graph->manager(); + if (manager == nullptr) { + return RET_ERROR; + } + auto users = manager->node_users(); + auto it = users.find(node); + if (it != users.end()) { + node = it->second.front().first; + } + if (node == nullptr || !utils::isa(node)) { + return RET_ERROR; + } + } + MS_ASSERT(equiv != nullptr && input_prim); + auto act_node = utils::cast(node); + MS_ASSERT(act_node != nullptr); + auto act_input = act_node->input(0); + MS_ASSERT(act_input != nullptr); + auto act_primitive = ops::GetOperator(act_input); + MS_CHECK_TRUE_RET(act_primitive != nullptr, false); + auto act_primitive_c = act_primitive->GetPrim(); + if (act_primitive_c->GetAttr(ops::kActivationType) == nullptr || act_primitive->get_activation_type() != mindspore::GELU) { + return false; + } + return true; +} std::shared_ptr EncoderLayerFusion::BuildEncoderLayerFusionPrim(const EquivPtr &equiv) const { MS_ASSERT(equiv != nullptr); auto enoder_layer_prim = std::make_shared(); @@ -458,6 +494,9 @@ STATUS EncoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const Eq *eps2 = layrn2_prim->get_epsilon(); } } + if (!IsActGELU(func_graph,equiv, is_act_)) { + return false; + } return RET_OK; } STATUS EncoderLayerFusion::RemoveRedundantInput(const FuncGraphPtr &func_graph, diff --git a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h index b488ed9167e..331eca44550 100755 --- a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h +++ 
b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h @@ -26,6 +26,7 @@ #include "ops/encoder_layer.h" #include "multi_head_attention_fusion.h" #include "ops/fusion/layer_norm_fusion.h" +#include "ops/fusion/activation.h" namespace mindspore { namespace opt { @@ -71,6 +72,7 @@ class EncoderLayerFusion : public MultiplePatternProcessPass { const std::string kPatternEncoderLayerT5 = "PatternEncoderLayerT5"; mutable VarPtr input_{nullptr}; mutable VarPtr position_bias_{nullptr}; + mutable VarPtr is_act_{nullptr}; mutable VarPtr beta1_{nullptr}; mutable VarPtr gamma1_{nullptr}; mutable VarPtr beta2_{nullptr}; diff --git a/trc/transformer/cfg_bert.config b/trc/transformer/cfg_bert.config index 0138e0d6bbb..99e4f5bd9ab 100755 --- a/trc/transformer/cfg_bert.config +++ b/trc/transformer/cfg_bert.config @@ -1,2 +1,2 @@ [gpu_context] -input_shape=input_ids:[1,128];token_type_ids:[1,128];input_mask:[1,128] \ No newline at end of file +input_shape=input_ids:[transformer_encoder_layer,128];token_type_ids:[transformer_encoder_layer,128];input_mask:[transformer_encoder_layer,128] diff --git a/trc/transformer/deploy.sh b/trc/transformer/deploy.sh index 5fe08c79245..f4a21f02f57 100755 --- a/trc/transformer/deploy.sh +++ b/trc/transformer/deploy.sh @@ -4,16 +4,16 @@ version=$(cat ${base}/version.txt) system=${base}/trc/system_test/release/ubuntu_x86/mindspore-lite-${version}-linux-x64 benchmark=${system}/tools/benchmark/benchmark server=caspi -gpu_id=3 +gpu_id=2 # move files to caspi model=${1%.mindir} model=${model#convv_} model=$(echo ${model}| sed 's/_fwd//') batch_size=$(echo ${model}| sed 's/bert//') echo "model=${model}" - model_name=$(echo ${model}) -if [[ $model == bert* ]];then -model_name=$(echo ${model}| sed 's/[[:digit:]]//') +model_name=$(echo ${model}) +if [[ "$batch_size" != "${model}" ]];then + model_name='bert' fi if [ "${batch_size}" == "" ] then @@ -34,7 +34,7 @@ rsync -v $1 ${server}:$(realpath $1) rsync -v ${benchmark} ${server}:${benchmark} rsync -vl ${system}/runtime/lib/* ${server}:${system}/runtime/lib/ rsync -vl ${system}/tools/converter/lib/* ${server}:${system}/tools/converter/lib/ -#echo -e "[gpu_context]\ninput_shape=input_ids:[${batch_size},128];token_type_ids:[${batch_size},128];input_mask:[${batch_size},128]" > ./cfg_bert.config +echo -e "[gpu_context]\ninput_shape=input_ids:[${batch_size},128];token_type_ids:[${batch_size},128];input_mask:[${batch_size},128]" > ./cfg_bert.config rsync -v cfg_${model_name}.config ${server}:$(realpath "cfg_${model_name}.config") # this should be more general ! 
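Editor's note: the models.txt entries below drive the export and benchmark scripts; each line is a set of flags describing one test configuration (batch size, heads, hidden size, sequence length, post-layernorm, FFN size) followed by the model name. As a rough, non-authoritative sketch of how such an entry maps onto the keyword arguments used in train_transformer_export.py, the following Python snippet parses one line; the meanings of -l, -p and -x are assumptions inferred from surrounding comments, and the actual parsing in ftBench.py may differ.

import argparse

def parse_model_line(line: str) -> argparse.Namespace:
    # Flag names mirror models.txt; dest names mirror train_transformer_export.py.
    parser = argparse.ArgumentParser()
    parser.add_argument('-b', dest='batch', type=int)            # batch size
    parser.add_argument('-l', dest='loop', type=int)             # loop count (assumed)
    parser.add_argument('-s', dest='seq', type=int)              # source sequence length
    parser.add_argument('-t', dest='tgt_seq_len', type=int)      # target sequence length
    parser.add_argument('-H', dest='head_num', type=int)         # attention heads
    parser.add_argument('-S', dest='hid_size', type=int)         # hidden size
    parser.add_argument('-f', dest='ffn_hidden_size', type=int)  # FFN hidden size
    parser.add_argument('-P', dest='post_layernorm', type=int)   # 1 = post-layernorm residual
    parser.add_argument('-p', dest='position_bias', type=int)    # position bias (assumed)
    parser.add_argument('-x', dest='cross', type=int)            # cross attention (assumed)
    parser.add_argument('-m', dest='model', type=str)            # model/test name
    return parser.parse_args(line.split())

# Example: the decoder-layer entry added in this patch series.
cfg = parse_model_line('-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_decoder_layer')
assert cfg.model == 'transformer_decoder_layer' and cfg.head_num == 12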
diff --git a/trc/transformer/models.txt b/trc/transformer/models.txt index 66ae10d56a7..10444fc2d8c 100755 --- a/trc/transformer/models.txt +++ b/trc/transformer/models.txt @@ -1,16 +1,17 @@ #run the following tests before push -#-b 1 -l 66 -s 128 -H 12 -S 768 -p 0 -m mha_x1 -#-b 1 -l 66 -s 128 -t 256 -H 12 -S 768 -p 0 -m mha_cross -#-b 1 -l 66 -s 20 -H 4 -S 768 -p 0 -m mha_T5 -#-b 1 -l 66 -s 20 -t 40 -H 4 -S 768 -p 0 -m mha_T5_cross +-b 1 -l 66 -s 128 -H 12 -S 768 -p 0 -m mha_x1 +-b 1 -l 66 -s 128 -t 256 -H 12 -S 768 -p 0 -m mha_cross +-b 1 -l 66 -s 20 -H 4 -S 768 -p 0 -m mha_T5 +-b 1 -l 66 -s 20 -t 40 -H 4 -S 768 -p 0 -m mha_T5_cross #-b 1 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer #-b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer #-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer #-b 8 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer --b 32 -l 12 -H 12 -S 768 -s 128 -P 0 -f 3072 -m bert +#-b 32 -l 12 -H 12 -S 768 -s 128 -P 0 -f 3072 -m bert #-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -f 3072 -m bert #-b 1 -l 66 -s 20 -H 3 -S 15 -p 0 -m mha_x1 +-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_decoder_layer #-b 1 -l 66 -s 1 -H 8 -S 512 -p 0 -m mha_x1 #-b 3 -l 66 -s 20 -H 3 -S 15 -p -m mha_x2 #-b 3 -l 66 -s 20 -t 40 -H 3 -S 15 -p 0 -m mha_x1 diff --git a/trc/transformer/train_transformer_export.py b/trc/transformer/train_transformer_export.py index d969e1ad51f..60087f2d732 100755 --- a/trc/transformer/train_transformer_export.py +++ b/trc/transformer/train_transformer_export.py @@ -11,7 +11,7 @@ model_zoo_path=os.environ['CLOUD_MODEL_ZOO'] sys.path.append(model_zoo_path) sys.path.append("../../../transformer/transformer/models") sys.path.append("./T5") -from MultiHeadTester import MultiHeadAttentionX,TransformerEncoderLayerX,FeedForwardX +from MultiHeadTester import MultiHeadAttentionX, TransformerDecoderLayerX,TransformerEncoderLayerX,FeedForwardX from mindspore.common.parameter import Parameter from mindspore.common.initializer import Tensor import mindspore as M @@ -302,6 +302,89 @@ def transformer_encoder_layer_create(): # elif app=="trc": saveT(y, name + "_output1.fp" + suffix) + + + +def transformer_decoder_layer_create(): + name = "transformer_decoder_layer" + if (post_layernorm): + model = TransformerDecoderLayerX(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, + tgt_seq_length=tgt_seq_len,num_heads=head_num, post_layernorm_residual=True) + else: + model = TransformerDecoderLayerX(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, + tgt_seq_length=tgt_seq_len,num_heads=head_num) + hidden_stats = M.Tensor(np.random.normal(0., 0.5, (batch, tgt_seq_len, hid_size)), M.float32) + decoder_mask = M.Tensor(np.random.normal(0., 0.5, (batch, seq, seq)), M.float32) + encoder_output = M.Tensor(np.random.normal(0., 0.5, (batch, seq, hid_size)), M.float32) + memory_mask = M.Tensor(np.random.normal(0., 0.5, (batch, tgt_seq_len,seq)), M.float32) + + # q = model.attention.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size + # k = model.attention.dense2.weight.asnumpy()#.transpose() + # v = model.attention.dense3.weight.asnumpy()#.transpose() + + # w = np.concatenate((q, k, v)) # 3xhid_size x hid_size + # w = w.transpose() # hid_size x 3xhid_size + # wt = M.Tensor(w, w_compute_type) + # bq = model.attention.dense1.bias.asnumpy() + # bk = model.attention.dense2.bias.asnumpy() + # bv = model.attention.dense3.bias.asnumpy() + # bw = np.concatenate((bq, bk, 
bv)) #(3xhid) X 1 + # bt =M.Tensor(bw, w_compute_type) + # print('bt=',bt) + # wp = model.attention.projection.weight + # bp = model.attention.projection.bias + + # omw = model.output.mapping.weight + # opw = model.output.projection.weight + # omb = model.output.mapping.bias + # opb = model.output.projection.bias + + # gl1 = model.layernorm1.gamma + # bl1 = model.layernorm1.beta + # gl2 = model.layernorm2.gamma + # bl2 = model.layernorm2.beta + + suffix = str(compute_type) + suffix = suffix[-2:] + saveT(hidden_stats, name + "_input1.fp" + suffix) + saveT(decoder_mask, name + "_input2.fp" + suffix) + saveT(encoder_output, name + "_input3.fp" + suffix) + saveT(memory_mask, name + "_input4.fp" + suffix) + + # saveT(gl1, name + "_weight1.fp" + suffix) + # saveT(bl1, name + "_weight2.fp" + suffix) + # saveT(wt, name + "_weight3.fp" + suffix) + # saveT(bt, name + "_weight4.fp" + suffix) + # saveT(wp, name + "_weight5.fp" + suffix) + # saveT(bp, name + "_weight6.fp" + suffix) + # saveT(gl2, name + "_weight7.fp" + suffix) + # saveT(bl2, name + "_weight8.fp" + suffix) + # # if app == 'trc': + # # saveTensorToHalf(omw, name + "_weight9.fp" + "16") + # # saveTensorToHalf(omb, name + "_weight10.fp" + "16") + # # saveTensorToHalf(opw, name + "_weight11.fp" + "16") + # # elif app == 'ch': + # saveT(omw, name + "_weight9.fp" + suffix) + # saveT(omb, name + "_weight10.fp" + suffix) + # saveT(opw, name + "_weight11.fp" + suffix) + # saveT(opb, name + "_weight12.fp" + suffix) + _cell_graph_executor.compile(model, hidden_stats, decoder_mask, encoder_output, memory_mask) + y = model(hidden_stats, decoder_mask, encoder_output, memory_mask) + export(model, hidden_stats, decoder_mask, encoder_output, memory_mask, file_name= name + "_fwd", file_format='MINDIR') + # if app=="ch": + f_y=open(f'./{name}_output.txt','w') + # # out_name=get_output_encoder_layer(name + "_fwd.mindir") + # # print("name output:",out_name) + saveCalib("output1", np.array(y[0]), f_y)#2 dims + # # print("y.shpae",np.array(y).shape) + # # saveCalib('Default/Add-op267', y, f_y)#2 dims + f_y.close() + # # saveCalib('Default/Reshape-op296', np.array(y), f_y)#2 dims + # # elif app=="trc": + # saveT(y, name + "_output1.fp" + suffix) + + + def build_transformer_encoder_layer_post_ture(): model = TransformerEncoderLayer(batch_size=2, seq_length=16, -- Gitee From 54028d1c429b4231b3a814efd7a9e476221e82ad Mon Sep 17 00:00:00 2001 From: batya kroizer Date: Thu, 29 Dec 2022 12:33:00 +0200 Subject: [PATCH 05/39] fix decoder layer --- .../plugin/device/cpu/kernel/nnacl/op_base.h | 3 + mindspore/core/ops/decoder_layer.cc | 8 + mindspore/core/ops/decoder_layer.h | 6 +- mindspore/core/ops/op_name.h | 1 + mindspore/lite/schema/ops.fbs | 1 + mindspore/lite/src/common/ops/ops_def.cc | 1 + .../lite/tools/converter/anf_transform.cc | 2 + .../fusion/multi_head_attention_fusion.cc | 2 +- trc/transformer/cfg_bert.config | 2 +- trc/transformer/deploy.sh | 1 + trc/transformer/ftBench.py | 12 +- trc/transformer/models.txt | 11 +- trc/transformer/t.config | 2 +- trc/transformer/train_transformer_export.py | 250 +++++++++++++++--- 14 files changed, 249 insertions(+), 53 deletions(-) diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h index 95acb50e885..4c0bce6ef42 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h @@ -42,8 +42,11 @@ #define C13NUM 13 #define C14NUM 14 #define C16NUM 16 +#define C18NUM 18 #define 
C20NUM 20 #define C21NUM 21 +#define C22NUM 22 +#define C23NUM 23 #define C24NUM 24 #define C28NUM 28 #define C32NUM 32 diff --git a/mindspore/core/ops/decoder_layer.cc b/mindspore/core/ops/decoder_layer.cc index 7a98395b84f..baedf3ec902 100644 --- a/mindspore/core/ops/decoder_layer.cc +++ b/mindspore/core/ops/decoder_layer.cc @@ -38,6 +38,10 @@ void DecoderLayer::set_post_layernorm(bool post_layernorm) { void DecoderLayer::set_eps_layernorm2(float eps_layernorm2) { (void)this->AddAttr(kDecoderLayerEpsLayerNorm2, api::MakeValue(eps_layernorm2)); +} +void DecoderLayer::set_eps_layernorm3(float eps_layernorm3) { + (void)this->AddAttr(kDecoderLayerEpsLayerNorm3, api::MakeValue(eps_layernorm3)); + } void DecoderLayer::set_ffn_hidden_size(int64_t ffn_hidden_size){ (void)this->AddAttr(kDecoderLayerFfnHiddenSize, api::MakeValue(ffn_hidden_size)); @@ -67,6 +71,10 @@ float DecoderLayer::get_eps_layernorm2() const{ auto value_ptr = this->GetAttr(kDecoderLayerEpsLayerNorm2); return GetValue(value_ptr); } +float DecoderLayer::get_eps_layernorm3() const{ + auto value_ptr = this->GetAttr(kDecoderLayerEpsLayerNorm3); + return GetValue(value_ptr); +} int64_t DecoderLayer::get_ffn_hidden_size() const { auto value_ptr = this->GetAttr(kDecoderLayerFfnHiddenSize); return GetValue(value_ptr); diff --git a/mindspore/core/ops/decoder_layer.h b/mindspore/core/ops/decoder_layer.h index df843b31c7e..c09ce14b7d5 100644 --- a/mindspore/core/ops/decoder_layer.h +++ b/mindspore/core/ops/decoder_layer.h @@ -41,8 +41,10 @@ class MIND_API DecoderLayer : public BaseOperator { /// \param[in] head_size Define size per head. /// \param[in] eps_layernorm1 Define eps layernorm1. /// \param[in] eps_layernorm2 Define eps layernorm2. + /// \param[in] eps_layernorm3 Define eps layernorm3. /// \param[in] ffn_hidden_size Define ffn hidden size. - /// \param[in] position_bias Define ffn position_bias. + /// \param[in] position_bias1 Define ffn position_bias1. + /// \param[in] position_bias2 Define ffn position_bias2. 
void Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, int64_t ffn_hidden_size, bool position_bias1, bool position_bias2, bool post_layernorm); void set_head_num(int64_t head_num); @@ -50,6 +52,7 @@ class MIND_API DecoderLayer : public BaseOperator { void set_post_layernorm(bool post_layernorm); void set_eps_layernorm1(float eps_layernorm1); void set_eps_layernorm2(float eps_layernorm2); + void set_eps_layernorm3(float eps_layernorm2); void set_ffn_hidden_size(int64_t ffn_hidden_size); void set_position_bias1(bool position_bias1); void set_position_bias2(bool position_bias2); @@ -58,6 +61,7 @@ class MIND_API DecoderLayer : public BaseOperator { bool get_post_layernorm() const; float get_eps_layernorm1() const; float get_eps_layernorm2() const; + float get_eps_layernorm3() const; int64_t get_ffn_hidden_size() const; bool get_position_bias1() const; bool get_position_bias2() const; diff --git a/mindspore/core/ops/op_name.h b/mindspore/core/ops/op_name.h index bbee92a21d3..debbd01c3c0 100644 --- a/mindspore/core/ops/op_name.h +++ b/mindspore/core/ops/op_name.h @@ -384,6 +384,7 @@ constexpr auto kDecoderLayerPostLayernorm = "post_layernorm"; constexpr auto kDecoderLayerFfnHiddenSize = "ffn_hidden_size"; constexpr auto kDecoderLayerEpsLayerNorm1 = "eps_layernorm1"; constexpr auto kDecoderLayerEpsLayerNorm2 = "eps_layernorm2"; +constexpr auto kDecoderLayerEpsLayerNorm3 = "eps_layernorm3"; constexpr auto kDecoderLayerPositionBias1 = "position_bias1"; constexpr auto kDecoderLayerPositionBias2 = "position_bias2"; constexpr auto kPositionBias = "position_bias"; diff --git a/mindspore/lite/schema/ops.fbs b/mindspore/lite/schema/ops.fbs index 8616976c161..dcf02a87f62 100644 --- a/mindspore/lite/schema/ops.fbs +++ b/mindspore/lite/schema/ops.fbs @@ -1315,6 +1315,7 @@ table DecoderLayer { post_layernorm: bool; eps_layernorm1: float; eps_layernorm2: float; + eps_layernorm3: float; ffn_hidden_size: long; position_bias1: bool; position_bias2: bool; diff --git a/mindspore/lite/src/common/ops/ops_def.cc b/mindspore/lite/src/common/ops/ops_def.cc index 79905910bc8..d3142f81684 100644 --- a/mindspore/lite/src/common/ops/ops_def.cc +++ b/mindspore/lite/src/common/ops/ops_def.cc @@ -1315,6 +1315,7 @@ OP_ATTR(head_size, long); OP_ATTR(post_layernorm, bool) OP_ATTR(eps_layernorm1, float) OP_ATTR(eps_layernorm2, float) +OP_ATTR(eps_layernorm3, float) OP_ATTR(ffn_hidden_size, long) OP_ATTR(position_bias1, bool) OP_ATTR(position_bias2, bool) diff --git a/mindspore/lite/tools/converter/anf_transform.cc b/mindspore/lite/tools/converter/anf_transform.cc index d0579a0afdf..0f080aa646e 100644 --- a/mindspore/lite/tools/converter/anf_transform.cc +++ b/mindspore/lite/tools/converter/anf_transform.cc @@ -52,6 +52,7 @@ #include "tools/optimizer/fusion/tensor_dot_fusion.h" #include "tools/optimizer/fusion/multi_head_attention_fusion.h" #include "tools/optimizer/fusion/encoder_layer_fusion.h" +#include "tools/optimizer/fusion/decoder_layer_fusion.h" #include "tools/optimizer/fusion/glu_fusion.h" #include "tools/optimizer/fusion/tflite_rel_pos_multi_head_attention_fusion.h" @@ -319,6 +320,7 @@ int AnfTransform::RunFusionPass(const FuncGraphPtr &old_graph, const std::shared #ifdef ENABLE_CLOUD_FUSION_INFERENCE fusions.push_back(std::make_shared()); fusions.push_back(std::make_shared()); + fusions.push_back(std::make_shared()); #endif for (size_t index = 0; index < fusions.size(); index++) { auto pass_ptr = fusions.at(index); diff --git 
a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc index ebae28b78e2..b39baa2ddaa 100755 --- a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc @@ -837,7 +837,7 @@ CNodePtr MultiHeadAttentionFusion::CreateMaskedMultiHeadAttentionNode(const Func ret = FetchShapeFromAbstract(input_v->abstract(), &inputv_shape); MS_CHECK_TRUE_RET(ret == RET_OK, nullptr); // test for cross - if ((inputq_shape != inputv_shape) || ((match_count_ > 1) && (input_q != input_v))) { + if ((inputq_shape != inputv_shape) || (input_q != input_v)) { cross = true; } if (!cross && !t5_x_) { diff --git a/trc/transformer/cfg_bert.config b/trc/transformer/cfg_bert.config index 46d7db91648..cc543ad3d77 100755 --- a/trc/transformer/cfg_bert.config +++ b/trc/transformer/cfg_bert.config @@ -1,2 +1,2 @@ [gpu_context] -input_shape=input_ids:[mha_cross,128];token_type_ids:[mha_cross,128];input_mask:[mha_cross,128] +input_shape=input_ids:[transformer_decoder_layer,128];token_type_ids:[transformer_decoder_layer,128];input_mask:[transformer_decoder_layer,128] diff --git a/trc/transformer/deploy.sh b/trc/transformer/deploy.sh index f4a21f02f57..d2a104eb521 100755 --- a/trc/transformer/deploy.sh +++ b/trc/transformer/deploy.sh @@ -5,6 +5,7 @@ system=${base}/trc/system_test/release/ubuntu_x86/mindspore-lite-${version}-linu benchmark=${system}/tools/benchmark/benchmark server=caspi gpu_id=2 + # move files to caspi model=${1%.mindir} model=${model#convv_} diff --git a/trc/transformer/ftBench.py b/trc/transformer/ftBench.py index 522f2450b8f..4a0886ca9c1 100755 --- a/trc/transformer/ftBench.py +++ b/trc/transformer/ftBench.py @@ -118,12 +118,12 @@ for line_model_arg in models_arg: os.system('./trc/release.sh x86') os.system(f"cd {benchmark} && CUDA_VISIBLE_DEVICES={cuda_visible_dev} LD_LIBRARY_PATH={system}/runtime/lib:{system}/tools/converter/lib ./benchmark {benchmark_args}" ) else: - - with open(f'cfg_{model_name}.config','w') as f: - if model_name == 'bert': - f.write(f"[gpu_context]\ninput_shape=input_ids:[{batch_size},{seq}];token_type_ids:[{batch_size},{seq}];input_mask:[{batch_size},{seq}]") - elif model_name == 'transformer_encoder_layer': - f.write(f"[gpu_context]\ninput_shape=x:[{batch_size},{seq},{hidden_size}];input_mask:[{batch_size},{seq},{seq}]") + if model_name in ['bert','transformer_encoder_layer']: + with open(f'cfg_{model_name}.config','w') as f: + if model_name == 'bert': + f.write(f"[gpu_context]\ninput_shape=input_ids:[{batch_size},{seq}];token_type_ids:[{batch_size},{seq}];input_mask:[{batch_size},{seq}]") + elif model_name == 'transformer_encoder_layer': + f.write(f"[gpu_context]\ninput_shape=x:[{batch_size},{seq},{hidden_size}];input_mask:[{batch_size},{seq},{seq}]") os.system(f"ssh {server} 'rm -f {system}/../mindspore-lite-{version}-linux-x64.tar.gz {work_dir}/*{model_name}*'") os.system(f"ssh {server} 'mkdir -p {benchmark}'") os.system(f"rsync -v {system}/../mindspore-lite-{version}-linux-x64.tar.gz {server}:{system}/..") diff --git a/trc/transformer/models.txt b/trc/transformer/models.txt index 861ae588665..e88dab61233 100755 --- a/trc/transformer/models.txt +++ b/trc/transformer/models.txt @@ -9,6 +9,9 @@ #-b 8 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer #-b 32 -l 12 -H 12 -S 768 -s 128 -P 0 -f 3072 -m bert #-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -f 3072 -m bert + +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m 
transformer_encoder_layer + #-b 1 -l 66 -s 20 -H 3 -S 15 -p 0 -m mha_x1 #-b 1 -l 66 -s 1 -H 8 -S 512 -p 0 -m mha_x1 @@ -18,9 +21,13 @@ #-b 1 -l 6 -s 8 -H 8 -S 1024 - 0 -m T5 #-b 1 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -m transformer_encoder_layer --b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_decoder_layer +#-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_decoder_layer + +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P False -m transformer_encoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -m transformer_decoder_layer_t5 +#-b 1 -l 66 -s 128 -t 128 -H 12 -S 768 -p 0 -m mha_cross +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -m transformer_encoder_layer #-b 1 -l 2 -H 2 -S 8 -s 20 -f 1024 -P True -m bert #-b 1 -l 12 -H 2 -S 8 -s 20 -m T5 #-b 1 -l 2 -H 2 -S 8 -s 20 -f 1024 -P True -m bert diff --git a/trc/transformer/t.config b/trc/transformer/t.config index 508acd6ef23..336dbdc28f3 100755 --- a/trc/transformer/t.config +++ b/trc/transformer/t.config @@ -1,3 +1,3 @@ [registry] #fusion_blacklists="MultiHeadAttentionFusion" -#fusion_blacklists="EncoderLayerFusion" +fusion_blacklists="EncoderLayerFusion","DecoderLayerFusion" diff --git a/trc/transformer/train_transformer_export.py b/trc/transformer/train_transformer_export.py index 60087f2d732..407de65b428 100755 --- a/trc/transformer/train_transformer_export.py +++ b/trc/transformer/train_transformer_export.py @@ -105,6 +105,7 @@ eps2=1e-6 post_layernorm=True ffn_hidden_size=-1 app="ch" +ffn_fp16 = False def read_args(): global batch global seq @@ -122,6 +123,7 @@ def read_args(): global in_type global w_type global app + global ffn_fp16 print("sys argv = ", sys.argv) for i in range(len(sys.argv)) : if sys.argv[i] == '-b': @@ -210,7 +212,14 @@ def read_args(): print("error: illegal compute type {}".format(sys.argv[i + 1]) ) else: app = sys.argv[i + 1] - print("app=",app) + elif sys.argv[i] == '-x': + if sys.argv[i + 1] not in ["0", "1"]: + print("error: illegal compute type {}".format(sys.argv[i + 1]) ) + else: + if sys.argv[i + 1]=='0': + ffn_fp16 = False + else: + ffn_fp16 = True size_per_head=hid_size//head_num tgt_seq_len = tgt_seq_len if (tgt_seq_len != -1) else seq ffn_hidden_size = ffn_hidden_size if (ffn_hidden_size != -1) else 4*hid_size @@ -281,11 +290,11 @@ def transformer_encoder_layer_create(): saveT(bp, name + "_weight6.fp" + suffix) saveT(gl2, name + "_weight7.fp" + suffix) saveT(bl2, name + "_weight8.fp" + suffix) - if app == 'trc': + if ffn_fp16 == True: saveTensorToHalf(omw, name + "_weight9.fp" + "16") saveTensorToHalf(omb, name + "_weight10.fp" + "16") saveTensorToHalf(opw, name + "_weight11.fp" + "16") - elif app == 'ch': + else: saveT(omw, name + "_weight9.fp" + suffix) saveT(omb, name + "_weight10.fp" + suffix) saveT(opw, name + "_weight11.fp" + suffix) @@ -305,45 +314,87 @@ def transformer_encoder_layer_create(): -def transformer_decoder_layer_create(): - name = "transformer_decoder_layer" +def transformer_decoder_layer_t5_create(): + name = "transformer_decoder_layer_t5" if (post_layernorm): - model = TransformerDecoderLayerX(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, - tgt_seq_length=tgt_seq_len,num_heads=head_num, post_layernorm_residual=True) + print("post_layernorm true") + model = T5_TF.TransformerDecoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, + 
tgt_seq_length=tgt_seq_len,num_heads=head_num, post_layernorm_residual=True, use_past=False) else: - model = TransformerDecoderLayerX(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, - tgt_seq_length=tgt_seq_len,num_heads=head_num) + print("post_layernorm false") + model = T5_TF.TransformerDecoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, + tgt_seq_length=tgt_seq_len,num_heads=head_num,use_past=False) hidden_stats = M.Tensor(np.random.normal(0., 0.5, (batch, tgt_seq_len, hid_size)), M.float32) decoder_mask = M.Tensor(np.random.normal(0., 0.5, (batch, seq, seq)), M.float32) encoder_output = M.Tensor(np.random.normal(0., 0.5, (batch, seq, hid_size)), M.float32) memory_mask = M.Tensor(np.random.normal(0., 0.5, (batch, tgt_seq_len,seq)), M.float32) - - # q = model.attention.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size - # k = model.attention.dense2.weight.asnumpy()#.transpose() - # v = model.attention.dense3.weight.asnumpy()#.transpose() + pos = M.Tensor(np.random.normal(0., 0.5, (batch, head_num, seq, tgt_seq_len)), M.float32) + encoder_pos = M.Tensor(np.random.normal(0., 0.5, (batch, head_num, seq, tgt_seq_len)), M.float32) + q = model.attention.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size + k = model.attention.dense2.weight.asnumpy()#.transpose() + v = model.attention.dense3.weight.asnumpy()#.transpose() + + w = np.concatenate((q, k, v)) # 3xhid_size x hid_size + w = w.transpose() # hid_size x 3xhid_size + wt = M.Tensor(w, w_compute_type) + bq = model.attention.dense1.bias.asnumpy() + bk = model.attention.dense2.bias.asnumpy() + bv = model.attention.dense3.bias.asnumpy() + bw = np.concatenate((bq, bk, bv)) #(3xhid) X 1 + bt =M.Tensor(bw, w_compute_type) + print('encoder_output=',encoder_output) + wp = model.attention.projection.weight + bp = model.attention.projection.bias + + qt2 = model.cross_attention.dense1.weight#.transpose() # hid_size x hid_size + k2 = model.cross_attention.dense2.weight.asnumpy()#.transpose() + v2 = model.cross_attention.dense3.weight.asnumpy()#.transpose() - # w = np.concatenate((q, k, v)) # 3xhid_size x hid_size - # w = w.transpose() # hid_size x 3xhid_size - # wt = M.Tensor(w, w_compute_type) - # bq = model.attention.dense1.bias.asnumpy() - # bk = model.attention.dense2.bias.asnumpy() - # bv = model.attention.dense3.bias.asnumpy() - # bw = np.concatenate((bq, bk, bv)) #(3xhid) X 1 - # bt =M.Tensor(bw, w_compute_type) - # print('bt=',bt) - # wp = model.attention.projection.weight - # bp = model.attention.projection.bias - - # omw = model.output.mapping.weight - # opw = model.output.projection.weight - # omb = model.output.mapping.bias - # opb = model.output.projection.bias - - # gl1 = model.layernorm1.gamma - # bl1 = model.layernorm1.beta - # gl2 = model.layernorm2.gamma - # bl2 = model.layernorm2.beta + w2 = np.concatenate((k2, v2)) # 3xhid_size x hid_size + w2 = w.transpose() # hid_size x 3xhid_size + wt2 = M.Tensor(w2, w_compute_type) + bq2 = model.cross_attention.dense1.bias.asnumpy() + bk2 = model.cross_attention.dense2.bias.asnumpy() + bv2 = model.cross_attention.dense3.bias.asnumpy() + bw2 = np.concatenate((bq2, bk2, bv2)) #(3xhid) X 1 + bt2 =M.Tensor(bw2, w_compute_type) + wp2 = model.cross_attention.projection.weight + bp2 = model.cross_attention.projection.bias + omw = model.output.mapping.weight + opw = model.output.projection.weight + omb = model.output.mapping.bias + opb = model.output.projection.bias + gl1 = 
model.layernorm1.gamma + bl1 = model.layernorm1.beta + gl2 = model.layernorm2.gamma + bl2 = model.layernorm2.beta + gl3 = model.cross_attention_layernorm.gamma + bl3 = model.cross_attention_layernorm.beta + suffix = str(compute_type) + suffix = suffix[-2:] + + print('qt2=',qt2[0]) + saveT(gl1, name + "_weight1.fp" + suffix) + saveT(bl1, name + "_weight2.fp" + suffix) + saveT(wt, name + "_weight3.fp" + suffix) + saveT(bt, name + "_weight4.fp" + suffix) + saveT(wp, name + "_weight5.fp" + suffix) + saveT(bp, name + "_weight6.fp" + suffix) + saveT(gl2, name + "_weight7.fp" + suffix) + saveT(bl2, name + "_weight8.fp" + suffix) + saveT(qt2, name + "_weight9.fp" + suffix) + saveT(wt2, name + "_weight10.fp" + suffix) + saveT(bt2, name + "_weight11.fp" + suffix) + saveT(wp2, name + "_weight12.fp" + suffix) + saveT(bp2, name + "_weight13.fp" + suffix) + saveT(gl3, name + "_weight14.fp" + suffix) + saveT(bl3, name + "_weight15.fp" + suffix) + saveT(omw, name + "_weight16.fp" + suffix) + saveT(omb, name + "_weight17.fp" + suffix) + saveT(opw, name + "_weight18.fp" + suffix) + saveT(opb, name + "_weight19.fp" + suffix) + suffix = str(compute_type) suffix = suffix[-2:] saveT(hidden_stats, name + "_input1.fp" + suffix) @@ -364,6 +415,123 @@ def transformer_decoder_layer_create(): # # saveTensorToHalf(omb, name + "_weight10.fp" + "16") # # saveTensorToHalf(opw, name + "_weight11.fp" + "16") # # elif app == 'ch': + # saveT(qt2, name + "_weight9.fp" + suffix) + # saveT(wt2, name + "_weight10.fp" + suffix) + # saveT(bt2, name + "_weight11.fp" + suffix) + # saveT(wp2, name + "_weight12.fp" + suffix) + # saveT(bp2, name + "_weight13.fp" + suffix) + # saveT(gl3, name + "_weight14.fp" + suffix) + # saveT(bl3, name + "_weight15.fp" + suffix) + # saveT(omw, name + "_weight16.fp" + suffix) + # saveT(omb, name + "_weight17.fp" + suffix) + # saveT(opw, name + "_weight18.fp" + suffix) + # saveT(opb, name + "_weight19.fp" + suffix) + _cell_graph_executor.compile(model, hidden_stats, decoder_mask, encoder_output, memory_mask)#, pos, encoder_pos) + y = model(hidden_stats, decoder_mask, encoder_output, memory_mask)#, position_bias=pos, encoder_decoder_position_bias = encoder_pos) + export(model, hidden_stats, decoder_mask, encoder_output, memory_mask, file_name= name + "_fwd", file_format='MINDIR') + # if app=="ch": + f_y=open(f'./{name}_output.txt','w') + # # out_name=get_output_encoder_layer(name + "_fwd.mindir") + # # print("name output:",out_name) + saveCalib("output1", np.array(y), f_y)#2 dims + # # print("y.shpae",np.array(y).shape) + # # saveCalib('Default/Add-op267', y, f_y)#2 dims + f_y.close() + # # saveCalib('Default/Reshape-op296', np.array(y), f_y)#2 dims + # # elif app=="trc": + saveT(y, name + "_output1.fp" + suffix) + + +def transformer_decoder_layer_create(): + name = "transformer_decoder_layer" + if (post_layernorm): + print("post_layernorm true") + model = TransformerDecoderLayerX(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, + tgt_seq_length=tgt_seq_len,num_heads=head_num, post_layernorm_residual=True) + else: + print("post_layernorm false") + model = TransformerDecoderLayerX(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, + tgt_seq_length=tgt_seq_len,num_heads=head_num) + hidden_stats = M.Tensor(np.random.normal(0., 0.5, (batch, tgt_seq_len, hid_size)), M.float32) + decoder_mask = M.Tensor(np.random.normal(0., 0.5, (batch, seq, seq)), M.float32) + encoder_output = M.Tensor(np.random.normal(0., 0.5, (batch, seq, 
hid_size)), M.float32) + memory_mask = M.Tensor(np.random.normal(0., 0.5, (batch, tgt_seq_len,seq)), M.float32) + q = model.attention.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size + k = model.attention.dense2.weight.asnumpy()#.transpose() + v = model.attention.dense3.weight.asnumpy()#.transpose() + + w = np.concatenate((q, k, v)) # 3xhid_size x hid_size + w = w.transpose() # hid_size x 3xhid_size + wt = M.Tensor(w, w_compute_type) + bq = model.attention.dense1.bias.asnumpy() + bk = model.attention.dense2.bias.asnumpy() + bv = model.attention.dense3.bias.asnumpy() + bw = np.concatenate((bq, bk, bv)) #(3xhid) X 1 + bt =M.Tensor(bw, w_compute_type) + wp = model.attention.projection.weight + bp = model.attention.projection.bias + + qt2 = model.cross_attention.dense1.weight#.transpose() # hid_size x hid_size + k2 = model.cross_attention.dense2.weight.asnumpy()#.transpose() + v2 = model.cross_attention.dense3.weight.asnumpy()#.transpose() + + w2 = np.concatenate((k2, v2)) # 3xhid_size x hid_size + w2 = w2.transpose() # hid_size x 3xhid_size + wt2 = M.Tensor(w2, w_compute_type) + bq2 = model.cross_attention.dense1.bias.asnumpy() + bk2 = model.cross_attention.dense2.bias.asnumpy() + bv2 = model.cross_attention.dense3.bias.asnumpy() + bw2 = np.concatenate((bq2, bk2, bv2)) #(3xhid) X 1 + bt2 =M.Tensor(bw2, w_compute_type) + wp2 = model.cross_attention.projection.weight + bp2 = model.cross_attention.projection.bias + omw = model.output.mapping.weight + opw = model.output.projection.weight + omb = model.output.mapping.bias + opb = model.output.projection.bias + + gl1 = model.layernorm1.gamma + bl1 = model.layernorm1.beta + gl2 = model.layernorm2.gamma + bl2 = model.layernorm2.beta + gl3 = model.cross_attention_layernorm.gamma + bl3 = model.cross_attention_layernorm.beta + suffix = str(compute_type) + suffix = suffix[-2:] + saveT(hidden_stats, name + "_input1.fp" + suffix) + saveT(decoder_mask, name + "_input2.fp" + suffix) + saveT(encoder_output, name + "_input3.fp" + suffix) + saveT(memory_mask, name + "_input4.fp" + suffix) + + saveT(gl1, name + "_weight1.fp" + suffix) + saveT(bl1, name + "_weight2.fp" + suffix) + saveT(wt, name + "_weight3.fp" + suffix) + saveT(bt, name + "_weight4.fp" + suffix) + saveT(wp, name + "_weight5.fp" + suffix) + saveT(bp, name + "_weight6.fp" + suffix) + saveT(gl2, name + "_weight7.fp" + suffix) + saveT(bl2, name + "_weight8.fp" + suffix) + saveT(qt2, name + "_weight9.fp" + suffix) + saveT(wt2, name + "_weight10.fp" + suffix) + saveT(bt2, name + "_weight11.fp" + suffix) + saveT(wp2, name + "_weight12.fp" + suffix) + saveT(bp2, name + "_weight13.fp" + suffix) + saveT(gl3, name + "_weight14.fp" + suffix) + saveT(bl3, name + "_weight15.fp" + suffix) + if(ffn_fp16): + saveTensorToHalf(omw, name + "_weight16.fp" + "16") + saveTensorToHalf(omb, name + "_weight17.fp" + "16") + saveTensorToHalf(opw, name + "_weight18.fp" + "16") + else: + saveT(omw, name + "_weight16.fp" + suffix) + saveT(omb, name + "_weight17.fp" + suffix) + saveT(opw, name + "_weight18.fp" + suffix) + saveT(opb, name + "_weight19.fp" + suffix) + # # if app == 'trc': + # # saveTensorToHalf(omw, name + "_weight9.fp" + "16") + # # saveTensorToHalf(omb, name + "_weight10.fp" + "16") + # # saveTensorToHalf(opw, name + "_weight11.fp" + "16") + # # elif app == 'ch': # saveT(omw, name + "_weight9.fp" + suffix) # saveT(omb, name + "_weight10.fp" + suffix) # saveT(opw, name + "_weight11.fp" + suffix) @@ -372,18 +540,18 @@ def transformer_decoder_layer_create(): y = model(hidden_stats, decoder_mask, 
encoder_output, memory_mask) export(model, hidden_stats, decoder_mask, encoder_output, memory_mask, file_name= name + "_fwd", file_format='MINDIR') # if app=="ch": + print('y=',y) + print(y) f_y=open(f'./{name}_output.txt','w') # # out_name=get_output_encoder_layer(name + "_fwd.mindir") # # print("name output:",out_name) - saveCalib("output1", np.array(y[0]), f_y)#2 dims + saveCalib("output1", np.array(y), f_y)#2 dims # # print("y.shpae",np.array(y).shape) # # saveCalib('Default/Add-op267', y, f_y)#2 dims f_y.close() # # saveCalib('Default/Reshape-op296', np.array(y), f_y)#2 dims # # elif app=="trc": - # saveT(y, name + "_output1.fp" + suffix) - - + saveT(y, name + "_output1.fp" + suffix) def build_transformer_encoder_layer_post_ture(): model = TransformerEncoderLayer(batch_size=2, @@ -421,6 +589,7 @@ def test_multihead_attention(): def saveT(t,file): x = t.asnumpy() + print('x=',x) x.tofile(file) def saveTensorToHalf(t,file): @@ -461,8 +630,8 @@ def mha_x1_create(): softmax_compute_type=s_compute_type, app=app ) - q = model.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size k = model.dense2.weight.asnumpy()#.transpose() + q = model.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size v = model.dense3.weight.asnumpy()#.transpose() w = np.concatenate((q, k, v)) # 3xhid_size x hid_size w = w.transpose() # hid_size x 3xhid_size @@ -729,8 +898,7 @@ def mha_T5_create(): compute_dtype=compute_type, param_init_type=w_compute_type, softmax_compute_type=s_compute_type, - has_bias=False, - app=app + has_bias=False ) print('compute_type',compute_type) q = model.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size -- Gitee From ba7f1b51a756c08621ce1df380e488206e15dd5c Mon Sep 17 00:00:00 2001 From: shira zaloshinki Date: Thu, 29 Dec 2022 12:35:19 +0200 Subject: [PATCH 06/39] add decoder --- .../cpu/kernel/nnacl/infer/infer_register.c | 4 +- mindspore/core/ops/encoder_layer.cc | 11 - mindspore/core/ops/encoder_layer.h | 18 - mindspore/lite/schema/ops.fbs | 9 +- mindspore/lite/src/common/ops/ops_def.cc | 8 +- .../ops/populate/encoder_layer_populate.cc | 6 - .../lite/tools/converter/anf_transform.cc | 2 +- .../optimizer/fusion/decoder_layer_fusion.cc | 16 +- .../optimizer/fusion/encoder_layer_fusion.cc | 61 +- .../optimizer/fusion/encoder_layer_fusion.h | 1 - .../fusion/multi_head_attention_fusion.cc | 19 - trc/transformer/MultiHeadTester.py | 532 ++++++++++++++++-- trc/transformer/convert_fp32.sh | 2 +- trc/transformer/deploy.sh | 10 - trc/transformer/ftBench.py | 66 +-- trc/transformer/models.txt | 14 +- trc/transformer/train_transformer_export.py | 130 +++-- 17 files changed, 620 insertions(+), 289 deletions(-) diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/infer_register.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/infer_register.c index e847a66ef3e..530f5825b12 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/infer_register.c +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/infer_register.c @@ -43,10 +43,8 @@ #include "nnacl/infer/common_infer.h" #include "nnacl/infer/concat_infer.h" #include "nnacl/infer/constant_of_shape_infer.h" -<<<<<<< HEAD #include "nnacl/infer/decoder_layer_infer.h" -======= ->>>>>>> origin/bert + #ifdef MSLITE_ENABLE_CONTROLFLOW #include "nnacl/infer/control/tensor_array_infer.h" #include "nnacl/infer/control/tensor_array_read_infer.h" diff --git a/mindspore/core/ops/encoder_layer.cc b/mindspore/core/ops/encoder_layer.cc index 16cd9fa7fa1..19f40d706e7 100644 --- 
a/mindspore/core/ops/encoder_layer.cc +++ b/mindspore/core/ops/encoder_layer.cc @@ -57,10 +57,6 @@ int64_t EncoderLayer::get_head_size() const { return GetValue(value_ptr); } -<<<<<<< HEAD - -======= ->>>>>>> origin/bert bool EncoderLayer::get_post_layernorm() const { auto value_ptr = this->GetAttr(kEncoderLayerPostLayernorm); return GetValue(value_ptr); @@ -81,15 +77,8 @@ bool EncoderLayer::get_position_bias() const { auto value_ptr = this->GetAttr(kPositionBias); return GetValue(value_ptr); } - -<<<<<<< HEAD - -void EncoderLayer::Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, int64_t ffn_hidden_size, - bool position_bias, bool post_layernorm = false) { -======= void EncoderLayer::Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, int64_t ffn_hidden_size, bool position_bias, bool post_layernorm = false) { ->>>>>>> origin/bert this->set_head_num(head_num); this->set_head_size(head_size); this->set_post_layernorm(post_layernorm); diff --git a/mindspore/core/ops/encoder_layer.h b/mindspore/core/ops/encoder_layer.h index 5150d9d947c..728d02a3576 100644 --- a/mindspore/core/ops/encoder_layer.h +++ b/mindspore/core/ops/encoder_layer.h @@ -45,23 +45,6 @@ class MIND_API EncoderLayer : public BaseOperator { /// \param[in] position_bias Define ffn position_bias. void Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, int64_t ffn_hidden_size, bool position_bias, bool post_layernorm); -<<<<<<< HEAD - void set_head_num(int64_t head_num); - void set_head_size(int64_t head_size); - void set_post_layernorm(bool post_layernorm); - void set_eps_layernorm1(float eps_layernorm1); - void set_eps_layernorm2(float eps_layernorm2); - void set_ffn_hidden_size(int64_t ffn_hidden_size); - void set_position_bias(bool position_bias); - int64_t get_head_num() const; - int64_t get_head_size() const; - bool get_post_layernorm() const; - float get_eps_layernorm1() const; - float get_eps_layernorm2() const; - int64_t get_ffn_hidden_size() const; - bool get_position_bias() const; - }; -======= void set_head_num(int64_t head_num); void set_head_size(int64_t head_size); void set_post_layernorm(bool post_layernorm); @@ -77,7 +60,6 @@ class MIND_API EncoderLayer : public BaseOperator { int64_t get_ffn_hidden_size() const; bool get_position_bias() const; }; ->>>>>>> origin/bert } // namespace ops } // namespace mindspore #endif // MINDSPORE_CORE_OPS_ENCODER_LAYER_H_ diff --git a/mindspore/lite/schema/ops.fbs b/mindspore/lite/schema/ops.fbs index 14f6161fd7e..c88aef72469 100644 --- a/mindspore/lite/schema/ops.fbs +++ b/mindspore/lite/schema/ops.fbs @@ -396,6 +396,7 @@ table Attention { head_num: long; head_size: long; cross: bool; + position_bias: bool; } table Conv2DBackpropFilterFusion { @@ -1304,18 +1305,12 @@ table EncoderLayer { head_num: long; head_size: long; post_layernorm: bool; -<<<<<<< HEAD eps_layernorm1: float; eps_layernorm2: float; ffn_hidden_size: long; position_bias: bool; -======= - eps_layernorm1: bool; - eps_layernorm2: bool; - ffn_hidden_size: bool; ->>>>>>> origin/bert } - + table DecoderLayer { head_num: long; head_size: long; diff --git a/mindspore/lite/src/common/ops/ops_def.cc b/mindspore/lite/src/common/ops/ops_def.cc index ad7cbf4923b..a1f9edb4bd4 100644 --- a/mindspore/lite/src/common/ops/ops_def.cc +++ b/mindspore/lite/src/common/ops/ops_def.cc @@ -1308,7 +1308,6 @@ OP_SCHEMA_DEF(EncoderLayer) OP_ATTR(head_num, long) OP_ATTR(head_size, long); OP_ATTR(post_layernorm, bool) -<<<<<<< HEAD 
OP_ATTR(eps_layernorm1, float) OP_ATTR(eps_layernorm2, float) OP_ATTR(ffn_hidden_size, long) @@ -1325,9 +1324,4 @@ OP_ATTR(ffn_hidden_size, long) OP_ATTR(position_bias1, bool) OP_ATTR(position_bias2, bool) OP_SCHEMA_DEF_END(DecoderLayer) -======= -OP_ATTR(eps_layernorm1, bool) -OP_ATTR(eps_layernorm2, bool) -OP_ATTR(ffn_hidden_size, bool) -OP_SCHEMA_DEF_END(EncoderLayer) ->>>>>>> origin/bert + diff --git a/mindspore/lite/src/common/ops/populate/encoder_layer_populate.cc b/mindspore/lite/src/common/ops/populate/encoder_layer_populate.cc index d316f1a063d..12ecd511c3a 100644 --- a/mindspore/lite/src/common/ops/populate/encoder_layer_populate.cc +++ b/mindspore/lite/src/common/ops/populate/encoder_layer_populate.cc @@ -34,16 +34,10 @@ OpParameter *PopulateEncoderLayerParameter(const void *prim) { param->op_parameter_.type_ = primitive->value_type(); param->head_num_ = value->head_num(); param->head_size_ = value->head_size(); -<<<<<<< HEAD param->post_layernorm_ = value->post_layernorm(); param->eps_layernorm1_ = value->eps_layernorm1(); param->eps_layernorm2_ = value->eps_layernorm2(); param->position_bias_ = value->position_bias(); - - - -======= ->>>>>>> origin/bert return reinterpret_cast(param); } diff --git a/mindspore/lite/tools/converter/anf_transform.cc b/mindspore/lite/tools/converter/anf_transform.cc index 61bc326bf31..c8868ff6de3 100644 --- a/mindspore/lite/tools/converter/anf_transform.cc +++ b/mindspore/lite/tools/converter/anf_transform.cc @@ -322,7 +322,7 @@ int AnfTransform::RunFusionPass(const FuncGraphPtr &old_graph, const std::shared std::make_shared(), std::make_shared(), std::make_shared()}; -#ifdef ENABLE_CLOUD_FUSION_TRANSFORMER_INFERENCE +#ifdef ENABLE_CLOUD_FUSION_INFERENCE//ENABLE_CLOUD_FUSION_TRANSFORMER_INFERENCE fusions.push_back(std::make_shared()); fusions.push_back(std::make_shared()); fusions.push_back(std::make_shared()); diff --git a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc index 51ace2bf1cc..9b066d80489 100644 --- a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc @@ -560,13 +560,13 @@ STATUS DecoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const Eq int head_size = 0; float eps1 = 1e-6; float eps2 = 1e-6; - float eps3 = 1e-6; + // float eps3 = 1e-6; bool is_position_bias1 = false; bool is_position_bias2 = false; - if (CheckPattern(func_graph, equiv, &head_num, &head_size, &eps1, &eps2, &eps3, &is_position_bias1, - &is_position_bias2)) { - return nullptr; - } + // if (CheckPattern(func_graph, equiv, &head_num, &head_size, &eps1, &eps2, &eps3, &is_position_bias1, + // &is_position_bias2)) { + // return nullptr; + // } //add eps3 decoder_layer_prim->Init(head_num, head_size, eps1, eps2, ffn_hidden_size, is_position_bias1, is_position_bias2, post_layernorm); @@ -673,13 +673,7 @@ STATUS DecoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const Eq new_node->set_abstract(old_node->abstract()->Clone()); std::cout << "new_node" << std::endl; new_node->set_fullname_with_scope(node->fullname_with_scope() + "/decoder_layer"); - // std::cout << new_node->DebugString << std::endl; std::cout << new_node->ToString() << std::endl; - // auto get_item_node = CreateOutputGetItem(func_graph, new_node, 0); - // if (get_item_node == nullptr) { - // MS_LOG(ERROR) << "create decoder_layer output get_item node failed"; - // return nullptr; - // } std::cout << "new_node" << std::endl; return 
new_node; } diff --git a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc index 2a4196576fc..c73d1da3d7f 100644 --- a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc @@ -187,10 +187,6 @@ VectorRef EncoderLayerFusion::DefinePatternEncoderLayer(bool post_layernorm = tr reshape2 = VectorRef({is_reshape2, add, var2}); matmul1 = VectorRef({is_matmul1, tuple2, weight_m_, bias_m_}); } -<<<<<<< HEAD - -======= ->>>>>>> origin/bert auto act = VectorRef({is_act_, matmul1}); auto is_matmul2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimMatMulFusion), "is_matmul2"); MS_CHECK_TRUE_RET(is_matmul2 != nullptr, {}); @@ -257,59 +253,16 @@ AnfNodePtr EncoderLayerFusion::Process(const std::string &pattern_name, const mi return nullptr; } -<<<<<<< HEAD - -bool IsActGELU(const FuncGraphPtr &func_graph,const EquivPtr &equiv, const VarPtr &input_prim) { - if ((*equiv)[input_prim] == nullptr || !utils::isa((*equiv)[input_prim])) { - printf("is_attention_ is not AnfNodePtr"); - return RET_ERROR; - } - AnfNodePtr node = utils::cast((*equiv)[input_prim]); - MS_ASSERT(node != nullptr); - if (node == nullptr || !utils::isa(node)) { - auto manager = func_graph->manager(); - if (manager == nullptr) { - return RET_ERROR; - } - auto users = manager->node_users(); - auto it = users.find(node); - if (it != users.end()) { - node = it->second.front().first; - } - if (node == nullptr || !utils::isa(node)) { - return RET_ERROR; - } - } - MS_ASSERT(equiv != nullptr && input_prim); - auto act_node = utils::cast(node); - MS_ASSERT(act_node != nullptr); - auto act_input = act_node->input(0); -======= bool EncoderLayerFusion::IsActGELU(const FuncGraphPtr &func_graph, const EquivPtr &equiv, const VarPtr &input_prim) const { auto act_input = GetAttribute(func_graph, equiv, is_act_); ->>>>>>> origin/bert MS_ASSERT(act_input != nullptr); auto act_primitive = ops::GetOperator(act_input); MS_CHECK_TRUE_RET(act_primitive != nullptr, false); auto act_primitive_c = act_primitive->GetPrim(); -<<<<<<< HEAD - if (act_primitive_c->GetAttr(ops::kActivationType) == nullptr || act_primitive->get_activation_type() != mindspore::GELU) { - return false; - } - return true; -} -std::shared_ptr EncoderLayerFusion::BuildEncoderLayerFusionPrim(const EquivPtr &equiv) const { - MS_ASSERT(equiv != nullptr); - auto enoder_layer_prim = std::make_shared(); - if (enoder_layer_prim == nullptr) { - MS_LOG(ERROR) << "Build enoder_layer primitive failed."; - return enoder_layer_prim; -======= if (act_primitive_c->GetAttr(ops::kActivationType) == nullptr || act_primitive->get_activation_type() != mindspore::GELU) { return false; ->>>>>>> origin/bert } return true; } @@ -365,20 +318,8 @@ STATUS EncoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const Eq if (layrn2_prim->GetAttr(ops::kEpsilon) != nullptr) { *eps2 = layrn2_prim->get_epsilon(); } -<<<<<<< HEAD - if (!IsActGELU(func_graph,equiv, is_act_)) { - return false; - } - return RET_OK; -} -STATUS EncoderLayerFusion::RemoveRedundantInput(const FuncGraphPtr &func_graph, - const std::vector &redundant) const { - for (auto &node : redundant) { - func_graph->DropNode(node); -======= if (!IsActGELU(func_graph, equiv, is_act_)) { - return false; ->>>>>>> origin/bert + return RET_ERROR; } return RET_OK; } diff --git a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h index 
cea160349ea..4f05e809d31 100644 --- a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h +++ b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h @@ -65,7 +65,6 @@ class EncoderLayerFusion : public MultiplePatternProcessPass { protected: mutable VarPtr input_{nullptr}; mutable VarPtr position_bias_{nullptr}; - mutable VarPtr is_act_{nullptr}; mutable VarPtr beta1_{nullptr}; mutable VarPtr gamma1_{nullptr}; mutable VarPtr beta2_{nullptr}; diff --git a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc index 5f58ebd573c..dce7a71632c 100644 --- a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc @@ -883,26 +883,7 @@ CNodePtr MultiHeadAttentionFusion::CreateMaskedMultiHeadAttentionNode(const Func AnfNodePtr vnode; auto it_vnode = (*equiv).find(v_transpose_); if (it_vnode != (*equiv).end() && !t5_x_) vnode = utils::cast(it_vnode->second); -<<<<<<< HEAD - if (mask) { - input_mask = utils::cast((*equiv)[mask_]); - } - ShapeVector inputq_shape, inputv_shape; - auto ret = FetchShapeFromAbstract(input_q->abstract(), &inputq_shape); - MS_CHECK_TRUE_RET(ret == RET_OK, nullptr); - ret = FetchShapeFromAbstract(input_v->abstract(), &inputv_shape); - MS_CHECK_TRUE_RET(ret == RET_OK, nullptr); - // test for cross - std::cout << "match_count_" << match_count_ << std::endl; - std::cout << "input_q != input_v" << (input_q != input_v) << std::endl; - if ((inputq_shape != inputv_shape) || (input_q != input_v)){ - std::cout << "cross = true"<< std::endl; - cross = true; - } - if (!cross && !t5_x_) { -======= if (!cross && !t5_x_) { ->>>>>>> origin/bert redundant.push_back(bias_q); } tensor::TensorPtr c_weights, q_weight_t; diff --git a/trc/transformer/MultiHeadTester.py b/trc/transformer/MultiHeadTester.py index bfc72ef3123..d04f3bfa842 100644 --- a/trc/transformer/MultiHeadTester.py +++ b/trc/transformer/MultiHeadTester.py @@ -24,7 +24,8 @@ __all__ = [ "MultiHeadAttentionX", "FeedForwardX", "TransformerEncoderLayerX", - "_LayerNormX" + "_LayerNormX", + "TransformerDecoderLayerX" ] @@ -44,7 +45,7 @@ class _LayerNormX(Cell): Tensor of shape :math:`(batch, seq_length, hidden_size)`. 
""" - def __init__(self, normalized_shape, eps=1e-4, param_init_type=mstype.float32, is_self_defined=True): + def __init__(self, normalized_shape, eps=1e-4, param_init_type=mstype.float32, is_self_defined=False): super(_LayerNormX, self).__init__() if param_init_type not in [mstype.float32, mstype.float16]: raise TypeError("The type of parameter 'param_init_type' should in [float32, float16], " @@ -540,39 +541,39 @@ class MultiHeadAttentionX(Cell): # # key and value for current token(s) key_present = key value_present = value - # if self.use_past: - # # The first graph with the input size of (bs, seq_length) - # if self.is_first_iteration: - # # Get the valid input length without padding - # valid_length_vector = F.cast(self.less(self.range, batch_valid_length.view(-1, 1, 1)), self.dtype) - # # Cover the key and value numbers corresponding to the padding position - # key_present = self.mul1(key, self.expand_dims(valid_length_vector, 2)) - # value_present = self.mul1(value, self.expand_dims(valid_length_vector, 3)) - # # The second graph with the inpus size of (bs, 1) - # # the shape of query is (bs, num_heads, 1, size_per_head) - # # the shape of key is (bs, num_heads, size_per_head, 1) - # # the shape of value is (bs, num_heads, 1, size_per_head) - # else: - # # Get the current token position index - # valid_length = self.reducesum(F.cast(self.not_equal(self.slice(key_past, (0, 0, 0, 0), - # (F.shape(key_tensor)[0], 1, 1, - # self.src_seq_length), - # (1, 1, 1, 1)), - # 0), mstype.float32), (1, 2, 3)) - # valid_length = F.reshape(valid_length, (-1, 1, 1)) - # valid_length_vector = F.cast(self.equal(valid_length, self.range), self.dtype) - # # Pad the key and value to seq_length with only the position index not zero - # current_key = self.mul1(self.tile(key, (1, 1, 1, self.seq_length)), - # self.expand_dims(valid_length_vector, 2)) - # current_value = self.mul1(self.tile(value, (1, 1, self.seq_length, 1)), - # self.expand_dims(valid_length_vector, 3)) - # # Concat the previous saved state and current state - # key = self.add(key_past, current_key) - # value = self.add(value_past, current_value) - # # Update key_present and value_present for state update - # key_present = key - # value_present = value - # attention_mask = F.reshape(self.attention_mask, (self.seq_length, self.seq_length, 1, 1)) + if self.use_past: + # The first graph with the input size of (bs, seq_length) + if self.is_first_iteration: + # Get the valid input length without padding + valid_length_vector = F.cast(self.less(self.range, batch_valid_length.view(-1, 1, 1)), self.dtype) + # Cover the key and value numbers corresponding to the padding position + key_present = self.mul1(key, self.expand_dims(valid_length_vector, 2)) + value_present = self.mul1(value, self.expand_dims(valid_length_vector, 3)) + # The second graph with the inpus size of (bs, 1) + # the shape of query is (bs, num_heads, 1, size_per_head) + # the shape of key is (bs, num_heads, size_per_head, 1) + # the shape of value is (bs, num_heads, 1, size_per_head) + else: + # Get the current token position index + valid_length = self.reducesum(F.cast(self.not_equal(self.slice(key_past, (0, 0, 0, 0), + (F.shape(key_tensor)[0], 1, 1, + self.src_seq_length), + (1, 1, 1, 1)), + 0), mstype.float32), (1, 2, 3)) + valid_length = F.reshape(valid_length, (-1, 1, 1)) + valid_length_vector = F.cast(self.equal(valid_length, self.range), self.dtype) + # Pad the key and value to seq_length with only the position index not zero + current_key = self.mul1(self.tile(key, (1, 1, 1, 
self.seq_length)), + self.expand_dims(valid_length_vector, 2)) + current_value = self.mul1(self.tile(value, (1, 1, self.seq_length, 1)), + self.expand_dims(valid_length_vector, 3)) + # Concat the previous saved state and current state + key = self.add(key_past, current_key) + value = self.add(value_past, current_value) + # Update key_present and value_present for state update + key_present = key + value_present = value + attention_mask = F.reshape(self.attention_mask, (self.seq_length, self.seq_length, 1, 1)) layer_present = (key_present, value_present) # # multi head attention considering attention mask @@ -1309,6 +1310,7 @@ class TransformerEncoderLayerX(Cell): else: input_x = self.layernorm1(x) input_x = F.cast(input_x, self.dtype) + # indicate whether reset saved states key_reset = None value_reset = None @@ -1415,3 +1417,463 @@ class TransformerEncoderLayerX(Cell): _check_input_dtype(F.dtype(batch_valid_length), "batch_valid_length", [mstype.int32], self.cls_name) return True +class TransformerDecoderLayerX(Cell): + r""" + Transformer Decoder Layer. This is an implementation of the single layer of the transformer + decoder layer, including self-attention, cross attention and feedward layer. When the encoder_output is None, + the cross attention will not be effective. + + Args: + hidden_size(int): The hidden size of the input. + ffn_hidden_size(int): The hidden size of bottleneck in the feedforward layer. + num_heads(int): The number of the heads. + batch_size(int): The batch size of the input tensor. + src_seq_length(int): The input source sequence length. + tgt_seq_length(int): The input target sequence length. + attention_dropout_rate(float): The dropout rate of the attention scores. Default:0.1. + hidden_dropout_rate(float): The dropout rate of the final output of the layer. Default:0.1. + post_layernorm_residual(bool): Do residuals adds before the layernorm. Default False. + use_past(bool): Use the past state to compute, used for incremental prediction. Default False. + layernorm_compute_type(dtype.Number): The computation type of the layernorm. + Should be dtype.float32 or dtype.float16. Default dtype.float32. + softmax_compute_type(dtype.Number): The computation type of the softmax in the attention. + Should be dtype.float32 or dtype.float16. Default mstype.float32. + param_init_type(dtype.Number): The parameter initialization type of the module. + Should be dtype.float32 or dtype.float16. Default dtype.float32. + hidden_act(str): The activation of the internal feedforward layer. Supports 'relu', + 'relu6', 'tanh', 'gelu', 'fast_gelu', 'elu', 'sigmoid', 'prelu', 'leakyrelu', 'hswish', + 'hsigmoid', 'logsigmoid' and so on. Default: gelu. + moe_config(MoEConfig): The configuration of MoE (Mixture of Expert). Default is an instance of MoEConfig + with default values. Please see `MoEConfig`. + parallel_config(OpParallelConfig, MoEParallelConfig): The parallel configure. When MoE is applied, + MoEParallelConfig is effective, otherwise OpParallelConfig is effective. Default `default_dpmp_config`, + an instance of `OpParallelConfig` with default args. + + Inputs: + - **hidden_stats** (Tensor) - The input tensor with shape [batch_size, tgt_seq_length, hidden_size] or + [batch_size * tgt_seq_length, hidden_size]. + - **decoder_mask** (Tensor) - The attention mask for decoder with shape [batch_size, src_seq_length, + seq_length]. + - **encoder_output** (Tensor) - The output of the encoder with shape [batch_size, seq_length, hidden_size] + or [batch_size * seq_length, hidden_size]. 
+ Note this args can not be passed by None when the net is in outermost layer. Default None. + - **memory_mask** (Tensor) - The memory mask of the cross attention with shape [batch, tgt_seq_length, + src_seq_length] where tgt_seq_length is the length of the decoder. Note this args can not be passed by + None when the net is in outermost layer. Default None. + - **init_reset** (Tensor) - A bool tensor with shape [1], used to clear the past key parameter and + past value parameter used in the incremental prediction. Only valid when use_past is True. Default True. + - **batch_valid_length** (Tensor) - Int32 tensor with shape [batch_size] the past calculated the index. + Used for incremental prediction when the use_past is True. Default None. + + Outputs: + Tuple, a tuple contains(`output`, `layer_present`) + + - **output** (Tensor) - The output logit of this layer. The shape is [batch, seq_length, hidden_size] or + [batch * seq_length, hidden_size]. + - **layer_present** (Tuple) - A tuple, where each tuple is the tensor of the projected key and value + vector in self attention with shape ((batch_size, num_heads, size_per_head, tgt_seq_length), + (batch_size, num_heads, tgt_seq_length, size_per_head), and of the projected key and value vector + in cross attention with shape (batch_size, num_heads, size_per_head, src_seq_length), + (batch_size, num_heads, src_seq_length, size_per_head)). + + Supported Platforms: + ``Ascend`` ``GPU`` + + Examples: + >>> import numpy as np + >>> from mindspore import dtype as mstype + >>> from mindspore.nn.transformer import TransformerDecoderLayer + >>> from mindspore import Tensor + >>> model = TransformerDecoderLayer(batch_size=2, hidden_size=64, ffn_hidden_size=64, num_heads=2, + ... src_seq_length=20, tgt_seq_length=10) + >>> encoder_input_value = Tensor(np.ones((2, 20, 64)), mstype.float32) + >>> decoder_input_value = Tensor(np.ones((2, 10, 64)), mstype.float32) + >>> decoder_input_mask = Tensor(np.ones((2, 10, 10)), mstype.float16) + >>> memory_mask = Tensor(np.ones((2, 10, 20)), mstype.float16) + >>> output, past = model(decoder_input_value, decoder_input_mask, encoder_input_value, memory_mask) + >>> print(output.shape) + (2, 10, 64) + >>> print(past[0].shape) + (2, 2, 32, 10) + >>> print(past[1].shape) + (2, 2, 10, 32) + >>> print(past[2].shape) + (2, 2, 32, 20) + >>> print(past[3].shape) + (2, 2, 20, 32) + """ + # @_LogActionOnce(logger=logger, key='TransformerDecoderLayer', + # no_warning=_get_parallel_mode() in (ParallelMode.STAND_ALONE,)) + @_args_type_validator_check(batch_size=Validator.check_positive_int, + hidden_size=Validator.check_positive_int, + num_heads=Validator.check_positive_int, + ffn_hidden_size=Validator.check_positive_int, + src_seq_length=Validator.check_positive_int, + tgt_seq_length=Validator.check_positive_int, + attention_dropout_rate=Validator.check_non_negative_float, + hidden_dropout_rate=Validator.check_non_negative_float, + hidden_act=_valid_type_checks([str], "TransformerDecoderLayer"), + post_layernorm_residual=Validator.check_bool, + layernorm_compute_type=_valid_value_checks([mstype.float32, mstype.float16], + "TransformerDecoderLayer"), + softmax_compute_type=_valid_value_checks([mstype.float32, mstype.float16], + "TransformerDecoderLayer"), + param_init_type=_valid_value_checks([mstype.float32, mstype.float16], + "TransformerDecoderLayer"), + parallel_config=_valid_type_checks([OpParallelConfig, MoEParallelConfig], + "TransformerDecoderLayer"), + use_past=Validator.check_bool) + def __init__(self, hidden_size, + 
ffn_hidden_size, + num_heads, + batch_size, + src_seq_length, + tgt_seq_length, + attention_dropout_rate=0.1, + hidden_dropout_rate=0.1, + post_layernorm_residual=False, + use_past=False, + layernorm_compute_type=mstype.float32, + softmax_compute_type=mstype.float32, + param_init_type=mstype.float32, + hidden_act='gelu', + moe_config=default_moe_config, + parallel_config=default_dpmp_config): + super(TransformerDecoderLayerX, self).__init__() + _check_moe_config(moe_config, parallel_config) + self.use_moe = (moe_config.expert_num > 1) + config_to_attention = parallel_config.dpmp if self.use_moe else parallel_config + if _get_parallel_mode() in (ParallelMode.AUTO_PARALLEL,) and _is_sharding_propagation(): + _check_config(parallel_config) + if num_heads % parallel_config.model_parallel != 0: + raise ValueError("For 'TransformerDecoderLayer', the class variable 'num_heads' must be divisibled by " + "'parallel_config.model_parallel', but got the num_heads is {} and " + "parallel_config.model_parallel is {}.".format(num_heads, + parallel_config.model_parallel)) + if hidden_size % parallel_config.model_parallel != 0: + raise ValueError( + "For 'TransformerDecoderLayer', the class variable 'hidden_size' must be divisibled by " + "'parallel_config.model_parallel', but got the hidden_size is {} and " + "parallel_config.model_parallel is {}." + .format(hidden_size, parallel_config.model_parallel)) + if ffn_hidden_size % parallel_config.model_parallel != 0: + raise ValueError("For 'TransformerDecoderLayer', the class variable 'ffn_hidden_size' must be " + "divisibled by 'parallel_config.model_parallel', but got the ffn_hidden_size is {} " + "and parallel_config.model_parallel is {}." + .format(ffn_hidden_size, parallel_config.model_parallel)) + if use_past: + raise ValueError(f"The {self.cls_name} does not support use_past=True.") + self.batch_size = batch_size + self.use_past = use_past + self.softmax_compute_type = softmax_compute_type + + self.src_seq_length = src_seq_length + self.tgt_seq_length = tgt_seq_length + self.use_past = use_past + self.hidden_size = hidden_size + + self.layernorm1 = _LayerNormX((hidden_size,)).to_float(layernorm_compute_type) + self.layernorm2 = _LayerNormX((hidden_size,)).to_float(layernorm_compute_type) + self.attention = MultiHeadAttentionX(hidden_size=hidden_size, + num_heads=num_heads, + batch_size=batch_size, + src_seq_length=tgt_seq_length, + tgt_seq_length=tgt_seq_length, + hidden_dropout_rate=hidden_dropout_rate, + attention_dropout_rate=attention_dropout_rate, + use_past=use_past, + softmax_compute_type=softmax_compute_type, + param_init_type=param_init_type, + parallel_config=config_to_attention) + + # Cross attention with the output of encoder as memory tensor + self.cross_attention = MultiHeadAttentionX(hidden_size=hidden_size, + num_heads=num_heads, + batch_size=batch_size, + src_seq_length=tgt_seq_length, + tgt_seq_length=src_seq_length, + hidden_dropout_rate=hidden_dropout_rate, + attention_dropout_rate=attention_dropout_rate, + softmax_compute_type=softmax_compute_type, + use_past=use_past, + param_init_type=param_init_type, + parallel_config=config_to_attention) + self.cross_attention_layernorm = _LayerNormX((hidden_size,)).to_float( + layernorm_compute_type) + + if self.use_moe: + self.output = MoE(hidden_size=hidden_size, + dropout_rate=hidden_dropout_rate, + ffn_hidden_size=ffn_hidden_size, + param_init_type=param_init_type, + hidden_act=hidden_act, + moe_config=moe_config, + parallel_config=parallel_config) + else: + # Feed Forward Network, FFN + 
self.output = FeedForwardX(hidden_size=hidden_size, + dropout_rate=hidden_dropout_rate, + ffn_hidden_size=ffn_hidden_size, + hidden_act=hidden_act, + param_init_type=param_init_type, + parallel_config=parallel_config) + self.post_layernorm_residual = post_layernorm_residual + self.add = P.Add() + self.add_3d = P.Add() + self.dtype = mstype.float16 + self.key_past = None + self.value_past = None + if self.use_past: + # operator used for state reuse + self.reducesum = P.ReduceSum().shard(((1, 1, 1, 1),)) + self.not_equal = P.NotEqual().shard(((1, 1, 1, 1), ())) + self.slice = P.StridedSlice().shard(((1, 1, 1, 1),)) + size_per_head = hidden_size // num_heads + self.key_shape = (batch_size, num_heads, size_per_head, tgt_seq_length) + self.value_shape = (batch_size, num_heads, tgt_seq_length, size_per_head) + # parameters saving key and value states + self.key_past = Parameter(Tensor(np.zeros(shape=self.key_shape), self.dtype), name="key_past") + self.value_past = Parameter(Tensor(np.zeros(shape=self.value_shape), self.dtype), name="value_past") + self.tile = P.Tile().shard(((1, 1),)) + self.mul = P.Mul().shard(((1, 1, 1, 1), (1,))) + self.assign = P.Assign().shard(((1, 1, 1, 1), (1, 1, 1, 1))) + elif _get_parallel_mode() not in (ParallelMode.AUTO_PARALLEL,): + _check_config(parallel_config) + if num_heads % parallel_config.model_parallel != 0: + raise ValueError("For 'TransformerDecoderLayer', the class variable 'num_heads' must be divisibled by " + "'parallel_config.model_parallel', but got the num_heads is {} and " + "parallel_config.model_parallel is {}.".format(num_heads, + parallel_config.model_parallel)) + if hidden_size % parallel_config.model_parallel != 0: + raise ValueError( + "For 'TransformerDecoderLayer', the class variable 'hidden_size' must be divisibled by " + "'parallel_config.model_parallel', but got the hidden_size is {} and " + "parallel_config.model_parallel is {}." + .format(hidden_size, parallel_config.model_parallel)) + if ffn_hidden_size % parallel_config.model_parallel != 0: + raise ValueError("For 'TransformerDecoderLayer', the class variable 'ffn_hidden_size' must be " + "divisibled by 'parallel_config.model_parallel', but got the ffn_hidden_size is {} " + "and parallel_config.model_parallel is {}." 
+ .format(ffn_hidden_size, parallel_config.model_parallel)) + if use_past: + raise ValueError(f"The {self.cls_name} does not support use_past=True.") + self.batch_size = batch_size + self.use_past = use_past + self.softmax_compute_type = softmax_compute_type + + self.src_seq_length = src_seq_length + self.tgt_seq_length = tgt_seq_length + self.use_past = use_past + self.hidden_size = hidden_size + + self.layernorm1 = _LayerNormX((hidden_size,)).to_float(layernorm_compute_type) + self.layernorm1.shard(((parallel_config.data_parallel, 1),)) + self.layernorm2 = _LayerNormX((hidden_size,)).to_float(layernorm_compute_type) + self.layernorm2.shard(((parallel_config.data_parallel, 1),)) + self.attention = MultiHeadAttentionX(hidden_size=hidden_size, + num_heads=num_heads, + batch_size=batch_size, + src_seq_length=tgt_seq_length, + tgt_seq_length=tgt_seq_length, + hidden_dropout_rate=hidden_dropout_rate, + attention_dropout_rate=attention_dropout_rate, + use_past=use_past, + softmax_compute_type=softmax_compute_type, + param_init_type=param_init_type, + parallel_config=config_to_attention) + + # Cross attention with the output of encoder as memory tensor + self.cross_attention = MultiHeadAttentionX(hidden_size=hidden_size, + num_heads=num_heads, + batch_size=batch_size, + src_seq_length=tgt_seq_length, + tgt_seq_length=src_seq_length, + hidden_dropout_rate=hidden_dropout_rate, + attention_dropout_rate=attention_dropout_rate, + softmax_compute_type=softmax_compute_type, + use_past=use_past, + param_init_type=param_init_type, + parallel_config=config_to_attention) + self.cross_attention_layernorm = _LayerNormX((hidden_size,)).to_float( + layernorm_compute_type) + self.cross_attention_layernorm.shard(((parallel_config.data_parallel, 1),)) + + if self.use_moe: + self.output = MoE(hidden_size=hidden_size, + dropout_rate=hidden_dropout_rate, + ffn_hidden_size=ffn_hidden_size, + param_init_type=param_init_type, + hidden_act=hidden_act, + moe_config=moe_config, + parallel_config=parallel_config) + else: + # Feed Forward Network, FFN + self.output = FeedForwardX(hidden_size=hidden_size, + dropout_rate=hidden_dropout_rate, + ffn_hidden_size=ffn_hidden_size, + hidden_act=hidden_act, + param_init_type=param_init_type, + parallel_config=parallel_config) + self.post_layernorm_residual = post_layernorm_residual + self.add = P.Add().shard(((parallel_config.data_parallel, 1), (parallel_config.data_parallel, 1))) + self.add_3d = P.Add().shard(((parallel_config.data_parallel, 1, 1), (parallel_config.data_parallel, 1, 1))) + self.dtype = mstype.float16 + self.key_past = None + self.value_past = None + if self.use_past: + # operator used for state reuse + self.reducesum = P.ReduceSum().shard(((1, 1, 1, 1),)) + self.not_equal = P.NotEqual().shard(((1, 1, 1, 1), ())) + self.slice = P.StridedSlice().shard(((1, 1, 1, 1),)) + size_per_head = hidden_size // num_heads + self.key_shape = (batch_size, num_heads, size_per_head, tgt_seq_length) + self.value_shape = (batch_size, num_heads, tgt_seq_length, size_per_head) + # parameters saving key and value states + self.key_past = Parameter(Tensor(np.zeros(shape=self.key_shape), self.dtype), name="key_past") + self.value_past = Parameter(Tensor(np.zeros(shape=self.value_shape), self.dtype), name="value_past") + self.tile = P.Tile().shard(((1, 1),)) + self.mul = P.Mul().shard(((1, 1, 1, 1), (1,))) + self.assign = P.Assign().shard(((1, 1, 1, 1), (1, 1, 1, 1))) + else: + raise RuntimeError(f"The {self.cls_name} only support sharding propagation or " + f"semi-auto parallel mode now.") 
+ + def construct(self, hidden_stats, + decoder_mask, + encoder_output=None, + memory_mask=None, + init_reset=True, batch_valid_length=None): + self._check_input(hidden_stats, decoder_mask, encoder_output, memory_mask, init_reset, batch_valid_length) + # the returned shape is [bs, seq_length, embedding_size] or [bs * seq_length, embedding_size] + hidden_shape = F.shape(hidden_stats) + hidden_stats = F.reshape(hidden_stats, (-1, hidden_shape[-1])) + input_x = self.layernorm1(hidden_stats) + + input_x = F.cast(input_x, self.dtype) + # indicate whether reset saved states + key_reset = None + value_reset = None + if self.use_past: + # reset states, init_reset True for reuse and False for reset + key_reset = self.assign(self.key_past, self.mul(self.key_past, F.cast(init_reset, self.dtype))) + value_reset = self.assign(self.value_past, self.mul(self.value_past, F.cast(init_reset, self.dtype))) + # add dependency for desired execution order + input_x = F.depend(input_x, key_reset) + input_x = F.depend(input_x, value_reset) + + attention, layer_present = self.attention(input_x, input_x, input_x, decoder_mask, self.key_past, + self.value_past, batch_valid_length) + # For post-layernorm the inputs for residual path are output of self-attention and output of layernorm + if self.post_layernorm_residual: + x = self.add(input_x, attention) + # For pre-layernorm the inputs for residual path are output of self-attention and input of this layer + else: + x = self.add(hidden_stats, attention) + middle_output = None + cross_attn_output = None + if encoder_output is not None: + middle_output = self.cross_attention_layernorm(x) + + middle_output = F.cast(middle_output, self.dtype) + encoder_output = F.cast(encoder_output, self.dtype) + cross_attn_output, cross_layer_present = self.cross_attention(middle_output, encoder_output, + encoder_output, + memory_mask, self.key_past, + self.value_past, batch_valid_length) + layer_present += cross_layer_present + if self.post_layernorm_residual: + x = self.add(middle_output, cross_attn_output) + else: + x = self.add(x, cross_attn_output) + + output_x = self.layernorm2(x) + output_x = F.cast(output_x, self.dtype) + aux_loss = None + if self.use_moe: + mlp_logit, aux_loss = self.output(output_x) + else: + mlp_logit = self.output(output_x) + # return mlp_logit + + value_update = None + key_update = None + if self.use_past: + # current key and value + key_present, value_present = layer_present + # update key and value calculated this step + key_update = self.assign(self.key_past, key_present) + value_update = self.assign(self.value_past, value_present) + # add dependency for desired execution order + key_update = F.depend(key_update, key_reset) + value_update = F.depend(value_update, value_reset) + + # add dependency for desired execution order + mlp_logit = F.depend(mlp_logit, value_update) + mlp_logit = F.depend(mlp_logit, key_update) + + # if shape is 3d, we reshape the inputs of the add + if len(hidden_shape) == 3: + output_x = P.Reshape()(output_x, hidden_shape) + mlp_logit = P.Reshape()(mlp_logit, hidden_shape) + x = P.Reshape()(x, hidden_shape) + + if self.post_layernorm_residual: + output = self.add_3d(output_x, mlp_logit) + else: + output = self.add_3d(x, mlp_logit) + else: + if self.post_layernorm_residual: + output = self.add(output_x, mlp_logit) + else: + output = self.add(x, mlp_logit) + output = F.reshape(output, hidden_shape) + + if self.use_moe: + return output#, layer_present, aux_loss + return output#, layer_present + + def _check_input(self, hidden_states, 
attention_mask, encoder_output, memory_mask, init_reset, batch_valid_length): + r"""Check inputs""" + if not self.use_past or (self.use_past and self.is_first_iteration): + _check_shape_equal(F.shape(hidden_states), "hidden_states", self.cls_name, + [[self.batch_size, self.tgt_seq_length, self.hidden_size], + [self.batch_size * self.tgt_seq_length, self.hidden_size]]) + _check_shape_equal(F.shape(attention_mask), "attention_mask", self.cls_name, + [self.batch_size, self.tgt_seq_length, self.tgt_seq_length]) + + else: + _check_shape_equal(F.shape(hidden_states), "hidden_states", self.cls_name, + [self.batch_size, 1, self.hidden_size]) + _check_shape_equal(F.shape(attention_mask), "attention_mask", self.cls_name, + [self.batch_size, 1, self.tgt_seq_length]) + _check_input_dtype(F.dtype(hidden_states), "hidden_states", [mstype.float32, mstype.float16], self.cls_name) + _check_input_dtype(F.dtype(attention_mask), "attention_mask", [mstype.float32, mstype.float16], self.cls_name) + if encoder_output is not None: + _check_shape_equal(F.shape(encoder_output), "encoder_output", self.cls_name, + [[self.batch_size, self.src_seq_length, self.hidden_size], + [self.batch_size * self.src_seq_length, self.hidden_size]]) + _check_input_dtype(F.dtype(encoder_output), "encoder_output", + [mstype.float32, mstype.float16], self.cls_name) + if memory_mask is not None: + _check_shape_equal(F.shape(memory_mask), "memory_mask", self.cls_name, + [self.batch_size, self.tgt_seq_length, self.src_seq_length]) + _check_input_dtype(F.dtype(memory_mask), "memory_mask", + [mstype.float32, mstype.float16], self.cls_name) + + init_reset_is_tensor = isinstance(init_reset, Tensor) + init_reset_is_default = init_reset is True + batch_valid_length_is_tensor = isinstance(batch_valid_length, Tensor) + batch_is_default = batch_valid_length is None + _check_past_none_input_none(self.use_past, "init_reset", self.cls_name, True, init_reset_is_tensor, + init_reset_is_default) + _check_past_none_input_none(self.use_past, "batch_valid_length", self.cls_name, None, + batch_valid_length_is_tensor, batch_is_default) + + if self.use_past: + _check_shape_equal(F.shape(init_reset), "init_reset", self.cls_name, [1]) + _check_input_dtype(F.dtype(init_reset), "init_reset", [mstype.bool_], self.cls_name) + _check_shape_equal(F.shape(batch_valid_length), "batch_valid_length", self.cls_name, [self.batch_size]) + _check_input_dtype(F.dtype(batch_valid_length), "batch_valid_length", [mstype.int32], self.cls_name) + return True + + diff --git a/trc/transformer/convert_fp32.sh b/trc/transformer/convert_fp32.sh index 25faa9d449c..af5b5b1d851 100755 --- a/trc/transformer/convert_fp32.sh +++ b/trc/transformer/convert_fp32.sh @@ -2,7 +2,7 @@ base=`git rev-parse --show-toplevel` version=$(cat ${base}/version.txt) file_name=$(basename $1) file_name="${file_name%.*}" -# dbg="gdb --args " +#dbg="gdb --args " #GLOG_v=0 \ lib_base=${base}/trc/system_test/release/ubuntu_x86/mindspore-lite-${version}-linux-x64 diff --git a/trc/transformer/deploy.sh b/trc/transformer/deploy.sh index fa685ec084d..e08f9803b80 100755 --- a/trc/transformer/deploy.sh +++ b/trc/transformer/deploy.sh @@ -4,10 +4,7 @@ version=$(cat ${base}/version.txt) system=${base}/trc/system_test/release/ubuntu_x86/mindspore-lite-${version}-linux-x64 benchmark=${system}/tools/benchmark/benchmark server=caspi -<<<<<<< HEAD gpu_id=2 -======= -gpu_id=0 while getopts "c" opt ; do case "${opt}" in c) @@ -17,7 +14,6 @@ while getopts "c" opt ; do esac done shift $(($OPTIND - 1)) ->>>>>>> origin/bert # move 
files to caspi model=${1%.mindir} model=${model#convv_} @@ -35,14 +31,8 @@ then fi echo "batch_size=${batch_size}" echo "model_name=${model_name}" -<<<<<<< HEAD -echo "model=${model}" dir1=$(dirname $(realpath $1)) ssh ${server} "mkdir -p ${dir1}" -======= -dir=$(dirname $(realpath $1)) -ssh ${server} "mkdir -p ${dir}" ->>>>>>> origin/bert dir=$(dirname ${benchmark}) ssh ${server} "mkdir -p ${dir}" dir=${system}/runtime/lib diff --git a/trc/transformer/ftBench.py b/trc/transformer/ftBench.py index 73692e4b5e5..d1dfb4e4f37 100755 --- a/trc/transformer/ftBench.py +++ b/trc/transformer/ftBench.py @@ -98,35 +98,6 @@ for line_model_arg in models_arg: input_files='' output_file='' # os.system(f"./convert_fp32.sh {model_name}_fwd.mindir") -<<<<<<< HEAD - # find_output_name(f'convv_{model_name}_fwd_graph.ms', f'{model_name}_output.txt') - # if app=='ch': - # ret=0 - # if act == 'be': - # input_files=' '.join([work_dir+'/'+str(line)[4:-1] for line in subprocess.check_output(f"find . -iname '{model_name}_input*.{suffix}'", shell=True).splitlines()]) - # delim=" " - # input_files = delim.join(sorted(input_files.split(' '))) - # output_file=f'{work_dir}/{model_name}_output.txt' - # benchmark_args=f'--modelFile={work_dir}/convv_{model_name}_fwd.mindir --loopCount={loop_count} --modelType=MindIR --inDataFile="{input_files}" --benchmarkDataFile={output_file} --device=GPU --enableFp16={enable_fp16}' - # os.system(f"./convert_fp32.sh {model_name}_fwd_graph.mindir") - # find_output_name(f'convv_{model_name}_fwd_graph.ms', f'{model_name}_output.txt') - # if server == 'local': - # os.system('./trc/release.sh x86') - # os.system(f"cd {benchmark} && CUDA_VISIBLE_DEVICES={cuda_visible_dev} LD_LIBRARY_PATH={system}/runtime/lib:{system}/tools/converter/lib ./benchmark {benchmark_args}" ) - # else: - # with open(f'cfg_{model_name}.config','w') as f: - # if model_name == 'bert': - # f.write(f"[gpu_context]\ninput_shape=input_ids:[{batch_size},{seq}];token_type_ids:[{batch_size},{seq}];input_mask:[{batch_size},{seq}]") - # elif model_name == 'transformer_encoder_layer': - # f.write(f"[gpu_context]\ninput_shape=x:[{batch_size},{seq},{hidden_size}];input_mask:[{batch_size},{seq},{seq}]") - # os.system(f"ssh {server} 'rm -f {system}/../mindspore-lite-{version}-linux-x64.tar.gz {work_dir}/*{model_name}*'") - # os.system(f"ssh {server} 'mkdir -p {benchmark}'") - # os.system(f"rsync -v {system}/../mindspore-lite-{version}-linux-x64.tar.gz {server}:{system}/..") - # os.system(f"ssh {server} 'cd {system}/.. 
&& tar -xzf {system}/../mindspore-lite-{version}-linux-x64.tar.gz'") - # os.system(f"rsync -v {base}/trc/transformer/*{model_name}* {server}:{base}/trc/transformer/") - # os.system(f"./deploy.sh convv_{model_name}_fwd_graph.mindir") - # os.system(f"ssh {server} 'cd {benchmark} && CUDA_VISIBLE_DEVICES={cuda_visible_dev} LD_LIBRARY_PATH={system}/runtime/lib:{system}/tools/converter/lib ./benchmark {benchmark_args}'" ) -======= # find_output_name(f'convv_{model_name}_fwd.mindir', f'{model_name}_output.txt') if app=='ch': ret=0 @@ -155,24 +126,23 @@ for line_model_arg in models_arg: os.system(f"rsync -v {base}/trc/transformer/*{model_name}* {server}:{base}/trc/transformer/") os.system(f"./deploy.sh convv_{model_name}_fwd.mindir") #os.system(f"ssh {server} 'cd {benchmark} && CUDA_VISIBLE_DEVICES={cuda_visible_dev} LD_LIBRARY_PATH={system}/runtime/lib:{system}/tools/converter/lib ./benchmark {benchmark_args}'" ) ->>>>>>> origin/bert - # elif app=='trc': - # #if loop count =1 app=be else app = runtime - # line_model_arg=line_model_arg[:-1] - # line_model_arg=line_model_arg.split()[:-2] - # line_model_arg=" ".join(line_model_arg) - # # +' -L '+str(loop_count) - # if server=='local': - # print("run trc local") - # os.system(f"rsync -v {base}/trc/transformer/{model_name}* {base}/../FasterTransformer/build/bin" ) - # os.system(f"cd {base}/../FasterTransformer/build/bin && CUDA_VISIBLE_DEVICES={cuda_visible_dev} LD_LIBRARY_PATH={base}/../FasterTransformer:/usr/local/cuda-11.7/lib64 ./ms_benchmark {line_model_arg}" ) - # else: - # print("run trc caspi") - # print("line model arg=", line_model_arg) - # os.system(f"ssh {server} 'rm -f {base}/../FasterTransformer/build/bin/ms_benchmark {base}/../FasterTransformer/build/bin/{model_name}*'") - # os.system(f"rsync -v {base}/../FasterTransformer/build/bin/ms_benchmark {server}:{base}/../FasterTransformer/build/bin/ms_benchmark" ) - # os.system(f"rsync -v {base}/trc/transformer/{model_name}* {server}:{base}/../FasterTransformer/build/bin" ) - # os.system(f'rsync -v {base}/../FasterTransformer/build/lib/libtransformer-shared.so caspi:{base}/../FasterTransformer/build/lib/.') - # os.system(f"ssh {server} 'cd {base}/../FasterTransformer/build/bin && CUDA_VISIBLE_DEVICES={cuda_visible_dev} LD_LIBRARY_PATH={base}/../FasterTransformer:/usr/local/cuda-11.7/lib64 ./ms_benchmark {line_model_arg}' " ) + elif app=='trc': + #if loop count =1 app=be else app = runtime + line_model_arg=line_model_arg[:-1] + line_model_arg=line_model_arg.split()[:-2] + line_model_arg=" ".join(line_model_arg) + # +' -L '+str(loop_count) + if server=='local': + print("run trc local") + os.system(f"rsync -v {base}/trc/transformer/{model_name}* {base}/../FasterTransformer/build/bin" ) + os.system(f"cd {base}/../FasterTransformer/build/bin && CUDA_VISIBLE_DEVICES={cuda_visible_dev} LD_LIBRARY_PATH={base}/../FasterTransformer:/usr/local/cuda-11.7/lib64 ./ms_benchmark {line_model_arg}" ) + else: + print("run trc caspi") + print("line model arg=", line_model_arg) + os.system(f"ssh {server} 'rm -f {base}/../FasterTransformer/build/bin/ms_benchmark {base}/../FasterTransformer/build/bin/{model_name}*'") + os.system(f"rsync -v {base}/../FasterTransformer/build/bin/ms_benchmark {server}:{base}/../FasterTransformer/build/bin/ms_benchmark" ) + os.system(f"rsync -v {base}/trc/transformer/{model_name}* {server}:{base}/../FasterTransformer/build/bin" ) + os.system(f'rsync -v {base}/../FasterTransformer/build/lib/libtransformer-shared.so caspi:{base}/../FasterTransformer/build/lib/.') + os.system(f"ssh 
{server} 'cd {base}/../FasterTransformer/build/bin && CUDA_VISIBLE_DEVICES={cuda_visible_dev} LD_LIBRARY_PATH={base}/../FasterTransformer:/usr/local/cuda-11.7/lib64 ./ms_benchmark {line_model_arg}' " ) diff --git a/trc/transformer/models.txt b/trc/transformer/models.txt index 98e2ab1f487..cf9101ff370 100755 --- a/trc/transformer/models.txt +++ b/trc/transformer/models.txt @@ -16,16 +16,11 @@ #-b 16 -l 24 -H 16 -S 1024 -s 512 -P 1 -m bert #-b 32 -l 24 -H 16 -S 1024 -s 512 -P 1 -m bert -#-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_decoder_layer +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer #-b 1 -l 66 -s 1 -H 8 -S 512 -p 0 -m mha_x1 #-b 3 -l 66 -s 20 -H 3 -S 15 -p -m mha_x2 #-b 3 -l 66 -s 20 -t 40 -H 3 -S 15 -p 0 -m mha_x1 #-b 1 -l 66 -s 128 -H 4 -S 1024 -p 0 -m mha_x1 -<<<<<<< HEAD -#-b 1 -l 6 -s 8 -H 8 -S 1024 - 0 -m T5 -#-b 1 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -m transformer_encoder_layer -======= #-b 1 -l 6 -s 128 -H 8 -S 1024 -m T5 #-b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -m transformer_encoder_layer @@ -45,7 +40,7 @@ #-b 1 -l 12 -H 12 -S 768 -s 128 -m bert # ------------------------- Tests coverage ----------------------------------- --b 1 -l 66 -s 20 -H 3 -S 15 -p 0 -m mha_x1 +#-b 1 -l 66 -s 20 -H 3 -S 15 -p 0 -m mha_x1 #-b 1 -l 66 -s 20 -t 30 -H 3 -S 15 -p 0 -m mha_cross #-b 1 -l 66 -s 20 -H 4 -S 768 -p 0 -m mha_T5 #-b 1 -l 66 -s 20 -t 40 -H 4 -S 768 -p 0 -m mha_T5_cross @@ -58,9 +53,8 @@ #-b 1 -l 12 -H 12 -S 768 -s 128 -m bert #-b 8 -l 12 -H 4 -S 512 -s 64 -m bert # ----------------------------------------------------------------------------- ->>>>>>> origin/bert --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -m transformer_decoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -m transformer_decoder_layer #-b 1 -l 2 -H 2 -S 8 -s 20 -f 1024 -P True -m bert #-b 1 -l 12 -H 2 -S 8 -s 20 -m T5 #-b 1 -l 2 -H 2 -S 8 -s 20 -f 1024 -P True -m bert @@ -79,7 +73,7 @@ #-b 32 -l 24 -H 16 -S 1024 -s 128 -m bert #-b 8 -l 24 -H 16 -S 1024 -s 128 -m bert # -s 64 128 512 1024 -# -b 1 16 32 +#-b 1 16 32 ##-s 128 -H 4 8 -S 1024 2048 #-b 1 -l 66 -s 20 -H 3 -S 15 -p 0 -m test #-b 1 -l 66 -s 128 -H 12 -S 768 -p 0 -m test -T fp32 -W fp32 -F fp32 diff --git a/trc/transformer/train_transformer_export.py b/trc/transformer/train_transformer_export.py index 8c599fca396..49fe8bb9485 100755 --- a/trc/transformer/train_transformer_export.py +++ b/trc/transformer/train_transformer_export.py @@ -337,63 +337,91 @@ def transformer_encoder_layer_create(): def transformer_decoder_layer_create(): + ffn_fp16=False name = "transformer_decoder_layer" if (post_layernorm): - model = T5_TF.TransformerDecoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, + print("post_layernorm true") + model = TransformerDecoderLayerX(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, tgt_seq_length=tgt_seq_len,num_heads=head_num, post_layernorm_residual=True) else: - model = T5_TF.TransformerDecoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, + print("post_layernorm false") + model = TransformerDecoderLayerX(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, tgt_seq_length=tgt_seq_len,num_heads=head_num) hidden_stats = M.Tensor(np.random.normal(0., 0.5, (batch, tgt_seq_len, hid_size)), M.float32) 
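The weight packing a few lines below is what the fused decoder op consumes: the three self-attention projection weights are concatenated along the output dimension and transposed into a single hidden_size x 3*hidden_size tensor, the three biases into one 3*hidden_size vector, while for cross-attention only the k/v projections are fused and the query weight stays separate because its keys and values come from the encoder output. A small NumPy sketch of that packing, with illustrative sizes:

import numpy as np

hid = 8
# Dense weights are stored as (out_features, in_features), like model.attention.dense1.weight.
q = np.random.randn(hid, hid).astype(np.float32)
k = np.random.randn(hid, hid).astype(np.float32)
v = np.random.randn(hid, hid).astype(np.float32)
w_qkv = np.concatenate((q, k, v)).transpose()      # (hid, 3*hid), same layout as wt below
b_qkv = np.concatenate([np.zeros(hid)] * 3)        # (3*hid,), same layout as bt below

k2 = np.random.randn(hid, hid).astype(np.float32)
v2 = np.random.randn(hid, hid).astype(np.float32)
w_kv_cross = np.concatenate((k2, v2)).transpose()  # (hid, 2*hid); the cross query weight is kept separate
print(w_qkv.shape, b_qkv.shape, w_kv_cross.shape)  # (8, 24) (24,) (8, 16)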
decoder_mask = M.Tensor(np.random.normal(0., 0.5, (batch, seq, seq)), M.float32) encoder_output = M.Tensor(np.random.normal(0., 0.5, (batch, seq, hid_size)), M.float32) memory_mask = M.Tensor(np.random.normal(0., 0.5, (batch, tgt_seq_len,seq)), M.float32) - pos = M.Tensor(np.random.normal(0., 0.5, (batch, head_num, seq, tgt_seq_len)), M.float32) - encoder_pos = M.Tensor(np.random.normal(0., 0.5, (batch, head_num, seq, tgt_seq_len)), M.float32) - - # q = model.attention.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size - # k = model.attention.dense2.weight.asnumpy()#.transpose() - # v = model.attention.dense3.weight.asnumpy()#.transpose() + q = model.attention.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size + k = model.attention.dense2.weight.asnumpy()#.transpose() + v = model.attention.dense3.weight.asnumpy()#.transpose() - # w = np.concatenate((q, k, v)) # 3xhid_size x hid_size - # w = w.transpose() # hid_size x 3xhid_size - # wt = M.Tensor(w, w_compute_type) - # bq = model.attention.dense1.bias.asnumpy() - # bk = model.attention.dense2.bias.asnumpy() - # bv = model.attention.dense3.bias.asnumpy() - # bw = np.concatenate((bq, bk, bv)) #(3xhid) X 1 - # bt =M.Tensor(bw, w_compute_type) - # print('bt=',bt) - # wp = model.attention.projection.weight - # bp = model.attention.projection.bias - - # omw = model.output.mapping.weight - # opw = model.output.projection.weight - # omb = model.output.mapping.bias - # opb = model.output.projection.bias - - # gl1 = model.layernorm1.gamma - # bl1 = model.layernorm1.beta - # gl2 = model.layernorm2.gamma - # bl2 = model.layernorm2.beta + w = np.concatenate((q, k, v)) # 3xhid_size x hid_size + w = w.transpose() # hid_size x 3xhid_size + wt = M.Tensor(w, w_compute_type) + bq = model.attention.dense1.bias.asnumpy() + bk = model.attention.dense2.bias.asnumpy() + bv = model.attention.dense3.bias.asnumpy() + bw = np.concatenate((bq, bk, bv)) #(3xhid) X 1 + bt =M.Tensor(bw, w_compute_type) + wp = model.attention.projection.weight + bp = model.attention.projection.bias + + qt2 = model.cross_attention.dense1.weight#.transpose() # hid_size x hid_size + k2 = model.cross_attention.dense2.weight.asnumpy()#.transpose() + v2 = model.cross_attention.dense3.weight.asnumpy()#.transpose() + + w2 = np.concatenate((k2, v2)) # 3xhid_size x hid_size + w2 = w2.transpose() # hid_size x 3xhid_size + wt2 = M.Tensor(w2, w_compute_type) + bq2 = model.cross_attention.dense1.bias.asnumpy() + bk2 = model.cross_attention.dense2.bias.asnumpy() + bv2 = model.cross_attention.dense3.bias.asnumpy() + bw2 = np.concatenate((bq2, bk2, bv2)) #(3xhid) X 1 + bt2 =M.Tensor(bw2, w_compute_type) + wp2 = model.cross_attention.projection.weight + bp2 = model.cross_attention.projection.bias + omw = model.output.mapping.weight + opw = model.output.projection.weight + omb = model.output.mapping.bias + opb = model.output.projection.bias + gl1 = model.layernorm1.gamma + bl1 = model.layernorm1.beta + gl2 = model.layernorm2.gamma + bl2 = model.layernorm2.beta + gl3 = model.cross_attention_layernorm.gamma + bl3 = model.cross_attention_layernorm.beta suffix = str(compute_type) suffix = suffix[-2:] saveT(hidden_stats, name + "_input1.fp" + suffix) saveT(decoder_mask, name + "_input2.fp" + suffix) saveT(encoder_output, name + "_input3.fp" + suffix) saveT(memory_mask, name + "_input4.fp" + suffix) - saveT(pos, name + "_input5.fp" + suffix) - saveT(encoder_pos, name + "_input6.fp" + suffix) - - # saveT(gl1, name + "_weight1.fp" + suffix) - # saveT(bl1, name + "_weight2.fp" + suffix) - # saveT(wt, 
name + "_weight3.fp" + suffix) - # saveT(bt, name + "_weight4.fp" + suffix) - # saveT(wp, name + "_weight5.fp" + suffix) - # saveT(bp, name + "_weight6.fp" + suffix) - # saveT(gl2, name + "_weight7.fp" + suffix) - # saveT(bl2, name + "_weight8.fp" + suffix) + + saveT(gl1, name + "_weight1.fp" + suffix) + saveT(bl1, name + "_weight2.fp" + suffix) + saveT(wt, name + "_weight3.fp" + suffix) + saveT(bt, name + "_weight4.fp" + suffix) + saveT(wp, name + "_weight5.fp" + suffix) + saveT(bp, name + "_weight6.fp" + suffix) + saveT(gl2, name + "_weight7.fp" + suffix) + saveT(bl2, name + "_weight8.fp" + suffix) + saveT(qt2, name + "_weight9.fp" + suffix) + saveT(wt2, name + "_weight10.fp" + suffix) + saveT(bt2, name + "_weight11.fp" + suffix) + saveT(wp2, name + "_weight12.fp" + suffix) + saveT(bp2, name + "_weight13.fp" + suffix) + saveT(gl3, name + "_weight14.fp" + suffix) + saveT(bl3, name + "_weight15.fp" + suffix) + if(ffn_fp16): + saveTensorToHalf(omw, name + "_weight16.fp" + "16") + saveTensorToHalf(omb, name + "_weight17.fp" + "16") + saveTensorToHalf(opw, name + "_weight18.fp" + "16") + else: + saveT(omw, name + "_weight16.fp" + suffix) + saveT(omb, name + "_weight17.fp" + suffix) + saveT(opw, name + "_weight18.fp" + suffix) + saveT(opb, name + "_weight19.fp" + suffix) # # if app == 'trc': # # saveTensorToHalf(omw, name + "_weight9.fp" + "16") # # saveTensorToHalf(omb, name + "_weight10.fp" + "16") @@ -403,7 +431,27 @@ def transformer_decoder_layer_create(): # saveT(omb, name + "_weight10.fp" + suffix) # saveT(opw, name + "_weight11.fp" + suffix) # saveT(opb, name + "_weight12.fp" + suffix) - _cell_graph_executor.compile(model, hidden_stats, decoder_mask, encoder_output, memory_mask, pos, encoder_pos) + _cell_graph_executor.compile(model, hidden_stats, decoder_mask, encoder_output, memory_mask) + y = model(hidden_stats, decoder_mask, encoder_output, memory_mask) + export(model, hidden_stats, decoder_mask, encoder_output, memory_mask, file_name= name + "_fwd", file_format='MINDIR') + # if app=="ch": + print('y=',y) + print(y) + f_y=open(f'./{name}_output.txt','w') + # # out_name=get_output_encoder_layer(name + "_fwd.mindir") + # # print("name output:",out_name) + saveCalib("output1", np.array(y), f_y)#2 dims + # # print("y.shpae",np.array(y).shape) + # # saveCalib('Default/Add-op267', y, f_y)#2 dims + f_y.close() + # # saveCalib('Default/Reshape-op296', np.array(y), f_y)#2 dims + # # elif app=="trc": + saveT(y, name + "_output1.fp" + suffix) + # saveT(omw, name + "_weight9.fp" + suffix) + # saveT(omb, name + "_weight10.fp" + suffix) + # saveT(opw, name + "_weight11.fp" + suffix) + # saveT(opb, name + "_weight12.fp" + suffix) + _cell_graph_executor.compile(model, hidden_stats, decoder_mask, encoder_output, memory_mask) y = model(hidden_stats, decoder_mask, encoder_output, memory_mask) export(model, hidden_stats, decoder_mask, encoder_output, memory_mask,file_name= name + "_fwd", file_format='MINDIR') # if app=="ch": -- Gitee From fc4ccf134c5644ad182507266571115477192954 Mon Sep 17 00:00:00 2001 From: shira zaloshinki Date: Mon, 2 Jan 2023 10:12:47 +0200 Subject: [PATCH 07/39] fix decoder layer fusion --- .../kernel/nnacl/decoder_layer_parameter.h | 35 + .../kernel/nnacl/infer/decoder_layer_infer.c | 38 + .../plugin/device/cpu/kernel/nnacl/op_base.h | 5 +- mindspore/core/ops/decoder_layer.cc | 3 +- mindspore/core/ops/decoder_layer.h | 68 +- .../ops/populate/decoder_layer_populate.cc | 49 + .../delegate/tensorrt/op/decoder_tensorrt.cc | 264 ++ .../delegate/tensorrt/op/decoder_tensorrt.h | 107 + 
.../delegate/tensorrt/op/encoder_tensorrt.cc | 1 - .../delegate/tensorrt/op/encoder_tensorrt.h | 3 +- .../delegate/tensorrt/tensorrt_utils.h | 3 + .../delegate/tensorrt/tensorrt_utils.cc | 41 + .../litert/delegate/tensorrt/tensorrt_utils.h | 40 + .../optimizer/common/node_pass_extends.cc | 1 - .../optimizer/fusion/decoder_layer_fusion.cc | 401 ++- .../optimizer/fusion/decoder_layer_fusion.h | 7 +- .../001-fast_transformer.patch | 2265 +++++++++++++---- trc/transformer/cfg_bert.config | 2 +- trc/transformer/deploy.sh | 2 +- trc/transformer/ftBench.py | 4 +- trc/transformer/models.txt | 12 +- trc/transformer/train_transformer_export.py | 41 +- 22 files changed, 2625 insertions(+), 767 deletions(-) create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/decoder_layer_parameter.h create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.c create mode 100644 mindspore/lite/src/common/ops/populate/decoder_layer_populate.cc create mode 100644 mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc create mode 100644 mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h mode change 100644 => 100755 trc/transformer/cfg_bert.config diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/decoder_layer_parameter.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/decoder_layer_parameter.h new file mode 100644 index 00000000000..05872f3a240 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/decoder_layer_parameter.h @@ -0,0 +1,35 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_NNACL_DECODER_LAYER_PARAMETER_H_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_NNACL_DECODER_LAYER_PARAMETER_H_ + +#include "nnacl/op_base.h" + +typedef struct DecoderLayerParameter { + OpParameter op_parameter_; + int head_num_; + int head_size_; + bool post_layernorm_; + float eps_layernorm1_; + float eps_layernorm2_; + float eps_layernorm3_; + int ffn_hidden_size_; + bool position_bias1_; + bool position_bias2_; +} DecoderLayerParameter; + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_NNACL_DECODER_LAYER_PARAMETER_H_ diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.c new file mode 100644 index 00000000000..f2f9ac344fe --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.c @@ -0,0 +1,38 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include "nnacl/infer/decoder_layer_infer.h" +#include "nnacl/infer/infer_register.h" +#include "nnacl/decoder_layer_parameter.h" + +int DecoderLayerInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size, + OpParameter *parameter) { + printf("DecoderLayerInferShape\n" ); + int check_ret = CheckAugmentWithMinSize(inputs, inputs_size, outputs, outputs_size, parameter, C23NUM, C1NUM); + if (check_ret != NNACL_OK) { + return check_ret; + } + const TensorC *input = inputs[FIRST_INPUT]; + TensorC *output0 = outputs[FIRST_INPUT]; + SetDataTypeFormat(output0, input); + if (!InferFlag(inputs, inputs_size)) { + return NNACL_INFER_INVALID; + } + SetShapeTensor(output0, input); + return NNACL_OK; +} + +REG_INFER(DecoderLayer, PrimType_DecoderLayer, DecoderLayerInferShape) diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h index 1dfa087d795..43f9f2c55fa 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h @@ -41,8 +41,10 @@ #define C12NUM 12 #define C13NUM 13 #define C14NUM 14 +#define C15NUM 15 #define C16NUM 16 #define C18NUM 18 +#define C19NUM 19 #define C20NUM 20 #define C21NUM 21 #define C22NUM 22 @@ -528,8 +530,9 @@ enum PrimType { PrimType_Log1p = 212, PrimType_TensorScatterAdd = 213, PrimType_EncoderLayer = 214, + PrimType_DecoderLayer = 215, PrimType_MIN = PrimType_NONE, - PrimType_MAX = PrimType_EncoderLayer + 1, + PrimType_MAX = PrimType_DecoderLayer + 1, // inner operators. PrimType_Inner_ToFormat = 10000, diff --git a/mindspore/core/ops/decoder_layer.cc b/mindspore/core/ops/decoder_layer.cc index baedf3ec902..22c47ff8d89 100644 --- a/mindspore/core/ops/decoder_layer.cc +++ b/mindspore/core/ops/decoder_layer.cc @@ -88,13 +88,14 @@ bool DecoderLayer::get_position_bias2() const { return GetValue(value_ptr); } -void DecoderLayer::Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, int64_t ffn_hidden_size, +void DecoderLayer::Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, float eps_layernorm3, int64_t ffn_hidden_size, bool position_bias1, bool position_bias2, bool post_layernorm = false) { this->set_head_num(head_num); this->set_head_size(head_size); this->set_post_layernorm(post_layernorm); this->set_eps_layernorm1(eps_layernorm1); this->set_eps_layernorm2(eps_layernorm2); + this->set_eps_layernorm3(eps_layernorm3); this->set_ffn_hidden_size(ffn_hidden_size); this->set_position_bias1(position_bias1); this->set_position_bias2(position_bias2); diff --git a/mindspore/core/ops/decoder_layer.h b/mindspore/core/ops/decoder_layer.h index c09ce14b7d5..71425ab63d1 100644 --- a/mindspore/core/ops/decoder_layer.h +++ b/mindspore/core/ops/decoder_layer.h @@ -32,8 +32,29 @@ class MIND_API DecoderLayer : public BaseOperator { MIND_API_BASE_MEMBER(DecoderLayer); /// \brief Constructor. 
DecoderLayer() : BaseOperator(kNameDecoderLayer) { - InitIOName({"input", "gamma1", "beta1", "weight_attn_qkv", "bias_attn_qkv", "mask", "weight_attn_o", "bias_attn_o", - "gamma2", "beta2", "weight_m", "bias_m", "weight_p", "bias_p"}, + InitIOName({"input", + "gamma1", + "beta1", + "weight_qkv", + "bias_attn_qkv", + "input_mask", + "weight_attn_o", + "bias_attn_o", + "gamma2", + "beta2", + "encoder_output", + "weight_attn_q", + "weight_attn_kv", + "bias_attn_cross_qkv", + "cross_mask", + "weight_attn_cross_o", + "bias_attn_cross_o", + "gamma3", + "beta3", + "weight_m", + "bias_m", + "weight_p", + "bias_p"}, {"output"}); } /// \brief Initialize DecoderLayer op. @@ -45,27 +66,28 @@ class MIND_API DecoderLayer : public BaseOperator { /// \param[in] ffn_hidden_size Define ffn hidden size. /// \param[in] position_bias1 Define ffn position_bias1. /// \param[in] position_bias2 Define ffn position_bias2. - void Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, int64_t ffn_hidden_size, - bool position_bias1, bool position_bias2, bool post_layernorm); - void set_head_num(int64_t head_num); - void set_head_size(int64_t head_size); - void set_post_layernorm(bool post_layernorm); - void set_eps_layernorm1(float eps_layernorm1); - void set_eps_layernorm2(float eps_layernorm2); - void set_eps_layernorm3(float eps_layernorm2); - void set_ffn_hidden_size(int64_t ffn_hidden_size); - void set_position_bias1(bool position_bias1); - void set_position_bias2(bool position_bias2); - int64_t get_head_num() const; - int64_t get_head_size() const; - bool get_post_layernorm() const; - float get_eps_layernorm1() const; - float get_eps_layernorm2() const; - float get_eps_layernorm3() const; - int64_t get_ffn_hidden_size() const; - bool get_position_bias1() const; - bool get_position_bias2() const; - }; + void Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, float eps_layernorm3, + int64_t ffn_hidden_size, bool position_bias1, bool position_bias2, + bool post_layernorm); + void set_head_num(int64_t head_num); + void set_head_size(int64_t head_size); + void set_post_layernorm(bool post_layernorm); + void set_eps_layernorm1(float eps_layernorm1); + void set_eps_layernorm2(float eps_layernorm2); + void set_eps_layernorm3(float eps_layernorm2); + void set_ffn_hidden_size(int64_t ffn_hidden_size); + void set_position_bias1(bool position_bias1); + void set_position_bias2(bool position_bias2); + int64_t get_head_num() const; + int64_t get_head_size() const; + bool get_post_layernorm() const; + float get_eps_layernorm1() const; + float get_eps_layernorm2() const; + float get_eps_layernorm3() const; + int64_t get_ffn_hidden_size() const; + bool get_position_bias1() const; + bool get_position_bias2() const; +}; } // namespace ops } // namespace mindspore #endif // LITE_MINDSPORE_LITE_TOOLS_CONVERTER_OPS_ATTENTION_H_ diff --git a/mindspore/lite/src/common/ops/populate/decoder_layer_populate.cc b/mindspore/lite/src/common/ops/populate/decoder_layer_populate.cc new file mode 100644 index 00000000000..125f5949de9 --- /dev/null +++ b/mindspore/lite/src/common/ops/populate/decoder_layer_populate.cc @@ -0,0 +1,49 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "src/common/ops/populate/populate_register.h"
+#include "nnacl/decoder_layer_parameter.h"
+
+using mindspore::schema::PrimitiveType_DecoderLayer;
+
+namespace mindspore {
+namespace lite {
+OpParameter *PopulateDecoderLayerParameter(const void *prim) {
+  auto primitive = static_cast<const schema::Primitive *>(prim);
+  MS_CHECK_TRUE_RET(primitive != nullptr, nullptr);
+  auto value = primitive->value_as_DecoderLayer();
+  MS_CHECK_TRUE_MSG(value != nullptr, nullptr, "value is nullptr.");
+  auto *param = reinterpret_cast<DecoderLayerParameter *>(malloc(sizeof(DecoderLayerParameter)));
+  if (param == nullptr) {
+    MS_LOG(ERROR) << "malloc DecoderLayerParameter failed.";
+    return nullptr;
+  }
+  memset(param, 0, sizeof(DecoderLayerParameter));
+  param->op_parameter_.type_ = primitive->value_type();
+  param->head_num_ = value->head_num();
+  param->head_size_ = value->head_size();
+  param->post_layernorm_ = value->post_layernorm();
+  param->eps_layernorm1_ = value->eps_layernorm1();
+  param->eps_layernorm2_ = value->eps_layernorm2();
+  param->eps_layernorm3_ = value->eps_layernorm3();
+  param->position_bias1_ = value->position_bias1();
+  param->position_bias2_ = value->position_bias2();
+  return reinterpret_cast<OpParameter *>(param);
+}
+
+REG_POPULATE(PrimitiveType_DecoderLayer, PopulateDecoderLayerParameter, SCHEMA_CUR)
+} // namespace lite
+} // namespace mindspore
+
+
diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc
new file mode 100644
index 00000000000..2e65299c484
--- /dev/null
+++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc
@@ -0,0 +1,264 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h" +#include +#include +#include +#include +#include +#include +#include +#include "NvInferRuntimeCommon.h" +#include "ops/decoder_layer.h" +#include "src/fastertransformer/kernels/unfused_attention_kernels.h" +#include "src/fastertransformer/kernels/activation_kernels.h" +#include "src/fastertransformer/utils/cuda_utils.h" +#include "src/fastertransformer/utils/allocator.h" +#include "src/fastertransformer/kernels/layernorm_kernels.h" + +namespace mindspore::lite { + +namespace { +constexpr std::size_t kTwo = 2; +constexpr std::size_t kThree = 3; + +} // namespace + +int DecoderTensorRT::IsSupport(const BaseOperatorPtr &base_operator, const std::vector &in_tensors, + const std::vector &out_tensors) { + if (in_tensors.size() != C23NUM) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return RET_ERROR; + } + // if (out_tensors.size() != 1) { + // MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + // return RET_ERROR; + // } + return RET_OK; +} +nvinfer1::ITensor *DecoderTensorRT::castTensor(TensorRTContext *ctx, const TensorInfo &ms_tensor, + const std::string &op_name) { + if (ctx == nullptr || ctx->network() == nullptr) { + MS_LOG(ERROR) << "context or network is null for ConvertConstantTensor"; + return nullptr; + } + nvinfer1::Dims dims = ConvertCudaDims(ms_tensor.Shape()); + if (dims.nbDims == -1) { + MS_LOG(INFO) << ms_tensor.Name() << " ConvertCudaDims failed, convert as scalar."; + dims.nbDims = 1; + dims.d[0] = 1; + } + nvinfer1::DataType data_type = ConvertDataType(ms_tensor.DataType()); + if (!ms_tensor.IsConst()) { + MS_LOG(ERROR) << "ConvertConstantTensor from a MSTensor with nullptr data: " << ms_tensor.Name(); + return nullptr; + } + nvinfer1::Weights weights{data_type, ms_tensor.Data(), ms_tensor.ElementNum()}; + if (data_type == nvinfer1::DataType::kFLOAT && is_ffn_fp16_) { + void *data_float16 = malloc(ms_tensor.ElementNum() * sizeof(float)); + if (data_float16 == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return nullptr; + } + auto src = static_cast(ms_tensor.Data()); + auto dst = static_cast(data_float16); + for (int i = 0; i < ms_tensor.ElementNum(); i++) { + dst[i] = static_cast(src[i]); + } + weights.values = data_float16; + } + nvinfer1::IConstantLayer *constant_tensor = ctx->network()->addConstant(dims, weights); + if (constant_tensor == nullptr) { + MS_LOG(ERROR) << "create constant_tensor failed."; + return nullptr; + } + ctx->RegisterLayer(constant_tensor, ms_tensor.Name() + "_" + op_name); + auto tensor_ptr = constant_tensor->getOutput(0); + return tensor_ptr; +} +int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (ctx == nullptr || ctx->network() == nullptr) { + MS_LOG(ERROR) << "context or network is invalid"; + return RET_ERROR; + } + auto decoder_op = AsOps(); + if (decoder_op == nullptr) { + MS_LOG(ERROR) << "op action convert failed"; + return RET_ERROR; + } + fastertransformer::decoderParamT params; + memset_s(¶ms, sizeof(params), 0, sizeof(params)); + params.head_num = decoder_op->get_head_num(); + params.head_size = decoder_op->get_head_size(); + params.layernorm_post = decoder_op->get_post_layernorm(); + params.eps1 = decoder_op->get_eps_layernorm1(); + params.eps2 = decoder_op->get_eps_layernorm2(); + params.eps3 = decoder_op->get_eps_layernorm3(); + params.ffn_hidden_size = decoder_op->get_ffn_hidden_size(); + params.ffn_fp16 = is_ffn_fp16_; + params.attn1.position_bias = 
decoder_op->get_position_bias1(); + params.attn2.position_bias = decoder_op->get_position_bias2(); + params.cublas_handle=GetCublasHandle(); + params.attn1.qkv_bias = !params.attn1.position_bias; + params.attn2.qkv_bias = !params.attn2.position_bias; + params.attn1.projection_bias = !params.attn1.position_bias; + params.attn2.projection_bias = !params.attn2.position_bias; + params.attn1.is_cross = false; + params.attn2.is_cross = true; + + params.hidden_size = params.head_num * params.head_size; + auto compute_type = runtime_->GetRuntimePrecisionMode(); + if (is_ffn_fp16_) { + size_t start_fp16 = C15NUM; + size_t end_fp16 = C19NUM; + for (size_t i = 0; i < in_tensors_.size(); i++) { + auto in_tensor = input(ctx, i); + if (in_tensors_[i].IsConst() || in_tensor.trt_tensor_ == nullptr) { + if (i > start_fp16 && i < end_fp16) { + in_tensor.trt_tensor_ = castTensor(ctx, in_tensors_[i], op_name_); + ctx->RegisterTensor(in_tensor, in_tensors_[i].Name()); + } else { + in_tensor.trt_tensor_ = lite::ConvertConstantTensor(ctx, in_tensors_[i], op_name_); + ctx->RegisterTensor(in_tensor, in_tensors_[i].Name()); + } + } + } + } + nvinfer1::ITensor *input_tensor = input(ctx, 0).trt_tensor_; + auto plugin = std::make_shared(input_tensor->getName(), compute_type, params, GetCublasLtHandle(), device_id_); + const int input_number = inputs().size(); + nvinfer1::ITensor *inputTensors[input_number]; + for (int i = 0; i < input_number; i++) { + inputTensors[i] = input(ctx, i).trt_tensor_; + } + nvinfer1::IPluginV2Layer *decoder_layer = ctx->network()->addPluginV2(inputTensors, input_number, *plugin); + if (decoder_layer == nullptr) { + MS_LOG(ERROR) << "add decoder op failed for TensorRT."; + return RET_ERROR; + } + decoder_layer->setName((op_name_ + "plugin_decoder_layer").c_str()); + nvinfer1::ITensor *decoder_tensor = decoder_layer->getOutput(0); + ctx->RegisterTensor(ITensorHelper{decoder_tensor, Format::NCHW, true}, out_tensors_[0].Name()); + this->layer_ = decoder_layer; + return RET_OK; +} + +REGISTER_TENSORRT_PLUGIN(DecoderPluginCreater); +template class TensorRTPluginCreater; +template +nvinfer1::PluginFieldCollection TensorRTPluginCreater::field_collection_{}; +template +std::vector TensorRTPluginCreater::fields_; + +int DecoderPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, + cudaStream_t stream) noexcept { + if (compute_type_ == RuntimePrecisionMode_FP16) { + return RunCudaDecoder(inputDesc, outputDesc, inputs, outputs, workspace, stream, + CUBLAS_GEMM_DEFAULT_TENSOR_OP); + } else { + return RunCudaDecoder(inputDesc, outputDesc, inputs, outputs, workspace, stream, + CUBLAS_GEMM_DEFAULT_TENSOR_OP); + } +} +template +int DecoderPlugin::RunCudaDecoder(const nvinfer1::PluginTensorDesc *inputDesc, + const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, + void *const *outputs, void *workspace, cudaStream_t stream, cublasGemmAlgo_t algoId) { + params_.stream = stream; + params_.algo = algoId; + void *inputs_forward[] = { + const_cast(inputs[0]), const_cast(inputs[1]), const_cast(inputs[2]), + const_cast(inputs[3]), const_cast(inputs[4]), const_cast(inputs[5]), + const_cast(inputs[6]), const_cast(inputs[7]), const_cast(inputs[8]), + const_cast(inputs[9]), const_cast(inputs[10]), const_cast(inputs[11]), + const_cast(inputs[12]), const_cast(inputs[13]), const_cast(inputs[14]), const_cast(inputs[15]), + const_cast(inputs[16]), const_cast(inputs[17]), 
const_cast(inputs[18]), + const_cast(inputs[19]),const_cast(inputs[20]), const_cast(inputs[21]), const_cast(inputs[22])}; + void *outputs_forward[] = {outputs[0]}; + fastertransformer::forwardDecoder(inputs_forward, num_of_inputs_, outputs_forward, num_of_outputs_, ¶ms_, + workspace); +return RET_OK; +} + +bool DecoderPlugin::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *tensorsDesc, int nbInputs, + int nbOutputs) noexcept { + auto type = (compute_type_ == RuntimePrecisionMode_FP16) ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT; + for (int i = 0; i < pos; i++) { + if (tensorsDesc[pos].type != tensorsDesc[i].type) return false; + } + bool res = (tensorsDesc[pos].format == nvinfer1::TensorFormat::kLINEAR) && (tensorsDesc[pos].type == type); + return res; +} + +void DecoderPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc *out, int nbOutputs) noexcept { + const int request_batch_size = static_cast(in[0].desc.dims.d[0]); + const int request_src_seq_len = static_cast(in[0].desc.dims.d[1]); + const int request_tgt_seq_len = request_src_seq_len; + params_.batch_size = request_batch_size; + params_.src_seq_len = request_src_seq_len; + params_.tgt_seq_len = request_tgt_seq_len; + num_of_inputs_ = nbInputs; + num_of_outputs_ = nbOutputs; +} +size_t DecoderPlugin::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, + const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const noexcept { + + if (compute_type_ == RuntimePrecisionMode_FP16) { + return fastertransformer::GetDecoderLayerWorkspaceSize(¶ms_); + } else { + return fastertransformer::GetDecoderLayerWorkspaceSize(¶ms_); + } +} + +nvinfer1::DimsExprs DecoderPlugin::getOutputDimensions(int32_t index, const nvinfer1::DimsExprs *inputs, + int nbInputDims, nvinfer1::IExprBuilder &exprBuilder) noexcept { + nvinfer1::DimsExprs dims; + if (index == 0) { + int num_dims = inputs[0].nbDims; + dims.nbDims = num_dims; + if (num_dims == INPUT_SIZE2) { + dims.d[0] = exprBuilder.constant(inputs[0].d[0]->getConstantValue()); + dims.d[1] = exprBuilder.constant(inputs[0].d[1]->getConstantValue()); + } else if (num_dims == INPUT_SIZE3) { + dims.d[0] = exprBuilder.constant(inputs[0].d[0]->getConstantValue()); + dims.d[1] = exprBuilder.constant(inputs[0].d[1]->getConstantValue()); + dims.d[2] = exprBuilder.constant(inputs[0].d[2]->getConstantValue()); + } + } + return dims; +} + +nvinfer1::IPluginV2DynamicExt *DecoderPlugin::clone() const noexcept { + auto *plugin = new DecoderPlugin(*this); // TODO(haim) CopyConstructor + if (plugin == nullptr) { + MS_LOG(ERROR) << "plugin is null"; + return nullptr; + } + plugin->setPluginNamespace(name_space_.c_str()); + return plugin; +} + +size_t DecoderPlugin::getSerializationSize() const noexcept { return sizeof(int) + sizeof(fastertransformer::decoderParamT); } + +void DecoderPlugin::serialize(void *buffer) const noexcept { + SerializeValue(&buffer, &compute_type_, sizeof(int)); + SerializeValue(&buffer, ¶ms_, sizeof(fastertransformer::decoderParamT)); +} +REGISTER_TENSORRT_CREATOR(ops::kNameDecoderLayer, DecoderTensorRT) +} // namespace mindspore::lite \ No newline at end of file diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h new file mode 100644 index 00000000000..9b1f5456193 --- /dev/null +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h @@ -0,0 +1,107 @@ +/** + * 
Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_EXTENDRT_DELEGATE_TENSORRT_OP_DECODER_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_EXTENDRT_DELEGATE_TENSORRT_OP_DECODER_TENSORRT_H_ + +#include +#include +#include "src/extendrt/delegate/tensorrt/op/tensorrt_op.h" +#include "src/extendrt/delegate/tensorrt/op/tensorrt_plugin.h" +#include "src/extendrt/delegate/tensorrt/cuda_impl/cudnn_utils.h" +#include "src/fastertransformer/layers/decoder_layers/decoder.h" +#include "src/extendrt/delegate/tensorrt/tensorrt_utils.h" +namespace mindspore::lite { +class DecoderTensorRT : public TensorRTOp { + public: + DecoderTensorRT(const BaseOperatorPtr &base_operator, const std::vector &in_tensors, + const std::vector &out_tensors, std::string name) + : TensorRTOp(base_operator, in_tensors, out_tensors, name) {} + + ~DecoderTensorRT() override = default; + bool IsWeightInputHanledInner() const override { return is_ffn_fp16_; } + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const BaseOperatorPtr &base_operator, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + nvinfer1::ITensor *castTensor(TensorRTContext *ctx, const TensorInfo &ms_tensor, const std::string &op_name); + bool is_ffn_fp16_ = false; +}; + +constexpr auto DECODER_PLUGIN_NAME{"DecoderPlugin"}; +class DecoderPlugin : public TensorRTPlugin { + public: + DecoderPlugin(const std::string name, int compute_type, fastertransformer::decoderParamT params, + cublasLtHandle_t cublaslt_handle, uint32_t device_id) + : TensorRTPlugin(name, std::string(DECODER_PLUGIN_NAME), device_id), + compute_type_(compute_type), + params_(params), + cublaslt_handle_(cublaslt_handle) + {} + + DecoderPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) + : TensorRTPlugin(std::string(name), std::string(DECODER_PLUGIN_NAME)) { + const nvinfer1::PluginField *fields = fc->fields; + compute_type_ = static_cast(fields[0].data)[0]; + params_ = static_cast(fields[1].data)[0]; + } + + DecoderPlugin(const char *name, const void *serialData, size_t serialLength) + : TensorRTPlugin(std::string(name), std::string(DECODER_PLUGIN_NAME)) { + DeserializeValue(&serialData, &serialLength, &compute_type_, sizeof(int)); + DeserializeValue(&serialData, &serialLength, ¶ms_, sizeof(fastertransformer::decoderParamT)); + } + + DecoderPlugin() = delete; + + ~DecoderPlugin() override {} + + nvinfer1::IPluginV2DynamicExt *clone() const noexcept override; + int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override; + size_t getSerializationSize() const noexcept override; + void serialize(void *buffer) const noexcept override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, + const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const noexcept override; + 
nvinfer1::DimsExprs getOutputDimensions(int index, const nvinfer1::DimsExprs *inputs, int nbInputDims, + nvinfer1::IExprBuilder &exprBuilder) noexcept override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc *out, int nbOutputs) noexcept override; + bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *tensorsDesc, int nbInputs, + int nbOutputs) noexcept override; + + private: + const std::string layer_name_; + std::string name_space_; + int compute_type_; + mutable fastertransformer::decoderParamT params_; + cublasLtHandle_t cublaslt_handle_; + int num_of_inputs_; + int num_of_outputs_; + + template + int RunCudaDecoder(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream, + cublasGemmAlgo_t algoId); +}; +class DecoderPluginCreater : public TensorRTPluginCreater { + public: + DecoderPluginCreater() : TensorRTPluginCreater(std::string(DECODER_PLUGIN_NAME)) {} +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_EXTENDRT_DELEGATE_TENSORRT_OP_DECODER_TENSORRT_H_ diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc index d415d6d59b1..5c3664b9465 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc @@ -91,7 +91,6 @@ nvinfer1::ITensor *EncoderTensorRT::castTensor(TensorRTContext *ctx, const Tenso auto tensor_ptr = constant_tensor->getOutput(0); return tensor_ptr; } - int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { if (ctx == nullptr || ctx->network() == nullptr) { MS_LOG(ERROR) << "context or network is invalid"; diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h index 45da8ab88a5..fd9b334021e 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h @@ -38,7 +38,8 @@ class EncoderTensorRT : public TensorRTOp { const std::vector &out_tensors) override; private: - nvinfer1::ITensor *castTensor(TensorRTContext *ctx, const TensorInfo &ms_tensor, const std::string &op_name); + nvinfer1::ITensor *castTensor(TensorRTContext *ctx, const TensorInfo &ms_tensor, + const std::string &op_name); bool is_ffn_fp16_ = false; }; diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/tensorrt_utils.h b/mindspore/lite/src/extendrt/delegate/tensorrt/tensorrt_utils.h index 18baf21654f..5454bf70984 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/tensorrt_utils.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/tensorrt_utils.h @@ -216,5 +216,8 @@ void Data2Vector(std::vector *dst, const void *src) { dst->at(i) = static_cast(src_ptr[i]); } } + +// nvinfer1::ITensor *castTensorFp32ToFp16(TensorRTContext *ctx, const TensorInfo &ms_tensor, +// const std::string &op_name); } // namespace mindspore::lite #endif // MINDSPORE_LITE_SRC_EXTENDRT_DELEGATE_TENSORRT_TENSORRT_UTILS_H_ diff --git a/mindspore/lite/src/litert/delegate/tensorrt/tensorrt_utils.cc b/mindspore/lite/src/litert/delegate/tensorrt/tensorrt_utils.cc index ae431f5d36d..64cca0083ee 100644 --- a/mindspore/lite/src/litert/delegate/tensorrt/tensorrt_utils.cc +++ 
b/mindspore/lite/src/litert/delegate/tensorrt/tensorrt_utils.cc @@ -826,6 +826,47 @@ void DebugDims(const std::string &key, const nvinfer1::Dims &dims) { } } +// nvinfer1::ITensor *castTensorFp32ToFp16(TensorRTContext *ctx, const TensorInfo &ms_tensor, +// const std::string &op_name) { +// if (ctx == nullptr || ctx->network() == nullptr) { +// MS_LOG(ERROR) << "context or network is null for ConvertConstantTensor"; +// return nullptr; +// } +// nvinfer1::Dims dims = ConvertCudaDims(ms_tensor.Shape()); +// if (dims.nbDims == -1) { +// MS_LOG(INFO) << ms_tensor.Name() << " ConvertCudaDims failed, convert as scalar."; +// dims.nbDims = 1; +// dims.d[0] = 1; +// } +// nvinfer1::DataType data_type = ConvertDataType(ms_tensor.DataType()); +// if (!ms_tensor.IsConst()) { +// MS_LOG(ERROR) << "ConvertConstantTensor from a MSTensor with nullptr data: " << ms_tensor.Name(); +// return nullptr; +// } +// nvinfer1::Weights weights{data_type, ms_tensor.Data(), ms_tensor.ElementNum()}; +// if (data_type == nvinfer1::DataType::kFLOAT && is_ffn_fp16_) { +// void *data_float16 = malloc(ms_tensor.ElementNum() * sizeof(float)); +// if (data_float16 == nullptr) { +// MS_LOG(ERROR) << "Malloc buffer failed."; +// return nullptr; +// } +// auto src = static_cast(ms_tensor.Data()); +// auto dst = static_cast(data_float16); +// for (int i = 0; i < ms_tensor.ElementNum(); i++) { +// dst[i] = static_cast(src[i]); +// } +// weights.values = data_float16; +// } +// nvinfer1::IConstantLayer *constant_tensor = ctx->network()->addConstant(dims, weights); +// if (constant_tensor == nullptr) { +// MS_LOG(ERROR) << "create constant_tensor failed."; +// return nullptr; +// } +// ctx->RegisterLayer(constant_tensor, ms_tensor.Name() + "_" + op_name); +// auto tensor_ptr = constant_tensor->getOutput(0); +// return tensor_ptr; +// } + template <> nvinfer1::DataType GetNvinferDataType() { return nvinfer1::DataType::kFLOAT; diff --git a/mindspore/lite/src/litert/delegate/tensorrt/tensorrt_utils.h b/mindspore/lite/src/litert/delegate/tensorrt/tensorrt_utils.h index efaed1c5d54..d40bfd73da3 100644 --- a/mindspore/lite/src/litert/delegate/tensorrt/tensorrt_utils.h +++ b/mindspore/lite/src/litert/delegate/tensorrt/tensorrt_utils.h @@ -187,5 +187,45 @@ void Data2Vector(std::vector *dst, const void *src) { dst->at(i) = static_cast(src_ptr[i]); } } +// nvinfer1::ITensor *castTensorFp32ToFp16(TensorRTContext *ctx, const TensorInfo &ms_tensor, +// const std::string &op_name) { +// if (ctx == nullptr || ctx->network() == nullptr) { +// MS_LOG(ERROR) << "context or network is null for ConvertConstantTensor"; +// return nullptr; +// } +// nvinfer1::Dims dims = ConvertCudaDims(ms_tensor.Shape()); +// if (dims.nbDims == -1) { +// MS_LOG(INFO) << ms_tensor.Name() << " ConvertCudaDims failed, convert as scalar."; +// dims.nbDims = 1; +// dims.d[0] = 1; +// } +// nvinfer1::DataType data_type = ConvertDataType(ms_tensor.DataType()); +// if (!ms_tensor.IsConst()) { +// MS_LOG(ERROR) << "ConvertConstantTensor from a MSTensor with nullptr data: " << ms_tensor.Name(); +// return nullptr; +// } +// nvinfer1::Weights weights{data_type, ms_tensor.Data(), ms_tensor.ElementNum()}; +// if (data_type == nvinfer1::DataType::kFLOAT && is_ffn_fp16_) { +// void *data_float16 = malloc(ms_tensor.ElementNum() * sizeof(float)); +// if (data_float16 == nullptr) { +// MS_LOG(ERROR) << "Malloc buffer failed."; +// return nullptr; +// } +// auto src = static_cast(ms_tensor.Data()); +// auto dst = static_cast(data_float16); +// for (int i = 0; i < ms_tensor.ElementNum(); 
i++) {
+//       dst[i] = static_cast(src[i]);
+//     }
+//     weights.values = data_float16;
+//   }
+//   nvinfer1::IConstantLayer *constant_tensor = ctx->network()->addConstant(dims, weights);
+//   if (constant_tensor == nullptr) {
+//     MS_LOG(ERROR) << "create constant_tensor failed.";
+//     return nullptr;
+//   }
+//   ctx->RegisterLayer(constant_tensor, ms_tensor.Name() + "_" + op_name);
+//   auto tensor_ptr = constant_tensor->getOutput(0);
+//   return tensor_ptr;
+// }
 }  // namespace mindspore::lite
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_UTILS_H_
diff --git a/mindspore/lite/tools/optimizer/common/node_pass_extends.cc b/mindspore/lite/tools/optimizer/common/node_pass_extends.cc
index bcfff797a84..42bad5c5aa2 100644
--- a/mindspore/lite/tools/optimizer/common/node_pass_extends.cc
+++ b/mindspore/lite/tools/optimizer/common/node_pass_extends.cc
@@ -55,7 +55,6 @@ bool LiteNodePass::Run(const FuncGraphPtr &func_graph) {
       continue;
     }
     (void)seen_node.insert(node);
-    // std::cout << "node->debug_info=" << node->debug_info() << std::endl;
     AnfNodePtr new_node = Run(func_graph, node);
     bool change = (new_node != nullptr);
     if (new_node != nullptr && new_node != node) {
diff --git a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc
index 9b066d80489..6c61f05bb44 100644
--- a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc
+++ b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc
@@ -98,30 +98,6 @@ bool DecoderLayerFusion::Init() const {
   return true;
 }
-// STATUS GetIntParameterData(const ParameterPtr &param_ptr, std::vector<int> *result) {
-//   if (param_ptr == nullptr || !param_ptr->has_default()) {
-//     MS_LOG(DEBUG) << "param not have default";
-//     return RET_ERROR;
-//   }
-//   auto default_param = param_ptr->default_param();
-//   if (default_param == nullptr || !utils::isa<tensor::TensorPtr>(default_param)) {
-//     MS_LOG(DEBUG) << "tensor_info is not tensor::TensorPtr";
-//     return RET_ERROR;
-//   }
-//   auto default_param_ptr = utils::cast<tensor::TensorPtr>(default_param);
-//   if (default_param_ptr->data_type() != kNumberTypeInt32 && default_param_ptr->data_type() != kNumberTypeInt) {
-//     MS_LOG(DEBUG) << "default param is not int";
-//     return RET_ERROR;
-//   }
-//   auto ptr = reinterpret_cast<int *>(default_param_ptr->data_c());
-//   int64_t shape_size =
-//     std::accumulate(default_param_ptr->shape().begin(), default_param_ptr->shape().end(), 1, std::multiplies<>());
-//   for (int64_t i = 0; i < shape_size; i++) {
-//     result->emplace_back(ptr[i]);
-//   }
-//   return RET_OK;
-// }
-
 VectorRef DecoderLayerFusion::getTuple(bool post_layernorm, bool layernorm_fusion = false,
                                        bool is_position_bias = false) const {
   auto is_reshape1 = std::make_shared<CondVar>(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-decoder");
@@ -129,9 +105,6 @@ VectorRef DecoderLayerFusion::getTuple(bool post_layernorm, bool layernorm_fusio
   auto var1 = std::make_shared<Var>("var1-reshape");
   MS_CHECK_TRUE_RET(var1 != nullptr, {});
   auto reshape1 = VectorRef({is_reshape1, hidden_stats_, var1});
-  if (post_layernorm) {
-    return reshape1;
-  }
   VectorRef layer_norm, tuple;
   if (layernorm_fusion) {
     return DefineLayerNorm(reshape1, gamma1_, beta1_);
@@ -174,6 +147,11 @@ VectorRef DecoderLayerFusion::DefinePatternDecoderLayer(bool post_layernorm = tr
   std::cout << "DefinePatternDecoderLayer post=" << post_layernorm << " layernorm_fusion=" << layernorm_fusion
             << std::endl;
   std::cout << "attention no position bias" << std::endl;
+  auto is_reshape1 = std::make_shared<CondVar>(std::bind(IsOpType, p1, prim::kPrimReshape),
"reshape-decoder"); + MS_CHECK_TRUE_RET(is_reshape1 != nullptr, {}); + auto var1 = std::make_shared("var1-reshape"); + MS_CHECK_TRUE_RET(var1 != nullptr, {}); + auto reshape1 = VectorRef({is_reshape1, hidden_stats_, var1}); auto attention = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), getTuple(post_layernorm, layernorm_fusion, is_position_bias), getTuple(post_layernorm, layernorm_fusion, is_position_bias), weight_attn_qkv_, @@ -182,10 +160,12 @@ VectorRef DecoderLayerFusion::DefinePatternDecoderLayer(bool post_layernorm = tr auto var_tuple4 = std::make_shared("var_tuple4"); auto tuple4 = VectorRef({is_tuple4, attention, var_tuple4}); auto is_add2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is_add2"); - auto add2 = VectorRef({is_add2, getTuple(post_layernorm, layernorm_fusion, is_position_bias), tuple4}); - // } else if (layernorm_fusion) { - // add = VectorRef({is_add, getTuple(post_layernorm, layernorm_fusion), tuple3}); - VectorRef layer_norm2, tuple2, tuple3, layer_norm3; + VectorRef add2, layer_norm2, tuple2, tuple3, layer_norm3, add3, reshape4; + if (post_layernorm) { + add2 = VectorRef({is_add2, getTuple(post_layernorm, layernorm_fusion, is_position_bias), tuple4}); + } else { + add2 = VectorRef({is_add2, reshape1, tuple4}); + } if (layernorm_fusion) { layer_norm2 = DefineLayerNorm(add2, gamma2_, beta2_); tuple2 = layer_norm2; @@ -207,7 +187,11 @@ VectorRef DecoderLayerFusion::DefinePatternDecoderLayer(bool post_layernorm = tr auto tuple5 = VectorRef({is_tuple5, attention_cross, var_tuple5}); auto is_add3 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is_add3"); MS_CHECK_TRUE_RET(is_add2 != nullptr, {}); - auto add3 = VectorRef({is_add3, tuple2, tuple5}); + if (post_layernorm) { + add3 = VectorRef({is_add3, tuple2, tuple5}); + } else { + add3 = VectorRef({is_add3, add2, tuple5}); + } if (layernorm_fusion) { layer_norm3 = DefineLayerNorm(add3, gamma3_, beta3_); tuple3 = layer_norm3; @@ -233,12 +217,16 @@ VectorRef DecoderLayerFusion::DefinePatternDecoderLayer(bool post_layernorm = tr MS_CHECK_TRUE_RET(is_reshape4 != nullptr, {}); auto var4 = std::make_shared("var4"); MS_CHECK_TRUE_RET(var4 != nullptr, {}); - auto reshape4 = VectorRef({is_reshape4, tuple3, var4}); + if (post_layernorm) { + reshape4 = VectorRef({is_reshape4, tuple3, var4}); + } else { + reshape4 = VectorRef({is_reshape4, add3, var4}); + } auto is_add4 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is_add4"); auto add4 = VectorRef({is_add4, reshape4, reshape3}); - if (!post_layernorm || layernorm_fusion) { - return add4; - } + // if (!post_layernorm || layernorm_fusion) { + // return add4; + // } return add4; // auto is_reshape4 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-decoder"); // MS_CHECK_TRUE_RET(is_reshape4 != nullptr, {}); @@ -270,8 +258,9 @@ std::unordered_map DecoderLayerFusion::DefinePatterns() MS_LOG(ERROR) << "initial member failed."; return patterns; } - patterns[kPatternDecoderLayer] = DefinePatternDecoderLayer(false, false, false); - std::cout << "patterns[kPatternDecoderLayer]" << patterns[kPatternDecoderLayer].ToString() << std::endl; + patterns[kPatternDecoderLayerPre] = DefinePatternDecoderLayer(false, false, false); + patterns[kPatternDecoderLayerPost] = DefinePatternDecoderLayer(true, false, false); + // std::cout << "patterns[kPatternDecoderLayer]" << patterns[kPatternDecoderLayer].ToString() << std::endl; return patterns; } @@ -281,8 +270,10 @@ AnfNodePtr 
DecoderLayerFusion::Process(const std::string &pattern_name, const mi return nullptr; } std::cout << "found pattern " << pattern_name << std::endl; - if (pattern_name == kPatternDecoderLayer) { + if (pattern_name == kPatternDecoderLayerPre) { return CreateMaskedDecoderLayerFusionNode(func_graph, equiv, node, false); + } else if (pattern_name == kPatternDecoderLayerPost) { + return CreateMaskedDecoderLayerFusionNode(func_graph, equiv, node, true); } return nullptr; } @@ -322,8 +313,8 @@ AnfNodePtr DecoderLayerFusion::Process(const std::string &pattern_name, const mi // return true; // } STATUS DecoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const EquivPtr &equiv, int *head_num, - int *head_size, float *eps1, float *eps2, float *eps3, bool *is_position_bias1, - bool *is_position_bias2) const { + int *head_size, float *eps1, float *eps2, float *eps3, bool + *is_position_bias1, bool *is_position_bias2) const { if ((*equiv)[is_attention_] == nullptr || !utils::isa((*equiv)[is_attention_])) { printf("is_attention_ is not AnfNodePtr"); return RET_ERROR; @@ -479,202 +470,150 @@ STATUS DecoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const Eq // return false; // } return RET_OK; -} -// STATUS EncoderLayerFusion::RemoveRedundantInput(const FuncGraphPtr &func_graph, -// const std::vector &redundant) const { -// for (auto &node : redundant) { -// func_graph->DropNode(node); -// } -// return RET_OK; -// } - - std::shared_ptr DecoderLayerFusion::BuildDecoderLayerFusionPrim(const EquivPtr &equiv) const { - MS_ASSERT(equiv != nullptr); - auto decoder_layer_prim = std::make_shared(); - if (decoder_layer_prim == nullptr) { - MS_LOG(ERROR) << "Build decoder_layer primitive failed."; - return decoder_layer_prim; - } - if (!utils::isa((*equiv)[reshape_k_])) { - MS_LOG(ERROR) << "Reshape k is not a parameter"; - return nullptr; - } - - if (!utils::isa((*equiv)[reshape_v_])) { - MS_LOG(ERROR) << "Reshape v is not a parameter"; - return nullptr; - } - - auto reshape_k = utils::cast((*equiv)[reshape_k_]); - std::vector shape_k; - // if (RET_OK != GetIntParameterData(reshape_k, &shape_k)) { - // MS_LOG(ERROR) << "Get reshape k data failed"; - // return nullptr; - // } + } - auto reshape_v = utils::cast((*equiv)[reshape_v_]); - std::vector shape_v; - // if (RET_OK != GetIntParameterData(reshape_v, &shape_v)) { - // MS_LOG(ERROR) << "Get reshape k data failed"; - // return nullptr; - // } - if (shape_k.size() < kWeightShapeSize || shape_v.size() < kWeightShapeSize || - shape_k.at(shape_k.size() - kWeightShapeSize) != shape_v.at(shape_v.size() - kWeightShapeSize)) { - MS_LOG(ERROR) << "Shape k or shape v is invalid."; - return nullptr; - } - return decoder_layer_prim; - } - CNodePtr DecoderLayerFusion::CreateOutputGetItem(const FuncGraphPtr &func_graph, const CNodePtr &node, - const int item_index) const { - MS_ASSERT(func_graph != nullptr); - MS_ASSERT(node != nullptr); - auto tuple_get_item_prim = std::make_shared(); - auto get_item_value = NewValueNode(MakeValue(item_index)); - if (tuple_get_item_prim == nullptr || get_item_value == nullptr) { - MS_LOG(ERROR) << "NewValueNode is nullptr"; +std::shared_ptr DecoderLayerFusion::CreatePrim(const FuncGraphPtr &func_graph, const EquivPtr &equiv, + bool post_layernorm, int64_t ffn_hidden_size) const { + auto decoder_layer_prim = std::make_shared(); + if (decoder_layer_prim == nullptr) { + MS_LOG(ERROR) << "Build decoder layer primitive failed."; return nullptr; } - auto tuple_get_item_prim_c = tuple_get_item_prim->GetPrim(); - 
MS_ASSERT(tuple_get_item_prim_c != nullptr);
-  CNodePtr get_item_cnode = func_graph->NewCNode(tuple_get_item_prim_c, {node, get_item_value});
-  MS_CHECK_TRUE_RET(get_item_cnode != nullptr, nullptr);
-  auto abstract = lite::CreateTensorAbstract({}, kNumberTypeFloat32);
-  if (abstract == nullptr) {
-    MS_LOG(ERROR) << "Create tensor abstract failed";
+  int head_num = 0;
+  int head_size = 0;
+  float eps1 = 1e-6;
+  float eps2 = 1e-6;
+  float eps3 = 1e-6;
+  bool is_position_bias1 = false;
+  bool is_position_bias2 = false;
+  if (CheckPattern(func_graph, equiv, &head_num, &head_size, &eps1, &eps2, &eps3, &is_position_bias1,
+                   &is_position_bias2)) {
     return nullptr;
   }
-  get_item_cnode->set_abstract(abstract);
-  get_item_cnode->set_fullname_with_scope(node->fullname_with_scope() + "_output_getitem_" +
-                                          std::to_string(item_index));
-  return get_item_cnode;
+  // add eps3
+  decoder_layer_prim->Init(head_num, head_size, eps1, eps2, eps3, ffn_hidden_size, is_position_bias1, is_position_bias2,
+                           post_layernorm);
+  return decoder_layer_prim;
 }
-  std::shared_ptr<ops::DecoderLayer> DecoderLayerFusion::CreatePrim(
-    const FuncGraphPtr &func_graph, const EquivPtr &equiv, bool post_layernorm, int64_t ffn_hidden_size) const {
-  auto decoder_layer_prim = std::make_shared<ops::DecoderLayer>();
-  if (decoder_layer_prim == nullptr) {
-    MS_LOG(ERROR) << "Build decoder layer primitive failed.";
-    return nullptr;
-  }
-  int head_num = 0;
-  int head_size = 0;
-  float eps1 = 1e-6;
-  float eps2 = 1e-6;
-  // float eps3 = 1e-6;
-  bool is_position_bias1 = false;
-  bool is_position_bias2 = false;
-  // if (CheckPattern(func_graph, equiv, &head_num, &head_size, &eps1, &eps2, &eps3, &is_position_bias1,
-  //                  &is_position_bias2)) {
-  //   return nullptr;
-  // }
-  //add eps3
-  decoder_layer_prim->Init(head_num, head_size, eps1, eps2, ffn_hidden_size, is_position_bias1,
-                           is_position_bias2, post_layernorm);
-  return decoder_layer_prim;
-  }
-  CNodePtr DecoderLayerFusion::CreateMaskedDecoderLayerFusionNode(
-    const FuncGraphPtr &func_graph, const EquivPtr &equiv, const AnfNodePtr &node, bool post_layernorm = true) const {
-  std::cout << "CreateMaskedDecoderLayerFusionNode" << std::endl;
-  MS_ASSERT(func_graph != nullptr);
-  MS_ASSERT(equiv != nullptr);
-  MS_ASSERT(node != nullptr);
-  // bool is_position_bias = false;
-  auto input = utils::cast<AnfNodePtr>((*equiv)[hidden_stats_]);
-  std::cout << "input" << std::endl;
-  auto encoder_output = utils::cast<AnfNodePtr>((*equiv)[encoder_output_]);
-  std::cout << "encoder_output" << std::endl;
-  AnfNodePtr position_bias, input_mask, bias_attn_o, bias_attn_qkv, beta1, beta2, bias_m, bias_p, beta3;
-  auto weight_qkv = utils::cast<AnfNodePtr>((*equiv)[weight_attn_qkv_]);
-  std::cout << "CreateMaskedDecoderLayerFusionNode" << std::endl;
-  auto weight_attn_o = utils::cast<AnfNodePtr>((*equiv)[weight_attn_o_]);
-  std::cout << "weight_attn_o" << std::endl;
-  auto weight_attn_q = utils::cast<AnfNodePtr>((*equiv)[weight_attn_q_]);
-  auto weight_attn_kv = utils::cast<AnfNodePtr>((*equiv)[weight_attn_kv_]);
-  auto weight_attn_cross_o = utils::cast<AnfNodePtr>((*equiv)[weight_attn_cross_o_]);
-  std::cout << "CreateMaskedDecoderLayerFusionNode" << std::endl;
-  auto weight_m = utils::cast<AnfNodePtr>((*equiv)[weight_m_]);
-  std::cout << "weight_m" << std::endl;
-  auto weight_p = utils::cast<AnfNodePtr>((*equiv)[weight_p_]);
-  std::cout << "weight_p" << std::endl;
-  auto bias_attn_cross_qkv = utils::cast<AnfNodePtr>((*equiv)[bias_attn_cross_qkv_]);
-  auto bias_attn_cross_o = utils::cast<AnfNodePtr>((*equiv)[bias_attn_cross_o_]);
-  bias_m = utils::cast<AnfNodePtr>((*equiv)[bias_m_]);
-  bias_p = utils::cast<AnfNodePtr>((*equiv)[bias_p_]);
-  beta1 = utils::cast<AnfNodePtr>((*equiv)[beta1_]);
-  std::cout << "beta1" << std::endl;
-  beta2 = utils::cast<AnfNodePtr>((*equiv)[beta2_]);
-  std::cout << "beta2" << std::endl;
-  beta3 = utils::cast<AnfNodePtr>((*equiv)[beta3_]);
-  std::cout << "beta3" << std::endl;
-  auto gamma1 = utils::cast<AnfNodePtr>((*equiv)[gamma1_]);
-  std::cout << "gamma1" << std::endl;
-  auto gamma2 = utils::cast<AnfNodePtr>((*equiv)[gamma2_]);
-  std::cout << "gamma2" << std::endl;
-  auto gamma3 = utils::cast<AnfNodePtr>((*equiv)[gamma3_]);
-  std::cout << "gamma3" << std::endl;
+CNodePtr DecoderLayerFusion::CreateMaskedDecoderLayerFusionNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv,
+                                                                const AnfNodePtr &node,
+                                                                bool post_layernorm = true) const {
+  std::cout << "CreateMaskedDecoderLayerFusionNode" << std::endl;
+  MS_ASSERT(func_graph != nullptr);
+  MS_ASSERT(equiv != nullptr);
+  MS_ASSERT(node != nullptr);
+  // bool is_position_bias = false;
+  auto input = utils::cast<AnfNodePtr>((*equiv)[hidden_stats_]);
+  MS_ASSERT(input != nullptr);
+  std::cout << "input" << std::endl;
+  auto encoder_output = utils::cast<AnfNodePtr>((*equiv)[encoder_output_]);
+  MS_ASSERT(encoder_output != nullptr);
+  std::cout << "encoder_output" << std::endl;
+  AnfNodePtr position_bias, input_mask, bias_attn_o, bias_attn_qkv, beta1, beta2, bias_m, bias_p, beta3;
+  auto weight_qkv = utils::cast<AnfNodePtr>((*equiv)[weight_attn_qkv_]);
+  MS_ASSERT(weight_qkv != nullptr);
+  bias_attn_qkv = utils::cast<AnfNodePtr>((*equiv)[bias_attn_qkv_]);
+  bias_attn_o = utils::cast<AnfNodePtr>((*equiv)[bias_attn_o_]);
+  MS_ASSERT(weight_qkv != nullptr);
+  std::cout << "CreateMaskedDecoderLayerFusionNode" << std::endl;
+  auto weight_attn_o = utils::cast<AnfNodePtr>((*equiv)[weight_attn_o_]);
+  MS_ASSERT(weight_attn_o != nullptr);
+  std::cout << "weight_attn_o" << std::endl;
+  auto weight_attn_q = utils::cast<AnfNodePtr>((*equiv)[weight_attn_q_]);
+  MS_ASSERT(weight_attn_q != nullptr);
+  auto weight_attn_kv = utils::cast<AnfNodePtr>((*equiv)[weight_attn_kv_]);
+  MS_ASSERT(weight_attn_kv != nullptr);
+  auto weight_attn_cross_o = utils::cast<AnfNodePtr>((*equiv)[weight_attn_cross_o_]);
+  MS_ASSERT(weight_attn_cross_o != nullptr);
+  std::cout << "CreateMaskedDecoderLayerFusionNode" << std::endl;
+  auto weight_m = utils::cast<AnfNodePtr>((*equiv)[weight_m_]);
+  MS_ASSERT(weight_m != nullptr);
+  std::cout << "weight_m" << std::endl;
+  auto weight_p = utils::cast<AnfNodePtr>((*equiv)[weight_p_]);
+  MS_ASSERT(weight_p != nullptr);
+  std::cout << "weight_p" << std::endl;
+  auto bias_attn_cross_qkv = utils::cast<AnfNodePtr>((*equiv)[bias_attn_cross_qkv_]);
+  MS_ASSERT(bias_attn_cross_qkv != nullptr);
+  auto bias_attn_cross_o = utils::cast<AnfNodePtr>((*equiv)[bias_attn_cross_o_]);
+  MS_ASSERT(bias_attn_cross_o != nullptr);
+  bias_m = utils::cast<AnfNodePtr>((*equiv)[bias_m_]);
+  MS_ASSERT(bias_m != nullptr);
+  bias_p = utils::cast<AnfNodePtr>((*equiv)[bias_p_]);
+  MS_ASSERT(bias_p != nullptr);
+  beta1 = utils::cast<AnfNodePtr>((*equiv)[beta1_]);
+  MS_ASSERT(beta1 != nullptr);
+  std::cout << "beta1" << std::endl;
+  beta2 = utils::cast<AnfNodePtr>((*equiv)[beta2_]);
+  MS_ASSERT(beta2 != nullptr);
+  std::cout << "beta2" << std::endl;
+  beta3 = utils::cast<AnfNodePtr>((*equiv)[beta3_]);
+  MS_ASSERT(beta3 != nullptr);
+  std::cout << "beta3" << std::endl;
+  auto gamma1 = utils::cast<AnfNodePtr>((*equiv)[gamma1_]);
+  MS_ASSERT(gamma1 != nullptr);
+  std::cout << "gamma1" << std::endl;
+  auto gamma2 = utils::cast<AnfNodePtr>((*equiv)[gamma2_]);
+  MS_ASSERT(gamma2 != nullptr);
+  std::cout << "gamma2" << std::endl;
+  auto gamma3 = utils::cast<AnfNodePtr>((*equiv)[gamma3_]);
+  MS_ASSERT(gamma3 != nullptr);
+  std::cout << "gamma3" << std::endl;
+
+  input_mask = utils::cast<AnfNodePtr>((*equiv)[mask_]);
+  MS_ASSERT(input_mask != nullptr);
+  std::cout << "input_mask" << std::endl;
+  auto cross_mask = utils::cast<AnfNodePtr>((*equiv)[cross_mask_]);
+  MS_ASSERT(cross_mask !=
nullptr); + std::cout << "input_mask" << std::endl; + auto base_shape_ptr = weight_m->Shape(); + MS_EXCEPTION_IF_NULL(base_shape_ptr); + auto input_shape_ptr = base_shape_ptr->cast(); + MS_EXCEPTION_IF_NULL(input_shape_ptr); + auto input_shape = input_shape_ptr->shape(); + MS_ASSERT(input_shape != nullptr); + int ffn_hidden_size = (int64_t)input_shape[1]; + std::cout << ffn_hidden_size << std::endl; + auto decoder_layer_prim = CreatePrim(func_graph, equiv, post_layernorm, ffn_hidden_size); + MS_CHECK_TRUE_RET(decoder_layer_prim != nullptr, nullptr); + auto decoder_layer_prim_c = decoder_layer_prim->GetPrim(); + MS_CHECK_TRUE_RET(decoder_layer_prim_c != nullptr, nullptr); + auto value_node = NewValueNode(decoder_layer_prim_c); + MS_CHECK_TRUE_RET(value_node != nullptr, nullptr); + std::cout << "value_node" << std::endl; + std::vector new_node_inputs = {value_node, + input, + gamma1, + beta1, + weight_qkv, + bias_attn_qkv, + input_mask, + weight_attn_o, + bias_attn_o, + gamma2, + beta2, + encoder_output, + weight_attn_q, + weight_attn_kv, + bias_attn_cross_qkv, + cross_mask, + weight_attn_cross_o, + bias_attn_cross_o, + gamma3, + beta3, + weight_m, + bias_m, + weight_p, + bias_p}; - input_mask = utils::cast((*equiv)[mask_]); - std::cout << "input_mask" << std::endl; - auto cross_mask = utils::cast((*equiv)[cross_mask_]); - std::cout << "input_mask" << std::endl; - auto base_shape_ptr = weight_m->Shape(); - MS_EXCEPTION_IF_NULL(base_shape_ptr); - auto input_shape_ptr = base_shape_ptr->cast(); - MS_EXCEPTION_IF_NULL(input_shape_ptr); - auto input_shape = input_shape_ptr->shape(); - MS_ASSERT(input_shape != nullptr); - int ffn_hidden_size = (int64_t)input_shape[1]; // TODO - auto decoder_layer_prim = CreatePrim(func_graph, equiv, post_layernorm, ffn_hidden_size); - MS_CHECK_TRUE_RET(decoder_layer_prim != nullptr, nullptr); - auto decoder_layer_prim_c = decoder_layer_prim->GetPrim(); - MS_CHECK_TRUE_RET(decoder_layer_prim_c != nullptr, nullptr); - auto value_node = NewValueNode(decoder_layer_prim_c); - MS_CHECK_TRUE_RET(value_node != nullptr, nullptr); - std::cout << "value_node" << std::endl; - std::vector new_node_inputs; - if (post_layernorm) { - new_node_inputs = {value_node, input, weight_qkv, bias_attn_qkv, input_mask, - weight_attn_o, bias_attn_o, gamma1, beta1, weight_m, - bias_m, weight_p, bias_p, gamma2, beta2}; - } else { - new_node_inputs = {value_node, - input, - gamma1, - beta1, - weight_qkv, - bias_attn_qkv, - input_mask , - weight_attn_o, - bias_attn_o, - gamma2, - beta2, - encoder_output, - weight_attn_q, - weight_attn_kv, - bias_attn_cross_qkv, - cross_mask, - weight_attn_cross_o, - bias_attn_cross_o, - gamma3, - beta3, - weight_m, - bias_m, - weight_p, - bias_p}; - } - std::cout << "new_node_inputs" << std::endl; - auto new_node = func_graph->NewCNode(new_node_inputs); - std::cout << "new_node" << std::endl; - MS_CHECK_TRUE_RET(new_node != nullptr, nullptr); - auto old_node = node->cast(); - std::cout << "old_node" << std::endl; - MS_CHECK_TRUE_RET(old_node->abstract() != nullptr, nullptr); - new_node->set_abstract(old_node->abstract()->Clone()); - std::cout << "new_node" << std::endl; - new_node->set_fullname_with_scope(node->fullname_with_scope() + "/decoder_layer"); - std::cout << new_node->ToString() << std::endl; - std::cout << "new_node" << std::endl; - return new_node; - } + auto new_node = func_graph->NewCNode(new_node_inputs); + MS_CHECK_TRUE_RET(new_node != nullptr, nullptr); + auto old_node = node->cast(); + MS_CHECK_TRUE_RET(old_node->abstract() != nullptr, 
nullptr); + new_node->set_abstract(old_node->abstract()->Clone()); + new_node->set_fullname_with_scope(node->fullname_with_scope() + "/decoder_layer"); + std::cout << new_node->ToString() << std::endl; + + return new_node; +} } // namespace mindspore::opt \ No newline at end of file diff --git a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h index c7576df919d..a09b7b7823f 100644 --- a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h +++ b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h @@ -43,9 +43,6 @@ class DecoderLayerFusion : public MultiplePatternProcessPass { protected: virtual bool Init() const; - // create multi-head-attention without mask - virtual std::shared_ptr BuildDecoderLayerFusionPrim(const EquivPtr &equiv) const; - private: VectorRef DefinePatternDecoderLayer(bool post_layernorm,bool layernorm_fusion, bool is_position_bias) const; VectorRef getTuple(bool post_layernorm, bool layernorm_fusion, bool is_position_bias) const; @@ -53,12 +50,12 @@ protected: CNodePtr CreateMaskedDecoderLayerFusionNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv,const AnfNodePtr &node, bool post_layernorm ) const; std::shared_ptr CreatePrim(const FuncGraphPtr &func_graph, const EquivPtr &equiv, bool post_layernorm, int64_t ffn_hidden_size) const; - CNodePtr CreateOutputGetItem(const FuncGraphPtr &func_graph, const CNodePtr &node, const int item_index) const; lite::STATUS CheckPattern(const FuncGraphPtr &func_graph, const EquivPtr &equiv, int *head_num, int *head_size, float *eps1, float *eps2, float *eps3, bool* is_position_bias1, bool* is_position_bias2) const; protected: - const std::string kPatternDecoderLayer = "PatternDecoderLayer"; + const std::string kPatternDecoderLayerPre = "PatternDecoderLayerPre"; + const std::string kPatternDecoderLayerPost = "PatternDecoderLayerPost"; mutable VarPtr hidden_stats_{nullptr}; mutable VarPtr encoder_output_{nullptr}; mutable VarPtr position_bias_{nullptr}; diff --git a/third_party/patch/fast_transformer/001-fast_transformer.patch b/third_party/patch/fast_transformer/001-fast_transformer.patch index 2080c0ce7a7..020fcefb4e3 100644 --- a/third_party/patch/fast_transformer/001-fast_transformer.patch +++ b/third_party/patch/fast_transformer/001-fast_transformer.patch @@ -132,7 +132,7 @@ index 8707220..c9369e0 100644 target_link_libraries(trt_fused_multi_head_attention PUBLIC -lcublas -lcudart) set_property(TARGET trt_fused_multi_head_attention PROPERTY POSITION_INDEPENDENT_CODE ON) diff --git a/CMakeLists.txt b/CMakeLists.txt -index ea21014..f9e08b8 100644 +index ea21014..be872d9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,7 +14,9 @@ @@ -195,15 +195,16 @@ index ea21014..f9e08b8 100644 ######################################## if(BUILD_MULTI_GPU) -@@ -249,6 +256,7 @@ add_library(transformer-static STATIC +@@ -249,6 +256,8 @@ add_library(transformer-static STATIC $ $ $ + $ ++ $ $ $ $ -@@ -313,8 +321,9 @@ add_library(transformer-static STATIC +@@ -313,8 +322,9 @@ add_library(transformer-static STATIC set_property(TARGET transformer-static PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET transformer-static PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(transformer-static PUBLIC -lcudart -lnccl -lmpi -lcublas -lcublasLt -lcurand) @@ -214,7 +215,7 @@ index ea21014..f9e08b8 100644 $ $ $ -@@ -324,29 +333,10 @@ add_library(transformer-shared SHARED +@@ -324,29 +334,11 @@ add_library(transformer-shared SHARED $ $ $ @@ 
-222,6 +223,7 @@ index ea21014..f9e08b8 100644 - $ - $ + $ ++ $ $ - $ $ @@ -245,7 +247,7 @@ index ea21014..f9e08b8 100644 $ $ $ -@@ -373,9 +363,7 @@ add_library(transformer-shared SHARED +@@ -373,9 +365,7 @@ add_library(transformer-shared SHARED $ $ $ @@ -255,7 +257,7 @@ index ea21014..f9e08b8 100644 $ $ $ -@@ -387,14 +375,22 @@ add_library(transformer-shared SHARED +@@ -387,14 +377,22 @@ add_library(transformer-shared SHARED $ $ $) @@ -280,7 +282,7 @@ index ea21014..f9e08b8 100644 include(CMakePackageConfigHelpers) configure_package_config_file( ${CMAKE_CURRENT_LIST_DIR}/cmake/FasterTransformerConfig.cmake.in -@@ -402,52 +398,23 @@ configure_package_config_file( +@@ -402,52 +400,23 @@ configure_package_config_file( INSTALL_DESTINATION ${INSTALL_CONFIGDIR} ) @@ -422,7 +424,7 @@ index cacb09e..5fec0c9 100644 else if (std::is_same::value) { diff --git a/examples/cpp/ms/CMakeLists.txt b/examples/cpp/ms/CMakeLists.txt new file mode 100644 -index 0000000..eb47b5c +index 0000000..52f9a5e --- /dev/null +++ b/examples/cpp/ms/CMakeLists.txt @@ -0,0 +1,22 @@ @@ -443,21 +445,23 @@ index 0000000..eb47b5c +add_executable(ms_benchmark ms.cc) +if (SPARSITY_SUPPORT) +# target_link_libraries(ms_benchmark PUBLIC -lcublas -lcublasLt -lcudart -lcusparse -lcusparseLt transformer-shared) -+target_link_libraries(ms_benchmark PUBLIC -lcublas -lcublasLt -lcudart -lcusparse -lcusparseLt GptContextAttentionLayer EncoderLayer) ++target_link_libraries(ms_benchmark PUBLIC -lcublas -lcublasLt -lcudart -lcusparse -lcusparseLt GptContextAttentionLayer EncoderLayer DecoderLayer) +else() +# target_link_libraries(ms_benchmark PUBLIC -lcublas -lcublasLt -lcudart transformer-shared) -+target_link_libraries(ms_benchmark PUBLIC -lcublas -lcublasLt -lcudart GptContextAttentionLayer EncoderLayer) ++target_link_libraries(ms_benchmark PUBLIC -lcublas -lcublasLt -lcudart GptContextAttentionLayer EncoderLayer DecoderLayer) +endif() diff --git a/examples/cpp/ms/initialize.h b/examples/cpp/ms/initialize.h new file mode 100644 -index 0000000..9bcf4eb +index 0000000..db057ad --- /dev/null +++ b/examples/cpp/ms/initialize.h -@@ -0,0 +1,643 @@ +@@ -0,0 +1,783 @@ +#pragma once + +#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h" +#include "src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h" ++#include "src/fastertransformer/layers/decoder_layers/DecoderLayerWeight.h" ++#include "src/fastertransformer/layers/decoder_layers/MSDecoderLayer.h" +#include "src/fastertransformer/layers/encoder_layers/EncoderLayerWeight.h" +#include "src/fastertransformer/layers/encoder_layers/MSEncoderLayer.h" +using namespace fastertransformer; @@ -471,7 +475,11 @@ index 0000000..9bcf4eb + size_t size_per_head; + float eps1; + float eps2; ++ float eps3; ++ bool position_bias1; ++ bool position_bias2; + bool post_layernorm_residual; ++ bool is_ffn_fp16; + bool is_remove_padding; + std::string model_name; + std::string compute_type; @@ -499,14 +507,24 @@ index 0000000..9bcf4eb + BaseEncoderLayer* Encoder; + // +}; -+ ++template ++struct DecriptorDecoderLayer { ++ std::vector input_tensors; // GPU ++ std::vector input_python_tensors; // CPU ++ std::vector output_tensors; // GPU ++ std::vector output_python_tensors; // CPU ++ std::vector w_tensors; ++ BaseDecoderLayer* Decoder; ++ // ++}; +typedef enum { -+ MHA_X1 = 1, // AttnIn + AttnMask -+ MHA_X2, // AttnIn + EncOut -- same seq size + AttnMask -+ MHA_CROSS, // AttnIn + EncOut + AttnMAsk -+ MHA_T5, // AttnIn + EncOut + AttnMAsk + position_bias -+ MHA_T5_CROSS, // AttnIn 
+ EncOut + AttnMAsk + position_bias -+ TEL, // transformer encoder layer ++ MHA_X1 = 1, // AttnIn + AttnMask ++ MHA_X2, // AttnIn + EncOut -- same seq size + AttnMask ++ MHA_CROSS, // AttnIn + EncOut + AttnMAsk ++ MHA_T5, // AttnIn + EncOut + AttnMAsk + position_bias ++ MHA_T5_CROSS, // AttnIn + EncOut + AttnMAsk + position_bias ++ TEL, // transformer encoder layer ++ TDL, +} MODEL_TEST_ID_E; + +int ModelNum(std::string model_name) @@ -522,11 +540,17 @@ index 0000000..9bcf4eb + } + else if (model_name == "mha_T5") { + return MHA_T5; -+ } else if (model_name == "mha_T5_cross") { ++ } ++ else if (model_name == "mha_T5_cross") { + return MHA_T5_CROSS; -+ } else if (model_name == "transformer_encoder_layer") { ++ } ++ else if (model_name == "transformer_encoder_layer") { + return TEL; -+ } else { ++ } ++ else if (model_name == "transformer_decoder_layer" ||model_name == "transformer_decoder_layer_t5") { ++ return TDL; ++ } ++ else { + return -1; + } +} @@ -542,37 +566,29 @@ index 0000000..9bcf4eb + + // TODO Nizzan - check if need to be + desc.Attn = new MSMHALayer(opt_a->batch_size, -+ opt_a->seq_len, -+ opt_a->tgt_seq_len, -+ opt_a->head_num, -+ opt_a->size_per_head, -+ stream, -+ cublas_wrapper, -+ allocator, -+ false, // free buffer after fwd -+ true, // is_qk_buf_float_ -+ false, //is_cross -+ false, // sparse -+ false); // is_position_bias ++ opt_a->seq_len, ++ opt_a->tgt_seq_len, ++ opt_a->head_num, ++ opt_a->size_per_head, ++ stream, ++ cublas_wrapper, ++ allocator, ++ false, // free buffer after fwd ++ true, // is_qk_buf_float_ ++ false, // is_cross ++ false, // sparse ++ false); // is_position_bias + -+ desc.input_tensors.push_back(Tensor{MEMORY_GPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size * opt_a->seq_len,hidden_units}, -+ 0}); -+ desc.input_tensors.push_back(Tensor{MEMORY_GPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->seq_len}, -+ 0}); -+ -+ desc.input_python_tensors.push_back(Tensor{MEMORY_CPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size * opt_a->seq_len,hidden_units}, -+ 0}); -+ -+ desc.input_python_tensors.push_back(Tensor{MEMORY_CPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->seq_len}, -+ 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->seq_len, hidden_units}, 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->seq_len}, 0}); ++ ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->seq_len, hidden_units}, 0}); ++ ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->seq_len}, 0}); + // desc.input_python_tensors.push_back(Tensor{MEMORY_CPU, + // getTensorType(), + // std::vector{opt_a->batch_size * opt_a->seq_len,hidden_units}, @@ -625,23 +641,21 @@ index 0000000..9bcf4eb + const size_t hidden_units = opt_a->head_num * opt_a->size_per_head; + + desc.Attn = new MSMHALayer(opt_a->batch_size, -+ opt_a->seq_len, -+ opt_a->tgt_seq_len, -+ opt_a->head_num, -+ opt_a->size_per_head, -+ stream, -+ cublas_wrapper, -+ allocator, -+ false, // free buffer after fwd -+ true, // is_qk_buf_float_ -+ false, //is_cross -+ false, // sparse -+ false); // is_position_bias ++ opt_a->seq_len, ++ opt_a->tgt_seq_len, ++ opt_a->head_num, ++ opt_a->size_per_head, ++ stream, ++ cublas_wrapper, ++ allocator, ++ false, // 
free buffer after fwd ++ true, // is_qk_buf_float_ ++ false, // is_cross ++ false, // sparse ++ false); // is_position_bias + -+ desc.input_tensors.push_back(Tensor{MEMORY_GPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size * opt_a->seq_len, hidden_units}, -+ 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->seq_len, hidden_units}, 0}); + + // GPU RESULTS + desc.output_tensors.push_back(Tensor{ @@ -690,55 +704,53 @@ index 0000000..9bcf4eb + const size_t hidden_units = opt_a->head_num * opt_a->size_per_head; + + desc.Attn = new MSMHALayer(opt_a->batch_size, -+ opt_a->seq_len, -+ opt_a->tgt_seq_len, -+ opt_a->head_num, -+ opt_a->size_per_head, -+ stream, -+ cublas_wrapper, -+ allocator, -+ false, // free buffer after fwd -+ true, // is_qk_buf_float_ -+ true, //is_cross -+ false, // sparse -+ false); // is_position_bias ++ opt_a->seq_len, ++ opt_a->tgt_seq_len, ++ opt_a->head_num, ++ opt_a->size_per_head, ++ stream, ++ cublas_wrapper, ++ allocator, ++ false, // free buffer after fwd ++ true, // is_qk_buf_float_ ++ true, // is_cross ++ false, // sparse ++ false); // is_position_bias + -+ desc.input_tensors.push_back(Tensor{MEMORY_GPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size*opt_a->seq_len, hidden_units}, -+ 0}); + desc.input_tensors.push_back(Tensor{ -+ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size* opt_a->tgt_seq_len, hidden_units}, 0}); ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->seq_len, hidden_units}, 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->tgt_seq_len, hidden_units}, 0}); + + desc.input_tensors.push_back(Tensor{ + MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->tgt_seq_len}, 0}); -+ desc.input_python_tensors.push_back(Tensor{MEMORY_CPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size*opt_a->seq_len, hidden_units}, -+ 0}); + desc.input_python_tensors.push_back(Tensor{ -+ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size* opt_a->tgt_seq_len, hidden_units}, 0}); ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->seq_len, hidden_units}, 0}); ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->tgt_seq_len, hidden_units}, 0}); + + desc.input_python_tensors.push_back(Tensor{ + MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->tgt_seq_len}, 0}); + -+ + // GPU RESULTS + + desc.output_tensors.push_back(Tensor{ + MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, hidden_units}, 0}); + // desc.output_tensors.push_back(Tensor{ -+ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + // desc.output_tensors.push_back(Tensor{ -+ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + + desc.output_python_tensors.push_back(Tensor{ + MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, hidden_units}, 0}); + // desc.output_python_tensors.push_back(Tensor{ -+ // MEMORY_CPU, 
getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + // desc.output_python_tensors.push_back(Tensor{ -+ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); -+ ++ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + + desc.w_tensors.push_back( + Tensor{MEMORY_GPU, getTensorType(), std::vector{hidden_units, hidden_units}, 0}); @@ -759,68 +771,67 @@ index 0000000..9bcf4eb + const size_t hidden_units = opt_a->head_num * opt_a->size_per_head; + + desc.Attn = new MSMHALayer(opt_a->batch_size, -+ opt_a->seq_len, -+ opt_a->tgt_seq_len, -+ opt_a->head_num, -+ opt_a->size_per_head, -+ stream, -+ cublas_wrapper, -+ allocator, -+ false, // free buffer after fwd -+ true, // is_qk_buf_float_ -+ false, //is_cross -+ false, // sparse -+ true); // is_position_bias ++ opt_a->seq_len, ++ opt_a->tgt_seq_len, ++ opt_a->head_num, ++ opt_a->size_per_head, ++ stream, ++ cublas_wrapper, ++ allocator, ++ false, // free buffer after fwd ++ true, // is_qk_buf_float_ ++ false, // is_cross ++ false, // sparse ++ true); // is_position_bias + -+ desc.input_tensors.push_back(Tensor{MEMORY_GPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size * opt_a->seq_len,hidden_units}, -+ 0}); -+ desc.input_tensors.push_back(Tensor{MEMORY_GPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->seq_len}, -+ 0}); -+ -+ desc.input_tensors.push_back(Tensor{MEMORY_GPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, -+ 0}); -+ -+ desc.input_python_tensors.push_back(Tensor{MEMORY_CPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size * opt_a->seq_len,hidden_units}, -+ 0}); -+ -+ desc.input_python_tensors.push_back(Tensor{MEMORY_CPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->seq_len}, -+ 0}); -+ -+ desc.input_python_tensors.push_back(Tensor{MEMORY_CPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, -+ 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->seq_len, hidden_units}, 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->seq_len}, 0}); ++ ++ desc.input_tensors.push_back( ++ Tensor{MEMORY_GPU, ++ getTensorType(), ++ std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, ++ 0}); ++ ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->seq_len, hidden_units}, 0}); + ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->seq_len}, 0}); ++ ++ desc.input_python_tensors.push_back( ++ Tensor{MEMORY_CPU, ++ getTensorType(), ++ std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, ++ 0}); + + // GPU RESULTS + + desc.output_tensors.push_back(Tensor{ + MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, hidden_units}, 0}); + // desc.output_tensors.push_back(Tensor{ -+ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, 
opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + // desc.output_tensors.push_back(Tensor{ -+ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + // desc.output_tensors.push_back(Tensor{ -+ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len},0}); ++ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, ++ // opt_a->tgt_seq_len},0}); + + desc.output_python_tensors.push_back(Tensor{ + MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, hidden_units}, 0}); + // desc.output_python_tensors.push_back(Tensor{ -+ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + // desc.output_python_tensors.push_back(Tensor{ -+ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + // desc.output_python_tensors.push_back(Tensor{ -+ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, 0}); ++ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, ++ // opt_a->tgt_seq_len}, 0}); + + desc.w_tensors.push_back( + Tensor{MEMORY_GPU, getTensorType(), std::vector{hidden_units, 3 * hidden_units}, 0}); @@ -828,88 +839,89 @@ index 0000000..9bcf4eb + Tensor{MEMORY_GPU, getTensorType(), std::vector{hidden_units, hidden_units}, 0}); +} + -+template ++template +void InitializeAttnT5Cross(opt_arg* opt_a, -+ DecriptorTest &desc, -+ cudaStream_t stream, -+ cublasMMWrapper* cublas_wrapper, -+ Allocator* allocator) { ++ DecriptorTest& desc, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ Allocator* allocator) ++{ + const size_t hidden_units = opt_a->head_num * opt_a->size_per_head; + + desc.Attn = new MSMHALayer(opt_a->batch_size, -+ opt_a->seq_len, -+ opt_a->tgt_seq_len, -+ opt_a->head_num, -+ opt_a->size_per_head, -+ stream, -+ cublas_wrapper, -+ allocator, -+ false, // free buffer after fwd -+ true, // is_qk_buf_float_ -+ true, //is_cross -+ false, // sparse -+ true); // is_position_bias ++ opt_a->seq_len, ++ opt_a->tgt_seq_len, ++ opt_a->head_num, ++ opt_a->size_per_head, ++ stream, ++ cublas_wrapper, ++ allocator, ++ false, // free buffer after fwd ++ true, // is_qk_buf_float_ ++ true, // is_cross ++ false, // sparse ++ true); // is_position_bias + -+ desc.input_tensors.push_back(Tensor{MEMORY_GPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size * opt_a->seq_len,hidden_units}, -+ 0}); -+ -+ desc.input_tensors.push_back(Tensor{MEMORY_GPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size * opt_a->tgt_seq_len, hidden_units}, -+ 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->seq_len, hidden_units}, 0}); ++ ++ 
desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->tgt_seq_len, hidden_units}, 0}); + + desc.input_tensors.push_back(Tensor{MEMORY_GPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->tgt_seq_len}, -+ 0}); -+ -+ desc.input_tensors.push_back(Tensor{MEMORY_GPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, -+ 0}); -+ -+ desc.input_python_tensors.push_back(Tensor{MEMORY_CPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size * opt_a->seq_len,hidden_units}, -+ 0}); -+ -+ desc.input_python_tensors.push_back(Tensor{MEMORY_CPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size * opt_a->tgt_seq_len, hidden_units}, -+ 0}); -+ -+ desc.input_python_tensors.push_back(Tensor{MEMORY_CPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->tgt_seq_len}, -+ 0}); -+ -+ desc.input_python_tensors.push_back(Tensor{MEMORY_CPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, -+ 0}); ++ getTensorType(), ++ std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->tgt_seq_len}, ++ 0}); ++ ++ desc.input_tensors.push_back( ++ Tensor{MEMORY_GPU, ++ getTensorType(), ++ std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, ++ 0}); ++ ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->seq_len, hidden_units}, 0}); ++ ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->tgt_seq_len, hidden_units}, 0}); ++ ++ desc.input_python_tensors.push_back( ++ Tensor{MEMORY_CPU, ++ getTensorType(), ++ std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->tgt_seq_len}, ++ 0}); + ++ desc.input_python_tensors.push_back( ++ Tensor{MEMORY_CPU, ++ getTensorType(), ++ std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, ++ 0}); + + // GPU RESULTS + + desc.output_tensors.push_back(Tensor{ + MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, hidden_units}, 0}); + // desc.output_tensors.push_back(Tensor{ -+ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + // desc.output_tensors.push_back(Tensor{ -+ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + // desc.output_tensors.push_back(Tensor{ -+ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len},0}); ++ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, ++ // opt_a->tgt_seq_len},0}); + + desc.output_python_tensors.push_back(Tensor{ + MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, hidden_units}, 0}); + // desc.output_python_tensors.push_back(Tensor{ -+ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, 
++ // opt_a->size_per_head}, 0}); + // desc.output_python_tensors.push_back(Tensor{ -+ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + // desc.output_python_tensors.push_back(Tensor{ -+ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, 0}); ++ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, ++ // opt_a->tgt_seq_len}, 0}); + + desc.w_tensors.push_back( + Tensor{MEMORY_GPU, getTensorType(), std::vector{hidden_units, hidden_units}, 0}); @@ -939,6 +951,7 @@ index 0000000..9bcf4eb + opt_a->eps1, + opt_a->eps2, + opt_a->post_layernorm_residual, ++ opt_a->is_ffn_fp16, + stream, + cublas_wrapper, + cublas_handle, @@ -982,6 +995,100 @@ index 0000000..9bcf4eb +} + +template ++void InitializeDecoder(opt_arg* opt_a, ++ DecriptorDecoderLayer& desc, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ cublasHandle_t* cublas_handle, ++ Allocator* allocator) ++{ ++ const size_t hidden_units = opt_a->head_num * opt_a->size_per_head; ++ std::cout<<"hidden_units: "< ++ desc.Decoder = new MSDLayer(opt_a->batch_size, ++ opt_a->seq_len, ++ opt_a->tgt_seq_len, ++ opt_a->head_num, ++ opt_a->size_per_head, ++ opt_a->ffn_hidden_size, ++ opt_a->eps1, ++ opt_a->eps2, ++ opt_a->eps3, ++ opt_a->post_layernorm_residual, ++ opt_a->position_bias1, ++ opt_a->position_bias2, ++ stream, ++ cublas_wrapper, ++ cublas_handle, ++ allocator, ++ false, // free buffer after fwd ++ true, // is_qk_buf_float_ ++ false); // sparse ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU,getTensorType(),std::vector{opt_a->batch_size, opt_a->tgt_seq_len, opt_a->hidden_size},0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->seq_len}, 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->tgt_seq_len, opt_a->seq_len}, 0}); ++ ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU,getTensorType(),std::vector{opt_a->batch_size, opt_a->tgt_seq_len, opt_a->hidden_size},0}); ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->seq_len}, 0}); ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->tgt_seq_len, opt_a->seq_len}, 0}); ++ ++ desc.output_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ ++ desc.output_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size}, 0}); //G1 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size}, 0}); //B1 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ 
hidden_units, 3 * hidden_units}, 0});//wt ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ 3 * hidden_units}, 0});//bt ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units, hidden_units}, 0});//wp ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units}, 0});//bp ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size}, 0});//g1 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size}, 0});//b2 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units, hidden_units}, 0}); ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units , hidden_units * 2}, 0});//bt2 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units * 3}, 0}); ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units, hidden_units}, 0});//wp2 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units}, 0});//bp2 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size}, 0});//g3 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size}, 0});//b3 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size, opt_a->ffn_hidden_size}, 0});//wm ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->ffn_hidden_size}, 0});//bm ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size, opt_a->ffn_hidden_size}, 0});;//wp ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size}, 0});//bp ++} ++ ++template +void Init(opt_arg* opt_a, + DecriptorTest& desc, + cudaStream_t stream, @@ -994,32 +1101,16 @@ index 0000000..9bcf4eb + InitializeAttn(opt_a, desc, stream, cublas_wrapper, allocator); + break; + case MHA_X2: -+ InitializeAttnX2(opt_a, -+ desc, -+ stream, -+ cublas_wrapper, -+ allocator); ++ InitializeAttnX2(opt_a, desc, stream, cublas_wrapper, allocator); + break; + case MHA_CROSS: -+ InitializeAttnCross(opt_a, -+ desc, -+ stream, -+ cublas_wrapper, -+ allocator); ++ InitializeAttnCross(opt_a, desc, stream, cublas_wrapper, allocator); + break; + case MHA_T5: -+ InitializeAttnT5(opt_a, -+ desc, -+ stream, -+ cublas_wrapper, -+ allocator); ++ InitializeAttnT5(opt_a, desc, stream, cublas_wrapper, allocator); + break; + case MHA_T5_CROSS: -+ InitializeAttnT5Cross(opt_a, -+ desc, -+ stream, -+ cublas_wrapper, -+ allocator); ++ InitializeAttnT5Cross(opt_a, desc, stream, cublas_wrapper, allocator); + break; + default: + break; @@ -1043,6 +1134,24 @@ index 0000000..9bcf4eb + } +} + ++template ++void InitD(opt_arg* opt_a, ++ DecriptorDecoderLayer& desc, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ cublasHandle_t* cublas_handle, ++ Allocator* allocator) ++{ ++ int model_num = ModelNum(opt_a->model_name); ++ switch (model_num) { ++ case TDL: ++ InitializeDecoder(opt_a, desc, stream, cublas_wrapper, cublas_handle, allocator); ++ break; ++ default: ++ break; ++ } ++} ++ +template +void InitWeight(opt_arg* opt_a, AttentionWeight& attn_weights, std::vector w_tensors) +{ @@ -1059,18 +1168,21 @@ index 0000000..9bcf4eb + attn_weights.key_weight.kernel = (const T*)w_tensors[2].data; + attn_weights.attention_output_weight.kernel = (const 
T*)w_tensors[3].data; + attn_weights.attention_output_weight.bias = (const T*)w_tensors[4].data; -+ } else if (modelId==MHA_T5) { ++ } ++ else if (modelId == MHA_T5) { + attn_weights.query_weight.kernel = (const T*)w_tensors[0].data; + attn_weights.query_weight.bias = nullptr; + attn_weights.attention_output_weight.kernel = (const T*)w_tensors[1].data; + attn_weights.attention_output_weight.bias = nullptr; -+ } else if (modelId==MHA_T5_CROSS) { ++ } ++ else if (modelId == MHA_T5_CROSS) { + attn_weights.query_weight.kernel = (const T*)w_tensors[0].data; + attn_weights.query_weight.bias = nullptr; + attn_weights.key_weight.kernel = (const T*)w_tensors[1].data; + attn_weights.attention_output_weight.kernel = (const T*)w_tensors[2].data; + attn_weights.attention_output_weight.bias = nullptr; -+ } else { ++ } ++ else { + // return ERROR illegal model ! + } +} @@ -1097,12 +1209,42 @@ index 0000000..9bcf4eb + // return ERROR illegal model ! + } +} ++template ++void InitWeightDecoder(opt_arg* opt_a, DecoderLayerWeight& decoder_weights, std::vector w_tensors) ++{ ++ int modelId = ModelNum(opt_a->model_name); ++ if (modelId == TDL) { ++ decoder_weights.layernorm1.gamma = (const T*)w_tensors[0].data; ++ decoder_weights.layernorm1.beta = (const T*)w_tensors[1].data; ++ decoder_weights.attention_qkv_weight.kernel = (const T*)w_tensors[2].data; ++ decoder_weights.attention_qkv_weight.bias = (const T*)w_tensors[3].data; ++ decoder_weights.attention_layer_output_weight.kernel = (const T*)w_tensors[4].data; ++ decoder_weights.attention_layer_output_weight.bias = (const T*)w_tensors[5].data; ++ decoder_weights.layernorm2.gamma = (const T*)w_tensors[6].data; ++ decoder_weights.layernorm2.beta = (const T*)w_tensors[7].data; ++ decoder_weights.attention_cross_q_weight.kernel = (const T*)w_tensors[8].data; ++ decoder_weights.attention_cross_kv_weight.kernel = (const T*)w_tensors[9].data; ++ decoder_weights.attention_cross_kv_weight.bias = (const T*)w_tensors[10].data; ++ decoder_weights.attention_cross_q_weight.bias = (const T*)w_tensors[10].data; ++ decoder_weights.attention_cross_layer_output_weight.kernel = (const T*)w_tensors[11].data; ++ decoder_weights.attention_cross_layer_output_weight.bias = (const T*)w_tensors[12].data; ++ decoder_weights.layernorm3.gamma = (const T*)w_tensors[13].data; ++ decoder_weights.layernorm3.beta = (const T*)w_tensors[14].data; ++ decoder_weights.decoder_output_mapping.kernel = (const T*)w_tensors[15].data; ++ decoder_weights.decoder_output_mapping.bias = (const T*)w_tensors[16].data; ++ decoder_weights.decoder_output_projection.kernel = (const T*)w_tensors[17].data; ++ decoder_weights.decoder_output_projection.bias = (const T*)w_tensors[18].data; ++ } ++ else { ++ // return ERROR illegal model ! ++ } ++} diff --git a/examples/cpp/ms/ms.cc b/examples/cpp/ms/ms.cc new file mode 100644 -index 0000000..2b12bd5 +index 0000000..b7992c6 --- /dev/null +++ b/examples/cpp/ms/ms.cc -@@ -0,0 +1,591 @@ +@@ -0,0 +1,686 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. + * @@ -1119,10 +1261,12 @@ index 0000000..2b12bd5 + * limitations under the License. 
+ */ +#include "examples/cpp/ms/initialize.h" -+#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h" -+#include "src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h" -+#include "src/fastertransformer/layers/encoder_layers/EncoderLayerWeight.h" -+#include "src/fastertransformer/layers/encoder_layers/MSEncoderLayer.h" ++// #include "src/fastertransformer/layers/attention_layers/AttentionWeight.h" ++// #include "src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h" ++// #include "src/fastertransformer/layers/encoder_layers/EncoderLayerWeight.h" ++// #include "src/fastertransformer/layers/encoder_layers/MSEncoderLayer.h" ++#include "src/fastertransformer/layers/decoder_layers/DecoderLayerWeight.h" ++#include "src/fastertransformer/layers/decoder_layers/MSDecoderLayer.h" +#include "src/fastertransformer/utils/logger.h" +#include +#include @@ -1142,7 +1286,7 @@ index 0000000..2b12bd5 +bool read_args(int argc, char* argv[], opt_arg* opt_a) +{ + int opt; -+ while ((opt = getopt(argc, argv, "b:l:s:t:H:S:p:m:T:W:F:i:w:f:P:e1:e2")) != -1) { ++ while ((opt = getopt(argc, argv, "b:l:s:t:H:S:p:m:T:W:F:i:w:f:P:x:1:2:3")) != -1) { + switch (opt) { + case 'b': + opt_a->batch_size = atoi(optarg); @@ -1183,6 +1327,9 @@ index 0000000..2b12bd5 + case '2': + opt_a->eps2 = atoi(optarg); + break; ++ case '3': ++ opt_a->eps3 = atoi(optarg); ++ break; + case 'P': + if (atoi(optarg) == 1) + opt_a->post_layernorm_residual=true; @@ -1192,6 +1339,12 @@ index 0000000..2b12bd5 + case 'p': + opt_a->is_remove_padding = bool(optarg); + break; ++ case 'x': ++ if (atoi(optarg) == 1) ++ opt_a->is_ffn_fp16=true; ++ else if (atoi(optarg) == 0) ++ opt_a->is_ffn_fp16=false; ++ break; + case 'i': + case 'w': + break; @@ -1222,13 +1375,14 @@ index 0000000..2b12bd5 + opt_a.ffn_hidden_size = -1; + opt_a.eps1 = 1e-6f; + opt_a.eps2 = 1e-6f; ++ opt_a.eps3 = 1e-6f; + opt_a.post_layernorm_residual = true; + opt_a.is_remove_padding = false; + opt_a.model_name = ""; + opt_a.compute_type = "fp32"; + opt_a.w_compute_type = "fp32"; + opt_a.s_compute_type = "fp32"; -+ ++ opt_a.is_ffn_fp16 = false; + + if (read_args(argc, argv, &opt_a)) { + bool c_type_fp32 = (opt_a.compute_type.compare("fp32") == 0); @@ -1540,75 +1694,78 @@ index 0000000..2b12bd5 + cublas_wrapper.setFP32GemmConfig(); + } + } -+ -+ if (opt_a->model_name != "transformer_encoder_layer") { -+ DecriptorTest desc; -+ Init(opt_a, desc, stream, &cublas_wrapper, &allocator); ++ if(opt_a->model_name == "transformer_decoder_layer" || opt_a->model_name == "transformer_decoder_layer_t5") { ++ DecriptorDecoderLayer desc; ++ InitD(opt_a, desc, stream, &cublas_wrapper, &cublas_handle, &allocator); ++ std::cout<<"input: "<(desc.input_tensors, std::string("input"), opt_a); + FT_CHECK(!res); ++ std::cout<<"input_tensors"<(desc.input_python_tensors, std::string("input"), opt_a); + FT_CHECK(!res); ++ std::cout<<"input_python_tensors"<(desc.output_tensors, std::string("output"), opt_a, false); + FT_CHECK(!res); ++ std::cout<<"output_tensors"<(desc.output_python_tensors, std::string("output"), opt_a); + FT_CHECK(!res); ++ std::cout<<"output_python_tensors\n"; + + res = ReadTensors(desc.w_tensors, std::string("weight"), opt_a); + FT_CHECK(!res); -+ -+ std::cout << "inputs size not encoder: " << CalcTensorsSize(desc.input_tensors) << std::endl; -+ std::cout << "weights size not encoder: " << CalcTensorsSize(desc.w_tensors) << std::endl; -+ std::cout << "ouputs size not encoder: " << CalcTensorsSize(desc.output_tensors) << std::endl; -+ -+ 
AttentionWeight attn_weights; -+ InitWeight(opt_a, attn_weights, desc.w_tensors); -+ -+ // test for BE !! -+ desc.Attn->forward(&desc.output_tensors, &desc.input_tensors, &attn_weights); -+ ++ std::cout<<"DecoderLayerWeight\n"; ++ std::cout<<"input: "< decoder_weights; ++ InitWeightDecoder(opt_a, decoder_weights, desc.w_tensors); ++ // // test for BE !! ++ std::cout<<"initDecoderLayerWeight\n"; ++ desc.Decoder->forward(&desc.output_tensors, &desc.input_tensors, &decoder_weights); + + CompareOutput(desc.output_python_tensors, desc.output_tensors); -+ -+// #define DO_TIME -+// #ifdef DO_TIME -+// // warmup -+// for (int i = 0; i < 10; i++) { -+// desc.Attn->forward(&desc.output_tensors, &desc.input_tensors, &attn_weights); -+// } -+// // profile time -+// const int ite = 1000; -+// CudaTimer cuda_timer(stream); -+// cuda_timer.start(); ++#define DO_TIME ++#ifdef DO_TIME ++ // warmup ++ for (int i = 0; i < 10; i++) { ++ // desc.Decoder->forward(&desc.output_tensors, &desc.input_tensors, &decoder_weights); ++ } ++ // profile time ++ const int ite = 1000; ++ CudaTimer cuda_timer(stream); ++ cuda_timer.start(); + -+// for (int i = 0; i < ite; i++) { -+// for (int i = 0; i < desc.input_tensors.size(); i++) { -+// int size = desc.input_tensors[i].size(); -+// cudaH2Dcpy(const_cast(reinterpret_cast(desc.input_tensors[i].data)), -+// const_cast(reinterpret_cast(desc.input_python_tensors[i].data)), -+// size); -+// } ++ for (int i = 0; i < ite; i++) { ++ // for (int i = 0; i < desc.input_tensors.size(); i++) { ++ // int size = desc.input_tensors[i].size(); ++ // cudaH2Dcpy(const_cast(reinterpret_cast(desc.input_tensors[i].data)), ++ // const_cast(reinterpret_cast(desc.input_python_tensors[i].data)), ++ // size); ++ // } + -+// desc.Attn->forward(&desc.output_tensors, &desc.input_tensors, &attn_weights); -+// for (int i = 0; i < desc.output_tensors.size(); i++) { -+// int size = desc.output_tensors[i].size(); -+// cudaD2Hcpy(const_cast(reinterpret_cast(desc.output_python_tensors[i].data)), -+// const_cast(reinterpret_cast(desc.output_tensors[i].data)), -+// size); -+// } -+// } -+// float total_time = cuda_timer.stop(); -+// printf("batch_size %ld seq_len %ld layer %ld " -+// "AVG FT-CPP-time %.2f ms (%d iterations) " -+// "Total Time %.2f ms\n", -+// opt_a->batch_size, -+// opt_a->seq_len, -+// opt_a->num_layers, -+// total_time / ite, -+// ite, -+// total_time); -+// #endif ++ // desc.Decoder->forward(&desc.output_tensors, &desc.input_tensors, &decoder_weights); ++ // for (int i = 0; i < desc.output_tensors.size(); i++) { ++ // int size = desc.output_tensors[i].size(); ++ // cudaD2Hcpy(const_cast(reinterpret_cast(desc.output_python_tensors[i].data)), ++ // const_cast(reinterpret_cast(desc.output_tensors[i].data)), ++ // size); ++ // } ++ } ++ float total_time = cuda_timer.stop(); ++ ++ printf("batch_size %ld seq_len %ld layer %ld " ++ "AVG FT-CPP-time %.2f ms (%d iterations) " ++ "Total Time %.2f ms\n", ++ opt_a->batch_size, ++ opt_a->seq_len, ++ opt_a->num_layers, ++ total_time / ite, ++ ite, ++ total_time); ++#endif + +#ifdef SPARSITY_ENABLED + cusparseLtDestroy(&cusparselt_handle); @@ -1619,8 +1776,9 @@ index 0000000..2b12bd5 + FreeDesc(desc.input_tensors); + FreeDesc(desc.output_python_tensors); + FreeDesc(desc.w_tensors); ++ return 0; + } -+ else { ++ else if (opt_a->model_name == "transformer_encoder_layer") { + DecriptorEncoderLayer desc; + InitE(opt_a, desc, stream, &cublas_wrapper, &cublas_handle, &allocator); + int res = ReadTensors(desc.input_tensors, std::string("input"), opt_a); @@ 
-1642,7 +1800,7 @@ index 0000000..2b12bd5 + desc.Encoder->forward(&desc.output_tensors, &desc.input_tensors, &encoder_weights); + + CompareOutput(desc.output_python_tensors, desc.output_tensors); -+// #define DO_TIME ++#define DO_TIME +#ifdef DO_TIME + // warmup + for (int i = 0; i < 10; i++) { @@ -1692,16 +1850,95 @@ index 0000000..2b12bd5 + FreeDesc(desc.output_python_tensors); + FreeDesc(desc.w_tensors); + } -+ return 0; -+} -diff --git a/examples/pytorch/swin/Swin-Transformer-Quantization/SwinTransformer b/examples/pytorch/swin/Swin-Transformer-Quantization/SwinTransformer -new file mode 160000 -index 0000000..cbaa0d8 ---- /dev/null -+++ b/examples/pytorch/swin/Swin-Transformer-Quantization/SwinTransformer -@@ -0,0 +1 @@ -+Subproject commit cbaa0d8707db403d85ad0e13c59f2f71cd6db425 -diff --git a/examples/pytorch/vit/ViT-quantization/ViT-pytorch b/examples/pytorch/vit/ViT-quantization/ViT-pytorch ++ else { ++ DecriptorTest desc; ++ Init(opt_a, desc, stream, &cublas_wrapper, &allocator); ++ int res = ReadTensors(desc.input_tensors, std::string("input"), opt_a); ++ FT_CHECK(!res); ++ res = ReadTensors(desc.input_python_tensors, std::string("input"), opt_a); ++ FT_CHECK(!res); ++ ++ res = ReadTensors(desc.output_tensors, std::string("output"), opt_a, false); ++ FT_CHECK(!res); ++ ++ res = ReadTensors(desc.output_python_tensors, std::string("output"), opt_a); ++ FT_CHECK(!res); ++ ++ res = ReadTensors(desc.w_tensors, std::string("weight"), opt_a); ++ FT_CHECK(!res); ++ ++ std::cout << "inputs size not encoder: " << CalcTensorsSize(desc.input_tensors) << std::endl; ++ std::cout << "weights size not encoder: " << CalcTensorsSize(desc.w_tensors) << std::endl; ++ std::cout << "ouputs size not encoder: " << CalcTensorsSize(desc.output_tensors) << std::endl; ++ ++ AttentionWeight attn_weights; ++ InitWeight(opt_a, attn_weights, desc.w_tensors); ++ ++ // test for BE !! 
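// Standalone-attention fallback: the dumped "input" and "weight" tensors are pushed through a
// single MSMHALayer::forward call and the device outputs are then compared (CompareOutput)
// against the Python reference tensors read above.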
++ desc.Attn->forward(&desc.output_tensors, &desc.input_tensors, &attn_weights); ++ ++ ++ CompareOutput(desc.output_python_tensors, desc.output_tensors); ++ ++// #define DO_TIME ++// #ifdef DO_TIME ++// // warmup ++// for (int i = 0; i < 10; i++) { ++// desc.Attn->forward(&desc.output_tensors, &desc.input_tensors, &attn_weights); ++// } ++// // profile time ++// const int ite = 1000; ++// CudaTimer cuda_timer(stream); ++// cuda_timer.start(); ++ ++// for (int i = 0; i < ite; i++) { ++// for (int i = 0; i < desc.input_tensors.size(); i++) { ++// int size = desc.input_tensors[i].size(); ++// cudaH2Dcpy(const_cast(reinterpret_cast(desc.input_tensors[i].data)), ++// const_cast(reinterpret_cast(desc.input_python_tensors[i].data)), ++// size); ++// } ++ ++// desc.Attn->forward(&desc.output_tensors, &desc.input_tensors, &attn_weights); ++// for (int i = 0; i < desc.output_tensors.size(); i++) { ++// int size = desc.output_tensors[i].size(); ++// cudaD2Hcpy(const_cast(reinterpret_cast(desc.output_python_tensors[i].data)), ++// const_cast(reinterpret_cast(desc.output_tensors[i].data)), ++// size); ++// } ++// } ++// float total_time = cuda_timer.stop(); ++// printf("batch_size %ld seq_len %ld layer %ld " ++// "AVG FT-CPP-time %.2f ms (%d iterations) " ++// "Total Time %.2f ms\n", ++// opt_a->batch_size, ++// opt_a->seq_len, ++// opt_a->num_layers, ++// total_time / ite, ++// ite, ++// total_time); ++// #endif ++ ++#ifdef SPARSITY_ENABLED ++ cusparseLtDestroy(&cusparselt_handle); ++#endif ++ delete cublas_algo_map; ++ delete cublas_wrapper_mutex; ++ FreeDesc(desc.output_tensors); ++ FreeDesc(desc.input_tensors); ++ FreeDesc(desc.output_python_tensors); ++ FreeDesc(desc.w_tensors); ++ } ++ return 0; ++} +diff --git a/examples/pytorch/swin/Swin-Transformer-Quantization/SwinTransformer b/examples/pytorch/swin/Swin-Transformer-Quantization/SwinTransformer +new file mode 160000 +index 0000000..cbaa0d8 +--- /dev/null ++++ b/examples/pytorch/swin/Swin-Transformer-Quantization/SwinTransformer +@@ -0,0 +1 @@ ++Subproject commit cbaa0d8707db403d85ad0e13c59f2f71cd6db425 +diff --git a/examples/pytorch/vit/ViT-quantization/ViT-pytorch b/examples/pytorch/vit/ViT-quantization/ViT-pytorch new file mode 160000 index 0000000..460a162 --- /dev/null @@ -5357,18 +5594,19 @@ index be8b178..e9b4310 100644 + } // namespace fastertransformer diff --git a/src/fastertransformer/layers/CMakeLists.txt b/src/fastertransformer/layers/CMakeLists.txt -index cbaf4fa..00a46d4 100644 +index cbaf4fa..a9fe6e6 100644 --- a/src/fastertransformer/layers/CMakeLists.txt +++ b/src/fastertransformer/layers/CMakeLists.txt -@@ -14,6 +14,7 @@ +@@ -14,6 +14,8 @@ cmake_minimum_required(VERSION 3.8) +add_subdirectory(encoder_layers) ++add_subdirectory(decoder_layers) add_subdirectory(attention_layers) add_subdirectory(attention_layers_int8) add_subdirectory(xlnet_attention_layers) -@@ -30,15 +31,18 @@ set_property(TARGET FfnLayerINT8 PROPERTY POSITION_INDEPENDENT_CODE ON) +@@ -30,15 +32,18 @@ set_property(TARGET FfnLayerINT8 PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET FfnLayerINT8 PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(FfnLayerINT8 PUBLIC -lcublasLt -lcublas -lcudart cublasMMWrapper cublasINT8MMWrapper activation_int8_kernels memory_utils) @@ -5444,7 +5682,7 @@ index 9cef315..f9c9cde 100644 diff --git a/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc old mode 100644 new mode 100755 -index bada640..3dca224 
+index bada640..e214b82 --- a/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc @@ -16,10 +16,39 @@ @@ -5513,7 +5751,7 @@ index bada640..3dca224 sync_check_cuda_error(); T scalar = 1 / sqrtf(size_per_head_ * 1.0f); invokeMaskedSoftMax(qk_buf_, -@@ -428,4 +456,148 @@ template class GptContextAttentionLayer; +@@ -428,4 +456,146 @@ template class GptContextAttentionLayer; template class GptContextAttentionLayer<__nv_bfloat16>; #endif @@ -5535,7 +5773,7 @@ index bada640..3dca224 + bool is_position_bias): + BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse) +{ -+cublasHandle_t cublas_handle; ++ cublasHandle_t cublas_handle; + cublasCreate(&cublas_handle); + cublasSetStream(cublas_handle, stream); + @@ -5555,7 +5793,6 @@ index bada640..3dca224 + params_.position_bias = is_position_bias; + params_.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; +} -+ +template +void MSMHALayer::allocateBuffer() +{ @@ -5564,164 +5801,1232 @@ index bada640..3dca224 + buf_ = reinterpret_cast(allocator_->reMalloc(buf_, buff_size, true)); + } +} ++template ++void MSMHALayer::forward(std::vector* output_tensors, ++ const std::vector* input_tensors, ++ const AttentionWeight* attention_weights) ++{ ++ // input_tensors: use 1 gemm -- multi head attention ++ // input_query [batch_size * seq_len, hidden_dimension] ++ // attention_mask [batch_size, 1, seq_len, seq_len] ++ ++ // input_tensors: use 2 gemm -- cross attention ++ // input_query [batch_size * seq_len, hidden_dimension] ++ // enc_output [batch_size * tgt_len, hidden_dimension] ++ // attention_mask [batch_size, 1, seq_len, seq_len] ++ ++ // output_tensors: ++ // attention_out [batch_size * seq_len, hidden_dimension] ++ // key_cache [batch, local_head_num, size_per_head // x, max_seq_len, x] ++ // value_cache [batch, local_head_num, max_seq_len, size_per_head] ++ ++ int in_tensor_number = input_tensors->size(); ++ allocateBuffer(); // only once ++ if (params_.position_bias) ++ if (params_.is_cross) { ++ void* outputs[] = {(void*)output_tensors->at(0).data}; ++ void* inputs[] = {(void*)input_tensors->at(0).data, ++ (void*)input_tensors->at(1).data, ++ (void*)attention_weights->query_weight.kernel, ++ (void*)attention_weights->key_weight.kernel, ++ (void*)input_tensors->at(2).data, ++ (void*)input_tensors->at(3).data, ++ (void*)attention_weights->attention_output_weight.kernel}; ++ forward_attn((T**)inputs, 7, (T**)outputs, 1, ¶ms_, (void*)buf_); ++ } ++ else { ++ void* outputs[] = {(void*)output_tensors->at(0).data}; ++ void* inputs[] = { ++ (void*)input_tensors->at(0).data, ++ (void*)attention_weights->query_weight.kernel, ++ (void*)input_tensors->at(1).data, ++ (void*)input_tensors->at(2).data, ++ (void*)attention_weights->attention_output_weight.kernel ++ }; ++ forward_attn((T**)inputs, 5, (T**)outputs, 1, ¶ms_, (void*)buf_); ++ } ++ else { ++ if (params_.is_cross) { ++ void* outputs[] = {(void*)output_tensors->at(0).data}; ++ void* inputs[] = {(void*)input_tensors->at(0).data, ++ (void*)input_tensors->at(1).data, ++ (void*)attention_weights->query_weight.kernel, ++ (void*)attention_weights->key_weight.kernel, ++ (void*)attention_weights->query_weight.bias, ++ (void*)input_tensors->at(2).data, ++ (void*)attention_weights->attention_output_weight.kernel, ++ (void*)attention_weights->attention_output_weight.bias ++ }; ++ forward_attn((T**)inputs, 8, (T**)outputs, 1, ¶ms_, (void*)buf_); ++ } ++ else { ++ void* outputs[] = 
{(void*)output_tensors->at(0).data}; ++ void* inputs[] = {(void*)input_tensors->at(0).data, ++ (void*)attention_weights->query_weight.kernel, ++ (void*)attention_weights->query_weight.bias, ++ (void*)input_tensors->at(1).data, ++ (void*)attention_weights->attention_output_weight.kernel, ++ (void*)attention_weights->attention_output_weight.bias}; ++ forward_attn((T**)inputs, 6, (T**)outputs, 1, ¶ms_, (void*)buf_); ++ } ++ } ++} ++ ++ template ++ MSMHALayer::~MSMHALayer() ++ { ++ cublas_wrapper_ = nullptr; ++ freeBuffer(); ++ } ++ ++ template ++ void MSMHALayer::freeBuffer() ++ { ++ if (buf_ != nullptr) { ++ allocator_->free(buf_); ++ buf_ = nullptr; ++ } ++ } ++ ++ template class MSMHALayer; ++ template class MSMHALayer; ++ template class MSMHALayer; ++ template class MSMHALayer; ++ template class MSMHALayer; ++ template class MSMHALayer; ++ template class MSMHALayer; ++ template class MSMHALayer; ++ + } // namespace fastertransformer +diff --git a/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h b/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h +old mode 100644 +new mode 100755 +index 92e2175..f7fa5ca +--- a/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h ++++ b/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h +@@ -18,7 +18,7 @@ + #pragma once + + #include "src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h" +- ++#include "src/fastertransformer/layers/encoder_layers/encoder.h" + namespace fastertransformer { + + template +@@ -107,4 +107,44 @@ public: + const AttentionWeight* attention_weights) override; + }; + ++ ++// TODO(haim): Add template according to "mix" compute type (fp32, fp16) ++template ++class MSMHALayer: public BaseAttentionLayer { ++private: ++ void allocateBuffer() override; ++ void freeBuffer() override; ++ ++ using BaseAttentionLayer::is_free_buffer_after_forward_; ++ using BaseAttentionLayer::is_allocate_buffer_; ++ using BaseAttentionLayer::cublas_wrapper_; ++ using BaseAttentionLayer::allocator_; ++ ++protected: ++ using BaseAttentionLayer::stream_; ++ using BaseAttentionLayer::sparse_; ++ T* buf_ = nullptr; ++ encoderParamT params_; ++ ++public: ++ MSMHALayer(size_t batch_size, ++ size_t src_seq_len, ++ size_t tgt_seq_len, ++ size_t head_num, ++ size_t size_per_head, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ IAllocator* allocator, ++ bool is_free_buffer_after_forward, ++ bool is_qk_buf_float, ++ bool is_cross, ++ bool sparse = false, ++ bool is_position_bias=false); ++ MSMHALayer(MSMHALayer const& attention_layer); ++ virtual ~MSMHALayer(); ++ void forward(std::vector* output_tensors, ++ const std::vector* input_tensors, ++ const AttentionWeight* attention_weights) override; ++}; ++ + } // namespace fastertransformer +diff --git a/src/fastertransformer/layers/decoder_layers/BaseDecoderLayer.h b/src/fastertransformer/layers/decoder_layers/BaseDecoderLayer.h +new file mode 100644 +index 0000000..849e137 +--- /dev/null ++++ b/src/fastertransformer/layers/decoder_layers/BaseDecoderLayer.h +@@ -0,0 +1,76 @@ ++/* ++ * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. 
++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#pragma once ++ ++#include ++#include ++ ++#include "3rdparty/trt_fused_multihead_attention/fused_multihead_attention_common.h" ++#include "src/fastertransformer/layers/BaseLayer.h" ++#include "src/fastertransformer/layers/decoder_layers/DecoderLayerWeight.h" ++#include "src/fastertransformer/utils/Tensor.h" ++#include "src/fastertransformer/utils/allocator.h" ++#include "src/fastertransformer/utils/cublasMMWrapper.h" ++#include "src/fastertransformer/utils/memory_utils.h" ++ ++namespace fastertransformer { ++ ++enum class DecoderLayerType { ++ UNFUSED_DECODER_LAYER, ++ FUSED_DECODER_LAYER ++}; ++ ++template ++DecoderLayerType getDecoderLayerType(size_t size_per_head, const int sm, const bool remove_padding, ++ const int max_seq_len, const bool is_fuse = true) { ++ if (std::is_same::value && (sm == kSM_70 || sm == kSM_86 || sm == kSM_80 || sm == kSM_75 || sm == kSM_72) ++ && size_per_head == 64 && max_seq_len <= 384 && is_fuse == true) { ++ return remove_padding ? DecoderLayerType::FUSED_DECODER_LAYER : DecoderLayerType::FUSED_DECODER_LAYER; ++ } else { ++ return remove_padding ? DecoderLayerType::FUSED_DECODER_LAYER : DecoderLayerType::FUSED_DECODER_LAYER; ++ } ++} ++ ++template ++DecoderLayerType getDecoderLayerTypeINT8(size_t size_per_head, const int sm, const bool remove_padding, ++ const int max_seq_len, const int int8_mode) { ++ if ((int8_mode == 1 || int8_mode == 2) && (sm == kSM_86 || sm == kSM_80 || sm == kSM_75) && size_per_head == 64 ++ && max_seq_len <= 384) { ++ return remove_padding ? DecoderLayerType::FUSED_DECODER_LAYER : DecoderLayerType::FUSED_DECODER_LAYER; ++ } else { ++ return remove_padding ? DecoderLayerType::FUSED_DECODER_LAYER : DecoderLayerType::FUSED_DECODER_LAYER; ++ } ++} ++ ++template ++class BaseDecoderLayer: public BaseLayer { ++ ++public: ++ virtual void forward(std::vector* output_tensors, ++ const std::vector* input_tensors, ++ const DecoderLayerWeight* decoder_layer_weights) = 0; ++ BaseDecoderLayer(cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ IAllocator* allocator, ++ bool is_free_buffer_after_forward, ++ bool sparse = false): ++ BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, nullptr, sparse) ++ { ++ } ++ virtual ~BaseDecoderLayer() = default; ++}; ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/decoder_layers/CMakeLists.txt b/src/fastertransformer/layers/decoder_layers/CMakeLists.txt +new file mode 100644 +index 0000000..e343db9 +--- /dev/null ++++ b/src/fastertransformer/layers/decoder_layers/CMakeLists.txt +@@ -0,0 +1,21 @@ ++# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
++# See the License for the specific language governing permissions and ++# limitations under the License. ++ ++cmake_minimum_required(VERSION 3.8) ++ ++add_library(DecoderLayer STATIC decoder.cc MSDecoderLayer.cc) ++set_property(TARGET DecoderLayer PROPERTY POSITION_INDEPENDENT_CODE ON) ++set_property(TARGET DecoderLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) ++target_link_libraries(DecoderLayer PUBLIC -lcublas -lcudart unfused_attention_kernels activation_kernels ++ layernorm_kernels add_residual_kernels bert_preprocess_kernels) +diff --git a/src/fastertransformer/layers/decoder_layers/DecoderLayerWeight.h b/src/fastertransformer/layers/decoder_layers/DecoderLayerWeight.h +new file mode 100644 +index 0000000..5c73512 +--- /dev/null ++++ b/src/fastertransformer/layers/decoder_layers/DecoderLayerWeight.h +@@ -0,0 +1,37 @@ ++/* ++ * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#pragma once ++ ++#include "src/fastertransformer/layers/DenseWeight.h" ++#include "src/fastertransformer/kernels/layernorm_kernels.h" ++namespace fastertransformer { ++ ++template ++struct DecoderLayerWeight { ++ DenseWeight attention_qkv_weight; ++ DenseWeight attention_layer_output_weight; ++ DenseWeight attention_cross_q_weight; ++ DenseWeight attention_cross_kv_weight; ++ DenseWeight attention_cross_layer_output_weight; ++ DenseWeight decoder_output_mapping; ++ DenseWeight decoder_output_projection; ++ LayerNormWeight layernorm1; ++ LayerNormWeight layernorm2; ++ LayerNormWeight layernorm3; ++}; ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/decoder_layers/MSDecoderLayer.cc b/src/fastertransformer/layers/decoder_layers/MSDecoderLayer.cc +new file mode 100644 +index 0000000..c4ca79b +--- /dev/null ++++ b/src/fastertransformer/layers/decoder_layers/MSDecoderLayer.cc +@@ -0,0 +1,207 @@ ++/* ++ * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. ++ * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#include "src/fastertransformer/layers/decoder_layers/MSDecoderLayer.h" ++#include "src/fastertransformer/kernels/activation_kernels.h" ++ ++namespace fastertransformer { ++template ++void printTensor(char* str, T* input, int size) ++{ ++ printf("%s ", str); ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ ++ std::cout << input_host[k] << ","; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ } ++ ++ std::cout << std::endl; ++ ++ free(input_host); ++} ++template ++MSDLayer::MSDLayer(size_t max_batch_size, ++ size_t max_src_seq_len, ++ size_t max_tgt_seq_len, ++ size_t head_num, ++ size_t size_per_head, ++ size_t ffn_hidden_size, ++ float eps1, ++ float eps2, ++ float eps3, ++ bool post_layernorm, ++ bool position_bias1, ++ bool position_bias2, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ cublasHandle_t* cublas_handle, ++ IAllocator* allocator, ++ bool is_free_buffer_after_forward, ++ bool is_qk_buf_float, ++ bool sparse): ++ ++ BaseDecoderLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse), buf_(nullptr) ++{ ++ params_.batch_size = max_batch_size; ++ params_.src_seq_len = max_src_seq_len; ++ params_.tgt_seq_len = max_tgt_seq_len; ++ params_.head_num = head_num; ++ params_.head_size = size_per_head; ++ params_.hidden_size = head_num * size_per_head; ++ params_.ffn_hidden_size = ffn_hidden_size; ++ params_.eps1 = eps1; ++ params_.eps2 = eps2; ++ params_.eps3 = eps3; ++ params_.layernorm_post = post_layernorm; ++ // handle ++ params_.cublas_handle = *cublas_handle; ++ params_.stream = stream; ++ params_.ffn_fp16 = false; ++ // ctrls ++ params_.in_idx = 0; ++ params_.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; ++ params_.projection_bias = true; ++ ++ params_.attn1.in_idx = 0; ++ params_.attn1.batch_size = max_batch_size; ++ params_.attn1.src_seq_len = max_src_seq_len; ++ params_.attn1.tgt_seq_len = max_tgt_seq_len; ++ params_.attn1.head_num = head_num; ++ params_.attn1.head_size = size_per_head; ++ params_.attn1.hidden_size = head_num * size_per_head; ++ params_.attn1.qkv_bias = true; ++ params_.attn1.projection_bias = false; ++ params_.attn1.is_cross = false; ++ params_.attn1.position_bias = false; ++ params_.attn1.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; ++ params_.attn1.cublas_handle = *cublas_handle; ++ params_.attn1.stream = stream; ++ ++ params_.attn2.in_idx = 0; ++ params_.attn2.batch_size = max_batch_size; ++ params_.attn2.src_seq_len = max_src_seq_len; ++ params_.attn2.tgt_seq_len = max_tgt_seq_len; ++ params_.attn2.head_num = head_num; ++ params_.attn2.head_size = size_per_head; ++ params_.attn2.hidden_size = head_num * size_per_head; ++ params_.attn2.qkv_bias = true; ++ params_.attn2.projection_bias = false; ++ params_.attn2.is_cross = true; ++ params_.attn2.position_bias = false; ++ params_.attn2.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; ++ params_.attn2.cublas_handle = *cublas_handle; ++ params_.attn2.stream = stream; ++} ++ ++template ++void MSDLayer::allocateBuffer() ++{ ++ if (buf_ == nullptr) { ++ size_t buff_size = GetDecoderLayerWorkspaceSize(¶ms_); ++ std::cout<<"buff_size: "<(allocator_->reMalloc(buf_, buff_size, true)); ++ } ++} ++ ++template ++void MSDLayer::freeBuffer() ++{ ++ if (buf_ != nullptr) { ++ allocator_->free(buf_); ++ buf_ = nullptr; ++ } ++} ++ ++template ++MSDLayer::~MSDLayer() ++{ ++ cublas_wrapper_ = nullptr; ++ freeBuffer(); ++} ++ ++template ++void 
MSDLayer::forward(std::vector* output_tensors, ++ const std::vector* input_tensors, ++ const DecoderLayerWeight* decoder_weights) ++{ ++ std::cout<<"forward\n"; ++ allocateBuffer(); // only once ++ void* outputs[] = {(void*)output_tensors->at(0).data}; ++ // std::cout<qkv_bias<< params_.attn2->qkv_bias<< !params_.attn1->position_bias<< !params_.attn2->position_bias<at(0).data, ++ (void*)decoder_weights->layernorm1.gamma, ++ (void*)decoder_weights->layernorm1.beta, ++ (void*)decoder_weights->attention_qkv_weight.kernel, ++ (void*)decoder_weights->attention_qkv_weight.bias, ++ (void*)input_tensors->at(1).data, ++ (void*)decoder_weights->attention_layer_output_weight.kernel, ++ (void*)decoder_weights->attention_layer_output_weight.bias, ++ (void*)decoder_weights->layernorm2.gamma, ++ (void*)decoder_weights->layernorm2.beta, ++ (void*)input_tensors->at(2).data, ++ (void*)decoder_weights->attention_cross_q_weight.kernel, ++ (void*)decoder_weights->attention_cross_kv_weight.kernel, ++ (void*)decoder_weights->attention_cross_q_weight.bias, ++ (void*)input_tensors->at(3).data, ++ (void*)decoder_weights->attention_cross_layer_output_weight.kernel, ++ (void*)decoder_weights->attention_cross_layer_output_weight.bias, ++ (void*)decoder_weights->layernorm3.gamma, ++ (void*)decoder_weights->layernorm3.beta, ++ (void*)decoder_weights->decoder_output_mapping.kernel, ++ (void*)decoder_weights->decoder_output_mapping.bias, ++ (void*)decoder_weights->decoder_output_projection.kernel, ++ (void*)decoder_weights->decoder_output_projection.bias}; ++ forwardDecoder(inputs, 23, outputs, 1, ¶ms_, buf_); ++ // } ++ // else { ++ // void* inputs[] = {(void*)input_tensors->at(0).data, ++ // (void*)decoder_weights->qkv_weight.kernel, ++ // (void*)decoder_weights->qkv_weight.bias, ++ // (void*)input_tensors->at(1).data, ++ // (void*)decoder_weights->attention_layer_output_weight.kernel, ++ // (void*)decoder_weights->attention_layer_output_weight.bias, ++ // (void*)decoder_weights->layernorm1.gamma, ++ // (void*)decoder_weights->layernorm1.beta, ++ // (void*)decoder_weights->decoder_output_mapping.kernel, ++ // (void*)decoder_weights->decoder_output_mapping.bias, ++ // (void*)decoder_weights->decoder_output_projection.kernel, ++ // (void*)decoder_weights->decoder_output_projection.bias, ++ // (void*)decoder_weights->layernorm2.gamma, ++ // (void*)decoder_weights->layernorm2.beta}; ++ // forwardDecoder(inputs, 3, outputs, 1, ¶ms_, buf_); ++ // } ++ } ++ return; ++} ++ ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/decoder_layers/MSDecoderLayer.h b/src/fastertransformer/layers/decoder_layers/MSDecoderLayer.h +new file mode 100644 +index 0000000..3f7e9cb +--- /dev/null ++++ b/src/fastertransformer/layers/decoder_layers/MSDecoderLayer.h +@@ -0,0 +1,73 @@ ++/* ++ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. ++ * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. 
++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#pragma once ++ ++#include "src/fastertransformer/layers/decoder_layers/BaseDecoderLayer.h" ++#include "src/fastertransformer/layers/decoder_layers/decoder.h" ++ ++namespace fastertransformer { ++ ++// TODO(haim): Add template according to "mix" compute type (fp32, fp16) ++template ++class MSDLayer: public BaseDecoderLayer { ++private: ++ mutable decoderParamT params_; ++ ++ void allocateBuffer() override; ++ void freeBuffer() override; ++ void* buf_; ++ using BaseDecoderLayer::is_free_buffer_after_forward_; ++ using BaseDecoderLayer::is_allocate_buffer_; ++ using BaseDecoderLayer::cublas_wrapper_; ++ using BaseDecoderLayer::allocator_; ++ ++protected: ++ using BaseDecoderLayer::stream_; ++ using BaseDecoderLayer::sparse_; ++ ++public: ++ MSDLayer(size_t max_batch_size, ++ size_t max_src_seq_len, ++ size_t max_tgt_seq_len, ++ size_t head_num, ++ size_t size_per_head, ++ size_t ffn_hidden_size, ++ float eps1, ++ float eps2, ++ float eps3, ++ bool post_layernorm, ++ bool position_bias1, ++ bool position_bias2, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ cublasHandle_t* cublas_handle, ++ IAllocator* allocator, ++ bool is_free_buffer_after_forward, ++ bool is_qk_buf_float, ++ bool sparse); ++ ++ MSDLayer(MSDLayer const& decoder_layer); ++ ++ virtual ~MSDLayer(); ++ ++ void forward(std::vector* output_tensors, ++ const std::vector* input_tensors, ++ const DecoderLayerWeight* decoder_weights) override; ++}; ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/decoder_layers/decoder.cc b/src/fastertransformer/layers/decoder_layers/decoder.cc +new file mode 100644 +index 0000000..9b20c74 +--- /dev/null ++++ b/src/fastertransformer/layers/decoder_layers/decoder.cc +@@ -0,0 +1,543 @@ ++ ++#include "src/fastertransformer/layers/decoder_layers/decoder.h" ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/kernels/add_residual_kernels.h" ++#include "src/fastertransformer/kernels/layernorm_kernels.h" ++#include "src/fastertransformer/kernels/unfused_attention_kernels.h" ++#include "src/fastertransformer/layers/encoder_layers/encoder.h" ++ ++#include ++namespace fastertransformer { ++ ++#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) ++// #define UP_DIV(x, y) (x) ++#define ALIGN_SIZE 16 ++ ++template ++void printTensor(char* str, T* input, int size) { ++ printf("%s ",str); ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ ++ std::cout << input_host[k] << ","; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ } ++ ++ std::cout << std::endl; ++ ++ free(input_host); ++} ++ ++template ++void isNan(char* str, T* input, int size) ++{ ++ std::cout << str << " " << " size is " << size; ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ if 
(std::isnan((float)input_host[k]) || std ::isinf((float)input_host[k])) { ++ std::cout << "found NAN or INF"; ++ break; ++ } ++ } ++ ++ std::cout << std::endl; ++ free(input_host); ++} ++ ++template ++size_t GetAttnWorkspaceSize(decoderParamT* param) ++{ ++ size_t size_q = UP_DIV((param->batch_size * param->src_seq_len * param->hidden_size), ALIGN_SIZE) * ALIGN_SIZE; ++ size_t size_k = UP_DIV((param->batch_size * param->tgt_seq_len * param->hidden_size), ALIGN_SIZE) * ALIGN_SIZE; ++ size_t size_v = size_k; ++ size_t qkv_len = size_q + size_k + size_v; ++ size_t q_buf_2_len = size_q; ++ size_t qk_buf_len = ++ UP_DIV(param->batch_size * param->head_num * param->src_seq_len * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t qkv_buf_2_len = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t qkv_buf_3_len = qkv_buf_2_len; ++ size_t attn_out_size = ++ UP_DIV(param->batch_size * param->head_num * param->head_size * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; ++ return (qkv_len + q_buf_2_len + qk_buf_len + qkv_buf_2_len + qkv_buf_3_len + 2 * attn_out_size) * sizeof(T); ++ ++} ++ ++template size_t GetAttnWorkspaceSize(decoderParamT* param); ++template size_t GetAttnWorkspaceSize(decoderParamT* param); ++template ++size_t GetDecoderLayerWorkspaceSize(decoderParamT* param) ++{ ++ size_t attn_out = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE;; ++ size_t attn2_out = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE;; ++ ++ size_t ffn = UP_DIV(param->batch_size * param->src_seq_len * param->ffn_hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t ffn_size = (param->layernorm_post) ? ffn : (attn_out + ffn); ++ size_t out_size = (param->layernorm_post) ? 
attn_out + attn2_out : attn_out * 2+ attn2_out * 2; ++ return (std::max(GetAttnWorkspaceSize(param) * 2, ffn_size * sizeof(T)) + out_size * sizeof(T)); ++} ++ ++template size_t GetDecoderLayerWorkspaceSize(decoderParamT* param); ++template size_t GetDecoderLayerWorkspaceSize(decoderParamT* param); ++ ++template ++void forward_ffn(T* inputs[], int in_len, T* output[], int out_len, decoderParamT* param, void* ws) ++{ ++ size_t inter_size = param->ffn_hidden_size; ++ size_t h_token_num = param->batch_size * param->src_seq_len; ++ cublasOperation_t gemm_ops[] = {CUBLAS_OP_N, CUBLAS_OP_N}; ++ cudaDataType gemm_data_types[] = {CUDA_R_32F, CUDA_R_32F, CUDA_R_32F}; ++ if ((std::is_same::value) || (std::is_same::value)) { ++ gemm_data_types[0] = CUDA_R_16F; ++ gemm_data_types[1] = CUDA_R_16F; ++ gemm_data_types[2] = CUDA_R_16F; ++ } ++ S alpha = 1.0f; ++ S beta = 0.0f; ++ ++ int gemm_dims[] = {(int)inter_size, (int)h_token_num, (int)param->hidden_size}; ++ int gemm_lds[] = {(int)inter_size, (int)param->hidden_size, (int)inter_size}; ++ T* normed_attn_out = reinterpret_cast(inputs[param->in_idx++]); ++ fastertransformer::CublasGemmWrapper(inputs[param->in_idx++], ++ normed_attn_out, ++ ws, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_data_types, ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++ invokeAddBiasGelu(reinterpret_cast(ws), ++ reinterpret_cast(inputs[param->in_idx++]), ++ h_token_num, ++ inter_size, ++ param->stream); ++ gemm_dims[0] = param->hidden_size; ++ gemm_dims[1] = h_token_num; ++ gemm_dims[2] = inter_size; ++ gemm_lds[0] = param->hidden_size; ++ gemm_lds[1] = inter_size; ++ gemm_lds[2] = param->hidden_size; ++ fastertransformer::CublasGemmWrapper(inputs[param->in_idx++], ++ ws, ++ output[0], ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_data_types, ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++} ++ ++template ++void forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, decoderParamT* param, void* ws) ++{ ++ std::cout<<"param->layernorm_post"<layernorm_post<in_idx = 0; ++ size_t h_token_num = param->batch_size * param->src_seq_len; ++ T* from_tensor = reinterpret_cast(inputs[param->in_idx++]); ++ T* attn_out = reinterpret_cast(ws); ++ T* normed_from_tensor = reinterpret_cast(ws) + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ T* normed_attn_out = (param->layernorm_post) ? normed_from_tensor + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE * 2 : normed_from_tensor; ++ T* attn_ws_offset = (param->layernorm_post) ? reinterpret_cast(normed_attn_out) : reinterpret_cast(normed_from_tensor); ++ T* attn_ws = attn_ws_offset + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ ++ T* attn2_out = reinterpret_cast(attn_ws) + GetAttnWorkspaceSize(param); ++ T* normed_from_tensor2 = reinterpret_cast(attn2_out) + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ T* normed_attn2_out = (param->layernorm_post) ? normed_from_tensor2 + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE * 2 : normed_from_tensor2; ++ T* attn2_ws_offset = (param->layernorm_post) ? 
reinterpret_cast(normed_attn2_out) : reinterpret_cast(normed_from_tensor2); ++ T* attn2_ws = attn2_ws_offset + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ ++ T* ffn_ws = normed_attn2_out + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ T* tmp_out = reinterpret_cast(output[0]); ++ if (std::is_same::value && param->ffn_fp16==true) { ++ tmp_out = ffn_ws + UP_DIV(param->batch_size * param->src_seq_len * param->ffn_hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ } ++ T* gamma1 = reinterpret_cast(inputs[param->in_idx++]); ++ T* beta1 = reinterpret_cast(inputs[param->in_idx++]); ++ invokeGeneralLayerNorm(normed_from_tensor, ++ reinterpret_cast(from_tensor), // from tensor ++ gamma1, // Gamma ++ beta1, // Beta ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps1); ++ inputs[--param->in_idx] = normed_from_tensor; ++ // if attention is embedded inside an decoder - fuse the bias to next layer normalization ++ int in_idx = param->in_idx; ++ std::cout<<"1"<(&inputs[param->in_idx]), in_len, &attn_out, 1, &(param->attn1), attn_ws); ++ std::cout<<"2"<in_idx = param->attn1.in_idx + in_idx; ++ if (param->projection_bias) { ++ T* projection_bias = reinterpret_cast(inputs[param->in_idx++]); ++ T* gamma2 = reinterpret_cast(inputs[param->in_idx++]); ++ T* beta2 = reinterpret_cast(inputs[param->in_idx++]); ++ if (param->layernorm_post == false) { ++ invokeGeneralAddBiasResidualPreLayerNorm(attn_out, ++ normed_attn_out, ++ from_tensor, ++ gamma2, // gamma ++ beta2, // beta ++ projection_bias, ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps2); ++ std::cout<<"3"<hidden_size, ++ param->stream, ++ param->eps2); ++ } ++ } ++ else { ++ // without projection bias ++ } ++ inputs[--param->in_idx] = normed_attn_out; ++ in_idx = param->in_idx; ++ forward_attn(reinterpret_cast(&inputs[param->in_idx]), in_len, &attn2_out, 1, &(param->attn2), attn2_ws); ++ std::cout<<"4"<in_idx = param->attn2.in_idx + in_idx; ++ if (param->projection_bias) { ++ T* projection_bias = reinterpret_cast(inputs[param->in_idx++]); ++ T* gamma3 = reinterpret_cast(inputs[param->in_idx++]); ++ T* beta3 = reinterpret_cast(inputs[param->in_idx++]); ++ if (std::is_same::value || param->ffn_fp16==false) { ++ invokeGeneralAddBiasResidualPreLayerNorm(attn2_out, ++ normed_attn2_out, ++ attn_out, ++ gamma3, // gamma ++ beta3, // beta ++ projection_bias, ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps3); ++ std::cout<<"5"<(attn2_out, ++ reinterpret_cast(normed_attn2_out), ++ attn_out, ++ gamma3, // gamma ++ beta3, // beta ++ projection_bias, ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps3); ++ ++ } ++ } ++ else { ++ // without projection bias ++ } ++ inputs[--param->in_idx] = normed_attn2_out; ++ if (param->ffn_fp16==false) { ++ forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, param, ffn_ws); ++ std::cout<<"6"<(reinterpret_cast(inputs), in_len, &tmp_out, 1, param, ffn_ws); ++ } ++ if (param->layernorm_post == true) { ++ if (std::is_same::value || param->ffn_fp16==false) { ++ invokeAddBiasResidual(reinterpret_cast(tmp_out), ++ normed_attn2_out, ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias ++ h_token_num, ++ param->hidden_size, ++ param->stream); ++ } ++ else { ++ invokeAddBiasResidualCast(reinterpret_cast(tmp_out), ++ reinterpret_cast(attn2_out), ++ reinterpret_cast(output[0]), ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias ++ h_token_num, ++ 
param->hidden_size, ++ param->stream); ++ } ++ } else { ++ if (std::is_same::value || param->ffn_fp16==false) { ++ invokeAddBiasResidual(reinterpret_cast(tmp_out), ++ attn2_out, ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias ++ h_token_num, ++ param->hidden_size, ++ param->stream); ++ } ++ else { ++ invokeAddBiasResidualCast(reinterpret_cast(tmp_out), ++ reinterpret_cast(attn2_out), ++ reinterpret_cast(output[0]), ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias ++ h_token_num, ++ param->hidden_size, ++ param->stream); ++ } ++ } ++ ++ return; ++} ++ ++template void ++forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, decoderParamT* param, void* ws); ++template void ++forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, decoderParamT* param, void* ws); ++ ++template ++void forward_attn(T* inputs[], int in_len, T* output[], int out_len, attentionParamT* param, void* ws) ++{ ++ param->in_idx = 0; ++ auto extra_tmp_size = ++ UP_DIV(param->batch_size * param->head_num * param->head_size * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t size_q = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t size_k = UP_DIV(param->batch_size * param->tgt_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t size_v = size_k; ++ size_t qkv_len = size_q + size_k + size_v; ++ size_t q_buf_2_len = size_q; ++ size_t qk_buf_len = ++ UP_DIV(param->batch_size * param->head_num * param->src_seq_len * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t qkv_buf_2_len = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t qkv_buf_3_len = qkv_buf_2_len; ++ auto buff_size = qkv_len + q_buf_2_len + qk_buf_len + qkv_buf_2_len + qkv_buf_3_len; ++ T* qkv_buf = (T*)ws; ++ T* q_buf_2 = static_cast(qkv_buf) + qkv_len; ++ T* qk_buf = static_cast(q_buf_2) + q_buf_2_len; ++ T* qkv_buf_2 = static_cast(qk_buf) + qk_buf_len; ++ T* qkv_buf_3 = static_cast(qkv_buf_2) + qkv_buf_2_len; ++ T* output1 = static_cast(ws) + buff_size; ++ T* output2 = static_cast(output1) + extra_tmp_size; ++ int gemm_dims[] = { ++ 3 * (int)param->hidden_size, (int)param->batch_size * (int)param->src_seq_len, (int)param->hidden_size}; ++ int gemm_lds[] = {3 * (int)param->hidden_size, (int)param->hidden_size, 3 * (int)param->hidden_size}; ++ T* from_tensor = reinterpret_cast(inputs[param->in_idx++]); ++ cublasOperation_t gemm_ops[] = {CUBLAS_OP_N, CUBLAS_OP_N}; ++ cudaDataType gemm_data_types[] = {CUDA_R_32F, CUDA_R_32F, CUDA_R_32F}; ++ if (std::is_same::value) { ++ gemm_data_types[0] = CUDA_R_16F; ++ gemm_data_types[1] = CUDA_R_16F; ++ gemm_data_types[2] = CUDA_R_16F; ++ } ++ T alpha = 1.0f; ++ T beta = 0.0f; ++ if (param->is_cross) { ++ gemm_dims[0] = param->hidden_size; ++ gemm_dims[1] = param->batch_size * param->src_seq_len; ++ gemm_dims[2] = param->hidden_size; ++ gemm_lds[0] = param->hidden_size; ++ gemm_lds[1] = param->hidden_size; ++ gemm_lds[2] = param->hidden_size; ++ T* decoder_output = reinterpret_cast(inputs[param->in_idx++]); ++ T* weight_q = reinterpret_cast(inputs[param->in_idx++]); ++ fastertransformer::CublasGemmWrapper(weight_q, ++ from_tensor, ++ qkv_buf, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_data_types, ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++ gemm_dims[0] = 2 * param->hidden_size; ++ gemm_dims[1] = param->batch_size * param->tgt_seq_len; ++ gemm_lds[0] = 2 * param->hidden_size; ++ gemm_lds[2] = 2 * param->hidden_size; ++ 
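// Cross attention: the K and V projections are fused into one GEMM over the memory
// (encoder-output) tensor, so the output width is 2 * hidden_size and the row count is
// batch_size * tgt_seq_len; the result is written into qkv_buf immediately after the Q
// projection computed above.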
T* weight_kv = reinterpret_cast(inputs[param->in_idx++]); ++ fastertransformer::CublasGemmWrapper(weight_kv, ++ decoder_output, ++ qkv_buf + (param->batch_size * param->src_seq_len) * param->hidden_size, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_data_types, ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++ T* bias_qkv = (param->qkv_bias) ? reinterpret_cast(inputs[param->in_idx++]) : nullptr; ++ invokeCrossAddFusedQKVBiasTranspose(q_buf_2, ++ output1, ++ output2, ++ qkv_buf, ++ bias_qkv, ++ param->batch_size, ++ param->src_seq_len, ++ param->tgt_seq_len, ++ param->head_num, ++ param->head_size, ++ param->stream); ++ } ++ else { ++ T* weight_qkv = reinterpret_cast(inputs[param->in_idx++]); ++ fastertransformer::CublasGemmWrapper(weight_qkv, ++ from_tensor, ++ qkv_buf, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ const_cast(gemm_data_types), ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++ ++ T* bias_qkv = (param->qkv_bias) ? reinterpret_cast(inputs[param->in_idx++]) : nullptr; ++ fastertransformer::invokeAddFusedQKVBiasTranspose(static_cast(q_buf_2), ++ static_cast(output1), ++ static_cast(output2), ++ static_cast(qkv_buf), ++ bias_qkv, ++ param->batch_size, ++ param->src_seq_len, ++ param->head_num, ++ param->head_size, ++ 0, ++ param->stream); ++ ++ } ++ gemm_ops[0] = CUBLAS_OP_T; ++ ++ gemm_lds[0] = param->head_size; ++ gemm_lds[1] = param->head_size; ++ gemm_lds[2] = param->tgt_seq_len; ++ ++ int gemm_strides[] = {(int)(param->tgt_seq_len * param->head_size), ++ (int)(param->src_seq_len * param->head_size), ++ (int)(param->src_seq_len * param->tgt_seq_len)}; ++ ++ gemm_dims[0] = param->tgt_seq_len; ++ gemm_dims[1] = param->src_seq_len; ++ gemm_dims[2] = param->head_size; ++ ++ fastertransformer::CublasGemmStridedBatchedWrapper(output1, ++ q_buf_2, ++ qk_buf, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_strides, ++ const_cast(gemm_data_types), ++ &alpha, ++ &beta, ++ param->batch_size * param->head_num, ++ param->cublas_handle, ++ param->algo); ++ ++ T* attention_mask = reinterpret_cast(inputs[param->in_idx++]); ++ T* position_bias = nullptr; ++ if (param->position_bias) { ++ position_bias = reinterpret_cast(inputs[param->in_idx++]); ++ } ++ T scalar = static_cast(1.0f / sqrtf(param->head_size * 1.0f)); ++ fastertransformer::invokeMixMaskedSoftMax(static_cast(qk_buf), ++ attention_mask, ++ position_bias, ++ param->batch_size, ++ param->src_seq_len, ++ param->tgt_seq_len, ++ param->head_num, ++ scalar, ++ param->stream); ++ ++ gemm_ops[0] = CUBLAS_OP_N; ++ gemm_ops[1] = CUBLAS_OP_N; ++ gemm_dims[0] = param->head_size; ++ gemm_dims[1] = param->src_seq_len; ++ gemm_dims[2] = param->tgt_seq_len; + -+template -+void MSMHALayer::forward(std::vector* output_tensors, -+ const std::vector* input_tensors, -+ const AttentionWeight* attention_weights) -+{ -+ // input_tensors: use 1 gemm -- multi head attention -+ // input_query [batch_size * seq_len, hidden_dimension] -+ // attention_mask [batch_size, 1, seq_len, seq_len] ++ gemm_lds[0] = param->head_size; ++ gemm_lds[1] = param->tgt_seq_len; ++ gemm_lds[2] = param->head_size; + -+ // input_tensors: use 2 gemm -- cross attention -+ // input_query [batch_size * seq_len, hidden_dimension] -+ // enc_output [batch_size * tgt_len, hidden_dimension] -+ // attention_mask [batch_size, 1, seq_len, seq_len] ++ gemm_strides[0] = param->tgt_seq_len * param->head_size; ++ gemm_strides[1] = param->src_seq_len * param->tgt_seq_len; ++ gemm_strides[2] = param->src_seq_len * param->head_size; ++ 
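// Attention context: per head, multiply the masked/softmaxed scores in qk_buf
// ([src_seq_len x tgt_seq_len]) by V in output2 ([tgt_seq_len x head_size]), batched over
// batch_size * head_num heads; the result lands in qkv_buf_2 and is brought back to a
// [batch, seq_len, hidden] layout by invokeTransposeQKV below.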
fastertransformer::CublasGemmStridedBatchedWrapper(output2, ++ qk_buf, ++ qkv_buf_2, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_strides, ++ const_cast(gemm_data_types), ++ &alpha, ++ &beta, ++ param->batch_size * param->head_num, ++ param->cublas_handle, ++ param->algo); ++ invokeTransposeQKV(static_cast(qkv_buf_3), ++ static_cast(qkv_buf_2), ++ param->batch_size, ++ param->src_seq_len, ++ param->head_num, ++ param->head_size, ++ param->stream); + -+ // output_tensors: -+ // attention_out [batch_size * seq_len, hidden_dimension] -+ // key_cache [batch, local_head_num, size_per_head // x, max_seq_len, x] -+ // value_cache [batch, local_head_num, max_seq_len, size_per_head] ++ gemm_ops[0] = CUBLAS_OP_N; ++ gemm_ops[1] = CUBLAS_OP_N; ++ gemm_dims[0] = param->hidden_size; ++ gemm_dims[1] = param->batch_size * param->src_seq_len; ++ gemm_dims[2] = param->hidden_size; + -+ int in_tensor_number = input_tensors->size(); -+ allocateBuffer(); // only once -+ if (params_.position_bias) -+ if (params_.is_cross) { -+ void* outputs[] = {(void*)output_tensors->at(0).data}; -+ void* inputs[] = {(void*)input_tensors->at(0).data, -+ (void*)input_tensors->at(1).data, -+ (void*)attention_weights->query_weight.kernel, -+ (void*)attention_weights->key_weight.kernel, -+ (void*)input_tensors->at(2).data, -+ (void*)input_tensors->at(3).data, -+ (void*)attention_weights->attention_output_weight.kernel}; ++ gemm_lds[0] = param->hidden_size; ++ gemm_lds[1] = param->hidden_size; ++ gemm_lds[2] = param->hidden_size; + -+ forward_attn((T**)inputs, 7, (T**)outputs, 1, ¶ms_, (void*)buf_); -+ } -+ else { -+ void* outputs[] = {(void*)output_tensors->at(0).data}; -+ void* inputs[] = { -+ (void*)input_tensors->at(0).data, -+ (void*)attention_weights->query_weight.kernel, -+ (void*)input_tensors->at(1).data, -+ (void*)input_tensors->at(2).data, -+ (void*)attention_weights->attention_output_weight.kernel -+ }; -+ forward_attn((T**)inputs, 5, (T**)outputs, 1, ¶ms_, (void*)buf_); -+ } -+ else { -+ if (params_.is_cross) { -+ void* outputs[] = {(void*)output_tensors->at(0).data}; -+ void* inputs[] = {(void*)input_tensors->at(0).data, -+ (void*)input_tensors->at(1).data, -+ (void*)attention_weights->query_weight.kernel, -+ (void*)attention_weights->key_weight.kernel, -+ (void*)attention_weights->query_weight.bias, -+ (void*)input_tensors->at(2).data, -+ (void*)attention_weights->attention_output_weight.kernel, -+ (void*)attention_weights->attention_output_weight.bias -+ }; -+ forward_attn((T**)inputs, 8, (T**)outputs, 1, ¶ms_, (void*)buf_); -+ } else { -+ void* outputs[] = {(void*)output_tensors->at(0).data}; -+ void* inputs[] = {(void*)input_tensors->at(0).data, -+ (void*)attention_weights->query_weight.kernel, -+ (void*)attention_weights->query_weight.bias, -+ (void*)input_tensors->at(1).data, -+ (void*)attention_weights->attention_output_weight.kernel, -+ (void*)attention_weights->attention_output_weight.bias}; -+ forward_attn((T**)inputs, 6, (T**)outputs, 1, ¶ms_, (void*)buf_); -+ } ++ fastertransformer::CublasGemmWrapper(reinterpret_cast(inputs[param->in_idx++]), ++ qkv_buf_3, ++ static_cast(output[0]), ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ const_cast(gemm_data_types), ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++ ++ if (param->projection_bias) { ++ int len = param->batch_size * param->src_seq_len; ++ invokeAddBias( ++ static_cast(output[0]), (const T*)(inputs[param->in_idx++]), len, param->hidden_size, param->stream); + } ++ return; +} + -+template -+MSMHALayer::~MSMHALayer() -+{ -+ 
cublas_wrapper_ = nullptr; -+ freeBuffer(); -+} ++template void ++forward_attn(float* inputs[], int in_len, float* output[], int out_len, attentionParamT* param, void* ws); ++template void ++forward_attn(half* inputs[], int in_len, half* output[], int out_len, attentionParamT* param, void* ws); + -+template -+void MSMHALayer::freeBuffer() -+{ -+ if (buf_ != nullptr) { -+ allocator_->free(buf_); -+ buf_ = nullptr; -+ } -+} ++template void ++forward_ffn(float* inputs[], int in_len, float* output[], int out_len, decoderParamT* param, void* ws); ++template void ++forward_ffn(half* inputs[], int in_len, half* output[], int out_len, decoderParamT* param, void* ws); ++template void ++forward_ffn(float* inputs[], int in_len, float* output[], int out_len, decoderParamT* param, void* ws); ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/decoder_layers/decoder.h b/src/fastertransformer/layers/decoder_layers/decoder.h +new file mode 100644 +index 0000000..750b9b0 +--- /dev/null ++++ b/src/fastertransformer/layers/decoder_layers/decoder.h +@@ -0,0 +1,70 @@ ++#pragma once + -+template class MSMHALayer; -+template class MSMHALayer; -+template class MSMHALayer; -+template class MSMHALayer; -+template class MSMHALayer; -+template class MSMHALayer; -+template class MSMHALayer; -+template class MSMHALayer; ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/layers/decoder_layers/BaseDecoderLayer.h" ++#include ++#include + - } // namespace fastertransformer -diff --git a/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h b/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h -old mode 100644 -new mode 100755 -index 92e2175..f7fa5ca ---- a/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h -+++ b/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h -@@ -18,7 +18,7 @@ - #pragma once - - #include "src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h" -- -+#include "src/fastertransformer/layers/encoder_layers/encoder.h" - namespace fastertransformer { - - template -@@ -107,4 +107,44 @@ public: - const AttentionWeight* attention_weights) override; - }; - ++namespace fastertransformer { + -+// TODO(haim): Add template according to "mix" compute type (fp32, fp16) -+template -+class MSMHALayer: public BaseAttentionLayer { -+private: -+ void allocateBuffer() override; -+ void freeBuffer() override; + -+ using BaseAttentionLayer::is_free_buffer_after_forward_; -+ using BaseAttentionLayer::is_allocate_buffer_; -+ using BaseAttentionLayer::cublas_wrapper_; -+ using BaseAttentionLayer::allocator_; ++typedef struct { ++ size_t batch_size; ++ size_t src_seq_len; ++ size_t tgt_seq_len; ++ size_t head_num; ++ size_t head_size; ++ size_t hidden_size; ++ size_t h_token_num; ++ // handle ++ cublasHandle_t cublas_handle; ++ cudaStream_t stream; ++ cublasGemmAlgo_t algo; ++ // ctrls ++ int in_idx; ++ bool qkv_bias; // ture ++ bool projection_bias; // ture ++ bool is_cross; // false ++ bool position_bias; ++ int *padding_offset; ++} attentionParamT; + -+protected: -+ using BaseAttentionLayer::stream_; -+ using BaseAttentionLayer::sparse_; -+ T* buf_ = nullptr; -+ encoderParamT params_; ++typedef struct { ++ size_t batch_size; ++ size_t src_seq_len; ++ size_t tgt_seq_len; ++ size_t head_num; ++ size_t head_size; ++ size_t hidden_size; ++ size_t h_token_num; ++ size_t ffn_hidden_size; // 4 * param->hidden_size; ++ bool ffn_fp16; ++ float eps1; ++ float eps2; ++ 
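// eps1/eps2/eps3 are the layer-norm epsilons consumed in forwardDecoder: eps1 for the norm
// before self-attention, eps2 for the norm after the self-attention residual, and eps3 for
// the norm after the cross-attention residual (before the FFN).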
float eps3; ++ // handle ++ cublasHandle_t cublas_handle; ++ cudaStream_t stream; ++ cublasGemmAlgo_t algo; ++ // ctrls ++ bool projection_bias; // ture + -+public: -+ MSMHALayer(size_t batch_size, -+ size_t src_seq_len, -+ size_t tgt_seq_len, -+ size_t head_num, -+ size_t size_per_head, -+ cudaStream_t stream, -+ cublasMMWrapper* cublas_wrapper, -+ IAllocator* allocator, -+ bool is_free_buffer_after_forward, -+ bool is_qk_buf_float, -+ bool is_cross, -+ bool sparse = false, -+ bool is_position_bias=false); -+ MSMHALayer(MSMHALayer const& attention_layer); -+ virtual ~MSMHALayer(); -+ void forward(std::vector* output_tensors, -+ const std::vector* input_tensors, -+ const AttentionWeight* attention_weights) override; -+}; ++ int in_idx; ++ mutable attentionParamT attn1; ++ mutable attentionParamT attn2; ++ bool layernorm_post; ++ int *padding_offset; ++} decoderParamT; + - } // namespace fastertransformer ++template ++size_t GetDecoderLayerWorkspaceSize(decoderParamT* param); ++ ++template ++size_t GetAttnWorkspaceSize(decoderParamT* param); ++template ++void forward_attn(T* inputs[], int in_len, T* output[], int out_len, decoderParamT* param, void* ws); ++template ++void forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, decoderParamT* param, void* ws); ++// void forwardDecoder(std::vector > const* ++// inputs); ++} // namespace fastertransformer diff --git a/src/fastertransformer/layers/encoder_layers/BaseEncoderLayer.h b/src/fastertransformer/layers/encoder_layers/BaseEncoderLayer.h new file mode 100644 index 0000000..3b43391 @@ -5872,10 +7177,10 @@ index 0000000..c441b23 +} // namespace fastertransformer diff --git a/src/fastertransformer/layers/encoder_layers/MSEncoderLayer.cc b/src/fastertransformer/layers/encoder_layers/MSEncoderLayer.cc new file mode 100644 -index 0000000..a3442da +index 0000000..6c8f8d7 --- /dev/null +++ b/src/fastertransformer/layers/encoder_layers/MSEncoderLayer.cc -@@ -0,0 +1,164 @@ +@@ -0,0 +1,165 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. @@ -5927,6 +7232,7 @@ index 0000000..a3442da + float eps1, + float eps2, + bool post_layernorm, ++ bool is_ffn_fp16, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + cublasHandle_t* cublas_handle, @@ -5950,7 +7256,7 @@ index 0000000..a3442da + // handle + params_.cublas_handle = *cublas_handle; + params_.stream = stream; -+ params_.ffn_fp16 = true; ++ params_.ffn_fp16 = is_ffn_fp16; + // ctrls + params_.in_idx = 0; + params_.qkv_bias = true; @@ -6042,10 +7348,10 @@ index 0000000..a3442da +} // namespace fastertransformer diff --git a/src/fastertransformer/layers/encoder_layers/MSEncoderLayer.h b/src/fastertransformer/layers/encoder_layers/MSEncoderLayer.h new file mode 100644 -index 0000000..afc6a5a +index 0000000..af2a82c --- /dev/null +++ b/src/fastertransformer/layers/encoder_layers/MSEncoderLayer.h -@@ -0,0 +1,69 @@ +@@ -0,0 +1,70 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. 
@@ -6097,6 +7403,7 @@ index 0000000..afc6a5a + float eps1, + float eps2, + bool post_layernorm, ++ bool is_ffn_fp16, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + cublasHandle_t* cublas_handle, @@ -6117,10 +7424,10 @@ index 0000000..afc6a5a +} // namespace fastertransformer diff --git a/src/fastertransformer/layers/encoder_layers/encoder.cc b/src/fastertransformer/layers/encoder_layers/encoder.cc new file mode 100644 -index 0000000..b106b55 +index 0000000..f33eeb2 --- /dev/null +++ b/src/fastertransformer/layers/encoder_layers/encoder.cc -@@ -0,0 +1,632 @@ +@@ -0,0 +1,647 @@ + +#include "src/fastertransformer/layers/encoder_layers/encoder.h" +#include "src/fastertransformer/kernels/activation_kernels.h" @@ -6291,10 +7598,15 @@ index 0000000..b106b55 + size_t size_k = UP_DIV((param->batch_size * param->tgt_seq_len * param->hidden_size), ALIGN_SIZE) * ALIGN_SIZE; + size_t size_v = size_k; + size_t qkv_len = size_q + size_k + size_v; -+ size_t qk_buf_len = UP_DIV(param->batch_size * param->head_num * param->src_seq_len * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t q_buf_2_len = size_q; ++ size_t qk_buf_len = ++ UP_DIV(param->batch_size * param->head_num * param->src_seq_len * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; + size_t qkv_buf_2_len = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; -+ size_t attn_out_size = UP_DIV(param->batch_size * param->head_num * param->head_size * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; -+ return (qkv_buf_2_len + 2 * attn_out_size + std::max(qkv_len, qk_buf_len)) * sizeof(T); ++ size_t qkv_buf_3_len = qkv_buf_2_len; ++ size_t attn_out_size = ++ UP_DIV(param->batch_size * param->head_num * param->head_size * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; ++ return (qkv_len + q_buf_2_len + qk_buf_len + qkv_buf_2_len + qkv_buf_3_len + 2 * attn_out_size) * sizeof(T); ++ +} + +template size_t GetAttnWorkspaceSize(encoderParamT* param); @@ -6304,7 +7616,9 @@ index 0000000..b106b55 +{ + size_t attn_out = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE;; + size_t ffn = UP_DIV(param->batch_size * param->src_seq_len * param->ffn_hidden_size, ALIGN_SIZE) * ALIGN_SIZE; -+ return (std::max(GetAttnWorkspaceSize(param), ffn * sizeof(T)) + (attn_out * 3) * sizeof(T)); ++ size_t ffn_size = (param->layernorm_post) ? ffn : (attn_out + ffn); ++ size_t out_size = (param->layernorm_post) ? 
attn_out : attn_out * 2; ++ return (std::max(GetAttnWorkspaceSize(param), ffn_size * sizeof(T)) + out_size * sizeof(T)); +} + +template size_t GetEncoderLayerWorkspaceSize(encoderParamT* param); @@ -6367,7 +7681,7 @@ index 0000000..b106b55 +void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, encoderParamT* param, void* ws) +{ + param->in_idx = 0; -+ ++ std::cout<<"fp16"<ffn_fp16<batch_size * param->src_seq_len; + T* from_tensor = reinterpret_cast(inputs[param->in_idx++]); + T* attn_out = reinterpret_cast(ws); @@ -6533,17 +7847,23 @@ index 0000000..b106b55 + auto extra_tmp_size = + UP_DIV(param->batch_size * param->head_num * param->head_size * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; + size_t size_q = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t size_k = UP_DIV(param->batch_size * param->tgt_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t size_v = size_k; ++ ++ size_t qkv_len = size_q + size_k + size_v; + size_t q_buf_2_len = size_q; + size_t qk_buf_len = + UP_DIV(param->batch_size * param->head_num * param->src_seq_len * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; + size_t qkv_buf_2_len = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; -+ T* q_buf_2 = (T*)ws; -+ T* output1 = static_cast(ws) + q_buf_2_len; ++ size_t qkv_buf_3_len = qkv_buf_2_len; ++ auto buff_size = qkv_len + q_buf_2_len + qk_buf_len + qkv_buf_2_len + qkv_buf_3_len; ++ T* qkv_buf = (T*)ws; ++ T* q_buf_2 = static_cast(qkv_buf) + qkv_len; ++ T* qk_buf = static_cast(q_buf_2) + q_buf_2_len; ++ T* qkv_buf_2 = static_cast(qk_buf) + qk_buf_len; ++ T* qkv_buf_3 = static_cast(qkv_buf_2) + qkv_buf_2_len; ++ T* output1 = static_cast(ws) + buff_size; + T* output2 = static_cast(output1) + extra_tmp_size; -+ T* qkv_buf = static_cast(output2) + extra_tmp_size; -+ T* qk_buf = qkv_buf; -+ T* qkv_buf_2 = q_buf_2; -+ T* qkv_buf_3 = qk_buf; + int gemm_dims[] = { + 3 * (int)param->hidden_size, (int)param->batch_size * (int)param->src_seq_len, (int)param->hidden_size}; + int gemm_lds[] = {3 * (int)param->hidden_size, (int)param->hidden_size, 3 * (int)param->hidden_size}; @@ -6637,10 +7957,6 @@ index 0000000..b106b55 + param->stream); + } + gemm_ops[0] = CUBLAS_OP_T; -+ gemm_ops[1] = CUBLAS_OP_N; -+ gemm_dims[0] = param->tgt_seq_len; -+ gemm_dims[1] = param->src_seq_len; -+ gemm_dims[2] = param->head_size; + + gemm_lds[0] = param->head_size; + gemm_lds[1] = param->head_size; @@ -6650,6 +7966,10 @@ index 0000000..b106b55 + (int)(param->src_seq_len * param->head_size), + (int)(param->src_seq_len * param->tgt_seq_len)}; + ++ gemm_dims[0] = param->tgt_seq_len; ++ gemm_dims[1] = param->src_seq_len; ++ gemm_dims[2] = param->head_size; ++ + CublasGemmStridedBatchedWrapper(output1, + q_buf_2, + qk_buf, @@ -6679,6 +7999,7 @@ index 0000000..b106b55 + param->head_num, + scalar, + param->stream); ++ + gemm_ops[0] = CUBLAS_OP_N; + gemm_ops[1] = CUBLAS_OP_N; + gemm_dims[0] = param->head_size; @@ -6733,6 +8054,7 @@ index 0000000..b106b55 + &beta, + param->cublas_handle, + param->algo); ++ + if (param->projection_bias) { + int len = param->batch_size * param->src_seq_len; + invokeAddBias( @@ -6755,10 +8077,10 @@ index 0000000..b106b55 +} // namespace fastertransformer diff --git a/src/fastertransformer/layers/encoder_layers/encoder.h b/src/fastertransformer/layers/encoder_layers/encoder.h new file mode 100644 -index 0000000..8a82c28 +index 0000000..0caaed1 --- /dev/null +++ 
b/src/fastertransformer/layers/encoder_layers/encoder.h -@@ -0,0 +1,47 @@ +@@ -0,0 +1,48 @@ +#pragma once + +#include "src/fastertransformer/kernels/activation_kernels.h" @@ -6793,7 +8115,8 @@ index 0000000..8a82c28 + bool layernorm_post; // dont care + int *padding_offset; +} encoderParamT; -+ ++void CublasGemmWrapper(const void* a_addr, const void* b_addr, void* c_addr, const int* params, const int* lds, const cublasOperation_t* operations, const cudaDataType* data_types, void* alpha, void* beta, cublasHandle_t cublas_handle, cublasGemmAlgo_t algo); ++void CublasGemmStridedBatchedWrapper(const void* a_addr, const void* b_addr, void* c_addr, const int* params, const int* lds, const cublasOperation_t* operations, const int* strides, const cudaDataType* data_types, void* alpha, void* beta, int batch, cublasHandle_t cublas_handle, cublasGemmAlgo_t algo); +template +size_t GetEncoderLayerWorkspaceSize(encoderParamT* param); + diff --git a/trc/transformer/cfg_bert.config b/trc/transformer/cfg_bert.config old mode 100644 new mode 100755 index dfedd52066a..bcd7688bb3e --- a/trc/transformer/cfg_bert.config +++ b/trc/transformer/cfg_bert.config @@ -1,2 +1,2 @@ [gpu_context] -input_shape=input_ids:[1,512];token_type_ids:[1,512];input_mask:[1,512] +input_shape=input_ids:[1,512];token_type_ids:[1,512];input_mask:[1,512] \ No newline at end of file diff --git a/trc/transformer/deploy.sh b/trc/transformer/deploy.sh index e08f9803b80..d82d9a7b26c 100755 --- a/trc/transformer/deploy.sh +++ b/trc/transformer/deploy.sh @@ -21,7 +21,7 @@ model=$(echo ${model}| sed 's/_fwd//') model=$(echo ${model}| sed 's/_graph//') batch_size=$(echo ${model}| sed 's/bert//') echo "model=${model}" -model_name=$(echo ${model}) + model_name=$(echo ${model}) if [[ "$batch_size" != "${model}" ]];then model_name='bert' fi diff --git a/trc/transformer/ftBench.py b/trc/transformer/ftBench.py index 6f49b2dbc81..f80c7c47d93 100755 --- a/trc/transformer/ftBench.py +++ b/trc/transformer/ftBench.py @@ -91,7 +91,7 @@ for line_model_arg in models_arg: if model_name=="bert": if batch_size!='1': model_name+=batch_size - # os.system(f"rm -rf {base}/trc/transformer/{model_name}* {base}/trc/transformer/convv_{model_name}*") + os.system(f"rm -rf {base}/trc/transformer/{model_name}* {base}/trc/transformer/convv_{model_name}*") ret = os.system(f"docker run --user \"$(id -u):$(id -g)\" -w {base}/trc/transformer --runtime=nvidia -v {base}/../:{base}/../ -v /opt/share:/opt/share --privileged=true {image} python {base}/trc/transformer/train_transformer_export.py {line_model_arg} " ) ret=0 if ret != 0: exit() @@ -125,7 +125,7 @@ for line_model_arg in models_arg: os.system(f"ssh {server} 'cd {system}/.. 
&& tar -xzf {system}/../mindspore-lite-{version}-linux-x64.tar.gz'") os.system(f"rsync -v {base}/trc/transformer/*{model_name}* {server}:{base}/trc/transformer/") os.system(f"./deploy.sh convv_{model_name}_fwd.mindir") - #os.system(f"ssh {server} 'cd {benchmark} && CUDA_VISIBLE_DEVICES={cuda_visible_dev} LD_LIBRARY_PATH={system}/runtime/lib:{system}/tools/converter/lib ./benchmark {benchmark_args}'" ) + os.system(f"ssh {server} 'cd {benchmark} && CUDA_VISIBLE_DEVICES={cuda_visible_dev} LD_LIBRARY_PATH={system}/runtime/lib:{system}/tools/converter/lib ./benchmark {benchmark_args}'" ) elif app=='trc': #if loop count =1 app=be else app = runtime diff --git a/trc/transformer/models.txt b/trc/transformer/models.txt index 652610433db..dfbfe3a50c4 100755 --- a/trc/transformer/models.txt +++ b/trc/transformer/models.txt @@ -11,15 +11,15 @@ #-b 32 -l 12 -H 12 -S 768 -s 128 -P 0 -f 3072 -m bert #-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -f 3072 -m bert --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_encoder_layer +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -m transformer_decoder_layer #-b 1 -l 66 -s 20 -H 3 -S 15 -p 0 -m mha_x1 -#-b 1 -l 24 -H 16 -S 1024 -s 512 -P 1 -m bert -#-b 8 -l 24 -H 16 -S 1024 -s 512 -P 1 -m bert -#-b 16 -l 24 -H 16 -S 1024 -s 512 -P 1 -m bert -#-b 32 -l 24 -H 16 -S 1024 -s 512 -P 1 -m bert +#-b 1 -l 24 -H 16 -S 1024 -s 128 -P 1 -m bert +#-b 8 -l 24 -H 16 -S 1024 -s 128 -P 1 -m bert +#-b 16 -l 24 -H 16 -S 1024 -s 128 -P 1 -m bert +#-b 32 -l 24 -H 16 -S 1024 -s 128 -P 1 -m bert --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer #-b 1 -l 66 -s 1 -H 8 -S 512 -p 0 -m mha_x1 #-b 3 -l 66 -s 20 -H 3 -S 15 -p -m mha_x2 #-b 3 -l 66 -s 20 -t 40 -H 3 -S 15 -p 0 -m mha_x1 diff --git a/trc/transformer/train_transformer_export.py b/trc/transformer/train_transformer_export.py index 624d6fd2feb..7d4d1ef2c54 100755 --- a/trc/transformer/train_transformer_export.py +++ b/trc/transformer/train_transformer_export.py @@ -54,8 +54,6 @@ M.context.set_context(mode=M.context.GRAPH_MODE,device_target="GPU", save_graphs # # y = model(encoder_input_value, encoder_input_mask)# _cell_graph_executor.compile(model, encoder_input_value, encoder_input_mask) # for i in range (2): # y = model(encoder_input_value, encoder_input_mask, decoder_input_value, decoder_input_mask, memory_mask) - -# # print("y=", y) # export(model, encoder_input_value, encoder_input_mask, decoder_input_value, decoder_input_mask, memory_mask, file_name= name + "_fwd", file_format='MINDIR') def get_gpu_memory(): @@ -406,25 +404,25 @@ def transformer_decoder_layer_t5_create(): suffix = suffix[-2:] print('qt2=',qt2[0]) - saveT(gl1, name + "_weight1.fp" + suffix) - saveT(bl1, name + "_weight2.fp" + suffix) - saveT(wt, name + "_weight3.fp" + suffix) - saveT(bt, name + "_weight4.fp" + suffix) - saveT(wp, name + "_weight5.fp" + suffix) - saveT(bp, name + "_weight6.fp" + suffix) - saveT(gl2, name + "_weight7.fp" + suffix) - saveT(bl2, name + "_weight8.fp" + suffix) - saveT(qt2, name + "_weight9.fp" + suffix) - saveT(wt2, name + "_weight10.fp" + suffix) - saveT(bt2, name + "_weight11.fp" + suffix) - saveT(wp2, name + "_weight12.fp" + suffix) - saveT(bp2, name + "_weight13.fp" + suffix) - saveT(gl3, name + "_weight14.fp" + suffix) - saveT(bl3, name + "_weight15.fp" + suffix) - saveT(omw, name + "_weight16.fp" + suffix) - saveT(omb, name + "_weight17.fp" + suffix) - saveT(opw, name + "_weight18.fp" + suffix) - saveT(opb, name + "_weight19.fp" + suffix) + 
# saveT(gl1, name + "_weight1.fp" + suffix) + # saveT(bl1, name + "_weight2.fp" + suffix) + # saveT(wt, name + "_weight3.fp" + suffix) + # saveT(bt, name + "_weight4.fp" + suffix) + # saveT(wp, name + "_weight5.fp" + suffix) + # saveT(bp, name + "_weight6.fp" + suffix) + # saveT(gl2, name + "_weight7.fp" + suffix) + # saveT(bl2, name + "_weight8.fp" + suffix) + # saveT(qt2, name + "_weight9.fp" + suffix) + # saveT(wt2, name + "_weight10.fp" + suffix) + # saveT(bt2, name + "_weight11.fp" + suffix) + # saveT(wp2, name + "_weight12.fp" + suffix) + # saveT(bp2, name + "_weight13.fp" + suffix) + # saveT(gl3, name + "_weight14.fp" + suffix) + # saveT(bl3, name + "_weight15.fp" + suffix) + # saveT(omw, name + "_weight16.fp" + suffix) + # saveT(omb, name + "_weight17.fp" + suffix) + # saveT(opw, name + "_weight18.fp" + suffix) + # saveT(opb, name + "_weight19.fp" + suffix) suffix = str(compute_type) suffix = suffix[-2:] @@ -636,7 +634,6 @@ def test_multihead_attention(): def saveT(t,file): x = t.asnumpy() - print('x=',x) x.tofile(file) def saveTensorToHalf(t,file): -- Gitee From 146b4fa014aefdb2d9a6e762d0e3056c96f976c1 Mon Sep 17 00:00:00 2001 From: shira zaloshinki Date: Mon, 2 Jan 2023 10:54:06 +0200 Subject: [PATCH 08/39] update the params in decoder_tensorrt --- .../delegate/tensorrt/op/decoder_tensorrt.cc | 34 +++++++++++++++---- trc/transformer/t.config | 3 +- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc index 2e65299c484..17e382435af 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc @@ -104,23 +104,35 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { memset_s(¶ms, sizeof(params), 0, sizeof(params)); params.head_num = decoder_op->get_head_num(); params.head_size = decoder_op->get_head_size(); + params.hidden_size = params.head_num * params.head_size; params.layernorm_post = decoder_op->get_post_layernorm(); params.eps1 = decoder_op->get_eps_layernorm1(); params.eps2 = decoder_op->get_eps_layernorm2(); params.eps3 = decoder_op->get_eps_layernorm3(); params.ffn_hidden_size = decoder_op->get_ffn_hidden_size(); params.ffn_fp16 = is_ffn_fp16_; - params.attn1.position_bias = decoder_op->get_position_bias1(); - params.attn2.position_bias = decoder_op->get_position_bias2(); params.cublas_handle=GetCublasHandle(); + params.projection_bias = true; + + + params.attn1.head_num = params.head_num; + params.attn1.head_size = params.head_size; + params.attn1.hidden_size = params.hidden_size; + params.attn1.position_bias = decoder_op->get_position_bias1(); params.attn1.qkv_bias = !params.attn1.position_bias; - params.attn2.qkv_bias = !params.attn2.position_bias; params.attn1.projection_bias = !params.attn1.position_bias; - params.attn2.projection_bias = !params.attn2.position_bias; params.attn1.is_cross = false; + params.attn1.cublas_handle=GetCublasHandle(); + + params.attn2.head_num = params.head_num; + params.attn2.head_size = params.head_size; + params.attn2.hidden_size = params.hidden_size; + params.attn2.position_bias = decoder_op->get_position_bias2(); + params.attn2.qkv_bias = !params.attn2.position_bias; + params.attn2.projection_bias = !params.attn2.position_bias; params.attn2.is_cross = true; + params.attn2.cublas_handle=GetCublasHandle(); - params.hidden_size = params.head_num * params.head_size; auto compute_type = 
runtime_->GetRuntimePrecisionMode(); if (is_ffn_fp16_) { size_t start_fp16 = C15NUM; @@ -179,8 +191,12 @@ template int DecoderPlugin::RunCudaDecoder(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream, cublasGemmAlgo_t algoId) { - params_.stream = stream; + params_.stream = stream; params_.algo = algoId; + params_.attn1.stream = stream; + params_.attn1.algo = algoId; + params_.attn2.stream = stream; + params_.attn2.algo = algoId; void *inputs_forward[] = { const_cast(inputs[0]), const_cast(inputs[1]), const_cast(inputs[2]), const_cast(inputs[3]), const_cast(inputs[4]), const_cast(inputs[5]), @@ -213,6 +229,12 @@ void DecoderPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, params_.batch_size = request_batch_size; params_.src_seq_len = request_src_seq_len; params_.tgt_seq_len = request_tgt_seq_len; + params_.attn1.batch_size = request_batch_size; + params_.attn1.src_seq_len = request_src_seq_len; + params_.attn1.tgt_seq_len = request_tgt_seq_len; + params_.attn2.batch_size = request_batch_size; + params_.attn2.src_seq_len = request_src_seq_len; + params_.attn2.tgt_seq_len = request_tgt_seq_len; num_of_inputs_ = nbInputs; num_of_outputs_ = nbOutputs; } diff --git a/trc/transformer/t.config b/trc/transformer/t.config index 4c65baa6220..f26391171a8 100755 --- a/trc/transformer/t.config +++ b/trc/transformer/t.config @@ -1,4 +1,3 @@ [registry] #fusion_blacklists="MultiHeadAttentionFusion" -#fusion_blacklists="EncoderLayerFusion -#fusion_blacklists="DecoderLayerFusion" +fusion_blacklists="EncoderLayerFusion", "DecoderLayerFusion" -- Gitee From efada0e404ad7816566d80a84989f30abf5383bb Mon Sep 17 00:00:00 2001 From: batya kroizer Date: Mon, 2 Jan 2023 10:56:29 +0200 Subject: [PATCH 09/39] for merge --- .../core/load_mindir/anf_model_parser.cc | 20 +- .../delegate/tensorrt/op/decoder_tensorrt.cc | 285 ++++++++++++++++++ .../delegate/tensorrt/op/decoder_tensorrt.h | 107 +++++++ .../delegate/tensorrt/op/encoder_tensorrt.h | 5 +- trc/transformer/cfg_bert.config | 2 +- trc/transformer/deploy.sh | 31 +- trc/transformer/ftBench.py | 4 +- trc/transformer/models.txt | 7 +- trc/transformer/t.config | 3 +- trc/transformer/train_transformer_export.py | 19 -- 10 files changed, 444 insertions(+), 39 deletions(-) mode change 100644 => 100755 mindspore/core/load_mindir/anf_model_parser.cc create mode 100755 mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc create mode 100644 mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h mode change 100644 => 100755 mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h diff --git a/mindspore/core/load_mindir/anf_model_parser.cc b/mindspore/core/load_mindir/anf_model_parser.cc old mode 100644 new mode 100755 index 833a3bf74d1..65fb44d1f6d --- a/mindspore/core/load_mindir/anf_model_parser.cc +++ b/mindspore/core/load_mindir/anf_model_parser.cc @@ -25,6 +25,15 @@ #include #include #include +#ifdef __has_include +#if __has_include() +#include +namespace fs = std :: filesystem ; +#else +#include +namespace fs = std :: experimental :: filesystem ; +#endif +#endif #include "ir/tensor.h" #include "ir/param_info.h" #include "ops/primitive_c.h" @@ -508,7 +517,16 @@ bool MSANFModelParser::GetTensorDataFromExternal(const mind_ir::TensorProto &ten if (it != tenor_data_.end()) { data = it->second.get(); } else { - std::string file = mindir_path_ + "/" + tensor_proto.external_data().location(); + fs 
:: path path { mindir_path_ }; + std::string convv = "convv"; + std::string file; + if (mindir_path_.find(convv) != std::string::npos) { + file = path.root_directory ( ) . string ( ) + path.stem ( ). string ( ) + "_variables" + "/" + + tensor_proto . external_data (). location (); + } else { + + file = mindir_path_ + "/" + tensor_proto.external_data().location(); + } if (mindir_dec_key_ != nullptr) { size_t plain_len; auto plain_data = Decrypt(&plain_len, file, mindir_dec_key_, mindir_key_size_, mindir_dec_mode_); diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc new file mode 100755 index 00000000000..5d49aca8cf0 --- /dev/null +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc @@ -0,0 +1,285 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h" +#include +#include +#include +#include +#include +#include +#include +#include "src/extendrt/delegate/tensorrt/tensorrt_utils.h" +#include "NvInferRuntimeCommon.h" +#include "ops/decoder_layer.h" +#include "src/fastertransformer/kernels/unfused_attention_kernels.h" +#include "src/fastertransformer/kernels/activation_kernels.h" +#include "src/fastertransformer/utils/cuda_utils.h" +#include "src/fastertransformer/utils/allocator.h" +#include "src/fastertransformer/kernels/layernorm_kernels.h" + +namespace mindspore::lite { +template +static void printTensor(char *str, T *input, int size) { + printf("%s ", str); + T *input_device = input; + T *input_host = (T *)malloc(size * sizeof(T)); + + fastertransformer::cudaD2Hcpy(input_host, input_device, size); + + for (int k = 0; k < (int)size; k++) { + std::cout << input_host[k] << ","; + if (k % 10 == 0 && k != 0) std::cout << std::endl; + } + + std::cout << std::endl; + + free(input_host); +} + +namespace { +constexpr std::size_t kTwo = 2; +constexpr std::size_t kThree = 3; + +} // namespace + +// Multi Head Attention TensorRT op +int DecoderTensorRT::IsSupport(const BaseOperatorPtr &base_operator, const std::vector &in_tensors, + const std::vector &out_tensors) { + if (in_tensors.size() != C23NUM) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return RET_ERROR; + } + // if (out_tensors.size() != 1) { + // MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + // return RET_ERROR; + // } + return RET_OK; +} +nvinfer1::ITensor *DecoderTensorRT::castTensor(TensorRTContext *ctx, const TensorInfo &ms_tensor, + const std::string &op_name) { + if (ctx == nullptr || ctx->network() == nullptr) { + MS_LOG(ERROR) << "context or network is null for ConvertConstantTensor"; + return nullptr; + } + nvinfer1::Dims dims = ConvertCudaDims(ms_tensor.Shape()); + if (dims.nbDims == -1) { + MS_LOG(INFO) << ms_tensor.Name() << " ConvertCudaDims failed, convert as scalar."; + dims.nbDims = 1; + dims.d[0] = 
1; + } + nvinfer1::DataType data_type = ConvertDataType(ms_tensor.DataType()); + if (!ms_tensor.IsConst()) { + MS_LOG(ERROR) << "ConvertConstantTensor from a MSTensor with nullptr data: " << ms_tensor.Name(); + return nullptr; + } + nvinfer1::Weights weights{data_type, ms_tensor.Data(), ms_tensor.ElementNum()}; + if (data_type == nvinfer1::DataType::kFLOAT && is_ffn_fp16_) { + void *data_float16 = malloc(ms_tensor.ElementNum() * sizeof(float)); + if (data_float16 == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return nullptr; + } + auto src = static_cast(ms_tensor.Data()); + auto dst = static_cast(data_float16); + for (int i = 0; i < ms_tensor.ElementNum(); i++) { + dst[i] = static_cast(src[i]); + } + weights.values = data_float16; + } + nvinfer1::IConstantLayer *constant_tensor = ctx->network()->addConstant(dims, weights); + if (constant_tensor == nullptr) { + MS_LOG(ERROR) << "create constant_tensor failed."; + return nullptr; + } + ctx->RegisterLayer(constant_tensor, ms_tensor.Name() + "_" + op_name); + auto tensor_ptr = constant_tensor->getOutput(0); + return tensor_ptr; +} + +int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (ctx == nullptr || ctx->network() == nullptr) { + MS_LOG(ERROR) << "context or network is invalid"; + return RET_ERROR; + } + auto decoder_op = AsOps(); + if (decoder_op == nullptr) { + MS_LOG(ERROR) << "op action convert failed"; + return RET_ERROR; + } + fastertransformer::decoderParamT params; + memset_s(¶ms, sizeof(params), 0, sizeof(params)); + params.head_num = decoder_op->get_head_num(); + params.head_size = decoder_op->get_head_size(); + params.layernorm_post = decoder_op->get_post_layernorm(); + params.eps1 = decoder_op->get_eps_layernorm1(); + params.eps2 = decoder_op->get_eps_layernorm2(); + params.eps3 = decoder_op->get_eps_layernorm3(); + params.ffn_hidden_size = decoder_op->get_ffn_hidden_size(); + // params.is_cross1 = false; + // params.is_cross2 = true; + params.ffn_fp16 = is_ffn_fp16_; + // params.position_bias1 = decoder_op->get_position_bias1(); + // params.position_bias2 = decoder_op->get_position_bias2(); + params.cublas_handle=GetCublasHandle(); + // params.qkv_bias = !params.position_bias; + // params.projection_bias = !params.position_bias; + params.hidden_size = params.head_num * params.head_size; + auto compute_type = runtime_->GetRuntimePrecisionMode(); + if (is_ffn_fp16_) { + size_t start_fp16 = (params.layernorm_post) ? C7NUM : C18NUM; + size_t end_fp16 = (params.layernorm_post) ? 
C11NUM : C22NUM; + for (size_t i = 0; i < in_tensors_.size(); i++) { + auto in_tensor = input(ctx, i); + if (in_tensors_[i].IsConst() || in_tensor.trt_tensor_ == nullptr) { + if (i > start_fp16 && i < end_fp16) { + in_tensor.trt_tensor_ = castTensor(ctx, in_tensors_[i], op_name_); + ctx->RegisterTensor(in_tensor, in_tensors_[i].Name()); + } else { + in_tensor.trt_tensor_ = lite::ConvertConstantTensor(ctx, in_tensors_[i], op_name_); + ctx->RegisterTensor(in_tensor, in_tensors_[i].Name()); + } + } + } + } + nvinfer1::ITensor *input_tensor = input(ctx, 0).trt_tensor_; + auto plugin = std::make_shared(input_tensor->getName(), compute_type, params, GetCublasLtHandle(), device_id_); + const int input_number = inputs().size(); + nvinfer1::ITensor *inputTensors[input_number]; + for (int i = 0; i < input_number; i++) { + inputTensors[i] = input(ctx, i).trt_tensor_; + } + nvinfer1::IPluginV2Layer *decoder_layer = ctx->network()->addPluginV2(inputTensors, input_number, *plugin); + if (decoder_layer == nullptr) { + MS_LOG(ERROR) << "add decoder op failed for TensorRT."; + return RET_ERROR; + } + decoder_layer->setName((op_name_ + "plugin_decoder_layer").c_str()); + nvinfer1::ITensor *decoder_tensor = decoder_layer->getOutput(0); + ctx->RegisterTensor(ITensorHelper{decoder_tensor, Format::NCHW, true}, out_tensors_[0].Name()); + this->layer_ = decoder_layer; + return RET_OK; +} + +REGISTER_TENSORRT_PLUGIN(DecoderPluginCreater); +template class TensorRTPluginCreater; +template +nvinfer1::PluginFieldCollection TensorRTPluginCreater::field_collection_{}; +template +std::vector TensorRTPluginCreater::fields_; + +int DecoderPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, + cudaStream_t stream) noexcept { + if (compute_type_ == RuntimePrecisionMode_FP16) { + return RunCudaDecoder(inputDesc, outputDesc, inputs, outputs, workspace, stream, + CUBLAS_GEMM_DEFAULT_TENSOR_OP); + } else { + return RunCudaDecoder(inputDesc, outputDesc, inputs, outputs, workspace, stream, + CUBLAS_GEMM_DEFAULT_TENSOR_OP); + } +} + +template +int DecoderPlugin::RunCudaDecoder(const nvinfer1::PluginTensorDesc *inputDesc, + const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, + void *const *outputs, void *workspace, cudaStream_t stream, cublasGemmAlgo_t algoId) { + params_.stream = stream; + params_.algo = algoId; + void* inputs_forward[] = { + (void*)(inputs[0]), (void*)(inputs[1]), + (void*)(inputs[2]), (void*)(inputs[3]), + (void*)(inputs[4]), (void*)(inputs[5]), + (void*)(inputs[6]), (void*)(inputs[7]), + (void*)(inputs[8]), (void*)(inputs[9]), + (void*)(inputs[10]), (void*)(inputs[11]), + (void*)(inputs[12]), (void*)(inputs[13]), + (void*)(inputs[14]), (void*)(inputs[15]), + (void*)(inputs[16]), (void*)(inputs[17]), + (void*)(inputs[19]), (void*)(inputs[20]), + (void*)(inputs[21]), (void*)(inputs[22]) + }; + void* outputs_forward[] = { outputs[0] }; + fastertransformer::forwardDecoder(inputs_forward, num_of_inputs_, outputs_forward, num_of_outputs_, ¶ms_, workspace); + return RET_OK; +} + +bool DecoderPlugin::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *tensorsDesc, int nbInputs, + int nbOutputs) noexcept { + auto type = (compute_type_ == RuntimePrecisionMode_FP16) ? 
nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT; + for (int i = 0; i < pos; i++) { + if (tensorsDesc[pos].type != tensorsDesc[i].type) return false; + } + bool res = (tensorsDesc[pos].format == nvinfer1::TensorFormat::kLINEAR) && (tensorsDesc[pos].type == type); + return res; +} +void DecoderPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc *out, int nbOutputs) noexcept { + const int request_batch_size = static_cast(in[0].desc.dims.d[0]); + const int request_src_seq_len = static_cast(in[0].desc.dims.d[1]); + const int request_tgt_seq_len = request_src_seq_len; + params_.batch_size = request_batch_size; + params_.src_seq_len = request_src_seq_len; + params_.tgt_seq_len = request_tgt_seq_len; + num_of_inputs_ = nbInputs; + num_of_outputs_ = nbOutputs; +} +size_t DecoderPlugin::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, + const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const noexcept { + + if (compute_type_ == RuntimePrecisionMode_FP16) { + return fastertransformer::GetDecoderLayerWorkspaceSize(¶ms_); + } else { + return fastertransformer::GetDecoderLayerWorkspaceSize(¶ms_); + } +} + +nvinfer1::DimsExprs DecoderPlugin::getOutputDimensions(int32_t index, const nvinfer1::DimsExprs *inputs, + int nbInputDims, nvinfer1::IExprBuilder &exprBuilder) noexcept { + nvinfer1::DimsExprs dims; + if (index == 0) { + int num_dims = inputs[0].nbDims; + dims.nbDims = num_dims; + if (num_dims == INPUT_SIZE2) { + dims.d[0] = exprBuilder.constant(inputs[0].d[0]->getConstantValue()); + dims.d[1] = exprBuilder.constant(inputs[0].d[1]->getConstantValue()); + } else if (num_dims == INPUT_SIZE3) { + dims.d[0] = exprBuilder.constant(inputs[0].d[0]->getConstantValue()); + dims.d[1] = exprBuilder.constant(inputs[0].d[1]->getConstantValue()); + dims.d[2] = exprBuilder.constant(inputs[0].d[2]->getConstantValue()); + } + } + return dims; +} + +nvinfer1::IPluginV2DynamicExt *DecoderPlugin::clone() const noexcept { + auto *plugin = new DecoderPlugin(*this); // TODO(haim) CopyConstructor + if (plugin == nullptr) { + MS_LOG(ERROR) << "plugin is null"; + return nullptr; + } + plugin->setPluginNamespace(name_space_.c_str()); + return plugin; +} + +size_t DecoderPlugin::getSerializationSize() const noexcept { return sizeof(int) + sizeof(fastertransformer::decoderParamT); } + +void DecoderPlugin::serialize(void *buffer) const noexcept { + SerializeValue(&buffer, &compute_type_, sizeof(int)); + SerializeValue(&buffer, ¶ms_, sizeof(fastertransformer::decoderParamT)); +} +REGISTER_TENSORRT_CREATOR(ops::kNameDecoderLayer, DecoderTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h new file mode 100644 index 00000000000..2bd33e41ca7 --- /dev/null +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h @@ -0,0 +1,107 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_EXTENDRT_DELEGATE_TENSORRT_OP_DECODER_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_EXTENDRT_DELEGATE_TENSORRT_OP_DECODER_TENSORRT_H_ + +#include +#include +#include "src/extendrt/delegate/tensorrt/op/tensorrt_op.h" +#include "src/extendrt/delegate/tensorrt/op/tensorrt_plugin.h" +#include "src/extendrt/delegate/tensorrt/cuda_impl/cudnn_utils.h" +#include "src/fastertransformer/layers/decoder_layers/decoder.h" +namespace mindspore::lite { +class DecoderTensorRT : public TensorRTOp { + public: + DecoderTensorRT(const BaseOperatorPtr &base_operator, const std::vector &in_tensors, + const std::vector &out_tensors, std::string name) + : TensorRTOp(base_operator, in_tensors, out_tensors, name) {} + + ~DecoderTensorRT() override = default; + bool IsWeightInputHanledInner() const override { return is_ffn_fp16_; } + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const BaseOperatorPtr &base_operator, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + nvinfer1::ITensor *castTensor(TensorRTContext *ctx, const TensorInfo &ms_tensor, const std::string &op_name); + bool is_ffn_fp16_ = true; +}; + +constexpr auto ENCODER_PLUGIN_NAME{"DecoderPlugin"}; +class DecoderPlugin : public TensorRTPlugin { + public: + DecoderPlugin(const std::string name, int compute_type, fastertransformer::decoderParamT params, + cublasLtHandle_t cublaslt_handle, uint32_t device_id) + : TensorRTPlugin(name, std::string(ENCODER_PLUGIN_NAME), device_id), + compute_type_(compute_type), + params_(params), + cublaslt_handle_(cublaslt_handle) + {} + + DecoderPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) + : TensorRTPlugin(std::string(name), std::string(ENCODER_PLUGIN_NAME)) { + const nvinfer1::PluginField *fields = fc->fields; + compute_type_ = static_cast(fields[0].data)[0]; + params_ = static_cast(fields[1].data)[0]; + cublaslt_handle_ = static_cast(fields[2].data)[0]; + } + + DecoderPlugin(const char *name, const void *serialData, size_t serialLength) + : TensorRTPlugin(std::string(name), std::string(ENCODER_PLUGIN_NAME)) { + DeserializeValue(&serialData, &serialLength, &compute_type_, sizeof(int)); + DeserializeValue(&serialData, &serialLength, ¶ms_, sizeof(fastertransformer::decoderParamT)); + } + + DecoderPlugin() = delete; + + ~DecoderPlugin() override {} + + nvinfer1::IPluginV2DynamicExt *clone() const noexcept override; + int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override; + size_t getSerializationSize() const noexcept override; + void serialize(void *buffer) const noexcept override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, + const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const noexcept override; + nvinfer1::DimsExprs getOutputDimensions(int index, const nvinfer1::DimsExprs *inputs, int nbInputDims, + nvinfer1::IExprBuilder &exprBuilder) noexcept override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc *out, int nbOutputs) noexcept override; + bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *tensorsDesc, int nbInputs, + int nbOutputs) noexcept override; + + private: + const std::string layer_name_; + 
std::string name_space_; + int compute_type_; + mutable fastertransformer::decoderParamT params_; + cublasLtHandle_t cublaslt_handle_; + int num_of_inputs_; + int num_of_outputs_; + + template + int RunCudaDecoder(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream, + cublasGemmAlgo_t algoId); +}; +class DecoderPluginCreater : public TensorRTPluginCreater { + public: + DecoderPluginCreater() : TensorRTPluginCreater(std::string(ENCODER_PLUGIN_NAME)) {} +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_EXTENDRT_DELEGATE_TENSORRT_OP_ENCODER_TENSORRT_H_ diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h old mode 100644 new mode 100755 index fa655100b0b..94a83a95f34 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h @@ -50,8 +50,7 @@ class EncoderPlugin : public TensorRTPlugin { : TensorRTPlugin(name, std::string(ENCODER_PLUGIN_NAME), device_id), compute_type_(compute_type), params_(params), - cublaslt_handle_(cublaslt_handle) - {} + cublaslt_handle_(cublaslt_handle) {} EncoderPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) : TensorRTPlugin(std::string(name), std::string(ENCODER_PLUGIN_NAME)) { @@ -93,7 +92,7 @@ class EncoderPlugin : public TensorRTPlugin { cublasLtHandle_t cublaslt_handle_; int num_of_inputs_; int num_of_outputs_; - + template int RunCudaEncoder(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream, diff --git a/trc/transformer/cfg_bert.config b/trc/transformer/cfg_bert.config index cc543ad3d77..0138e0d6bbb 100755 --- a/trc/transformer/cfg_bert.config +++ b/trc/transformer/cfg_bert.config @@ -1,2 +1,2 @@ [gpu_context] -input_shape=input_ids:[transformer_decoder_layer,128];token_type_ids:[transformer_decoder_layer,128];input_mask:[transformer_decoder_layer,128] +input_shape=input_ids:[1,128];token_type_ids:[1,128];input_mask:[1,128] \ No newline at end of file diff --git a/trc/transformer/deploy.sh b/trc/transformer/deploy.sh index d2a104eb521..1ff776be553 100755 --- a/trc/transformer/deploy.sh +++ b/trc/transformer/deploy.sh @@ -4,15 +4,24 @@ version=$(cat ${base}/version.txt) system=${base}/trc/system_test/release/ubuntu_x86/mindspore-lite-${version}-linux-x64 benchmark=${system}/tools/benchmark/benchmark server=caspi -gpu_id=2 - +gpu_id=3 +while getopts "c" opt ; do + case "${opt}" in + c) + compress="_compress" ;; + *) + echo "Unknown option ${opt}!" 
;; + esac + done +shift $(($OPTIND - 1)) # move files to caspi model=${1%.mindir} model=${model#convv_} model=$(echo ${model}| sed 's/_fwd//') +model=$(echo ${model}| sed 's/_graph//') batch_size=$(echo ${model}| sed 's/bert//') echo "model=${model}" -model_name=$(echo ${model}) + model_name=$(echo ${model}) if [[ "$batch_size" != "${model}" ]];then model_name='bert' fi @@ -22,9 +31,8 @@ then fi echo "batch_size=${batch_size}" echo "model_name=${model_name}" - -dir=$(dirname $(realpath $1)) -ssh ${server} "mkdir -p ${dir}" +dir1=$(dirname $(realpath $1)) +ssh ${server} "mkdir -p ${dir1}" dir=$(dirname ${benchmark}) ssh ${server} "mkdir -p ${dir}" dir=${system}/runtime/lib @@ -35,12 +43,13 @@ rsync -v $1 ${server}:$(realpath $1) rsync -v ${benchmark} ${server}:${benchmark} rsync -vl ${system}/runtime/lib/* ${server}:${system}/runtime/lib/ rsync -vl ${system}/tools/converter/lib/* ${server}:${system}/tools/converter/lib/ -echo -e "[gpu_context]\ninput_shape=input_ids:[${batch_size},128];token_type_ids:[${batch_size},128];input_mask:[${batch_size},128]" > ./cfg_bert.config -rsync -v cfg_${model_name}.config ${server}:$(realpath "cfg_${model_name}.config") +# rsync -vr ${dir1}/convv_${model}_fwd_graph_variables ${server}:${dir1} +echo -e "[gpu_context]\ninput_shape=input_ids:[${batch_size},128];token_type_ids:[${batch_size},128];input_mask:[${batch_size},128]" > ./cfg_{model}.config +rsync -v cfg_${model}.config ${server}:$(realpath "cfg_${model}.config") # this should be more general ! # output_files=$(find . -maxdepth 1 -name ${model}_compress_output"*.txt*" | sort -n) -output_files=$(find . -maxdepth 1 -name ${model}_output"*.txt*" | sort -n) +output_files=$(find . -maxdepth 1 -name ${model}${compress}_output"*.txt*" | sort -n) input_files=$(find . -maxdepth 1 -name ${model}_input"*.fp32" | sort -n) rsync -v ${input_files} ${output_files} ${server}:${PWD} @@ -56,8 +65,8 @@ then command+="--inDataFile=\"${input_files}\"" command+=" --benchmarkDataFile=\"${output_files}\" " fi -if [ -f cfg_${model_name}.config ]; then - command+="--configFile=cfg_${model_name}.config " +if [ -f cfg_${model}.config ]; then + command+="--configFile=cfg_${model}.config " fi command+="--device=GPU " #command+="--enableFp16=true" diff --git a/trc/transformer/ftBench.py b/trc/transformer/ftBench.py index 4a0886ca9c1..96cf5fa5db4 100755 --- a/trc/transformer/ftBench.py +++ b/trc/transformer/ftBench.py @@ -145,9 +145,9 @@ for line_model_arg in models_arg: else: print("run trc caspi") print("line model arg=", line_model_arg) - os.system(f"ssh {server} 'rm -f {base}/../FasterTransformer/build/bin/ms_benchmark {base}/../FasterTransformer/build/bin/{model_name}*'") + os.system(f"ssh {server} 'rm -f {base}/../FasterTransformer/build/bin/ms_benchmark {base}/../FasterTransformer/build/bin/*{model_name}*'") os.system(f"rsync -v {base}/../FasterTransformer/build/bin/ms_benchmark {server}:{base}/../FasterTransformer/build/bin/ms_benchmark" ) - os.system(f"rsync -v {base}/trc/transformer/{model_name}* {server}:{base}/../FasterTransformer/build/bin" ) + os.system(f"rsync -v {base}/trc/transformer/*{model_name}* {server}:{base}/../FasterTransformer/build/bin" ) os.system(f'rsync -v {base}/../FasterTransformer/build/lib/libtransformer-shared.so caspi:{base}/../FasterTransformer/build/lib/.') os.system(f"ssh {server} 'cd {base}/../FasterTransformer/build/bin && CUDA_VISIBLE_DEVICES={cuda_visible_dev} LD_LIBRARY_PATH={base}/../FasterTransformer:/usr/local/cuda-11.7/lib64 ./ms_benchmark {line_model_arg}' " ) diff --git 
a/trc/transformer/models.txt b/trc/transformer/models.txt index e88dab61233..4295dc5ecae 100755 --- a/trc/transformer/models.txt +++ b/trc/transformer/models.txt @@ -10,7 +10,7 @@ #-b 32 -l 12 -H 12 -S 768 -s 128 -P 0 -f 3072 -m bert #-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -f 3072 -m bert --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_encoder_layer +#-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -x 1 -m transformer_encoder_layer #-b 1 -l 66 -s 20 -H 3 -S 15 -p 0 -m mha_x1 @@ -23,6 +23,7 @@ #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -m transformer_encoder_layer #-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_decoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -m transformer_decoder_layer_t5 @@ -80,3 +81,7 @@ #-b 8 -l 66 -s 256 -H 8 -S 512 -p 0 -m mha_x1 -T fp16 #-b 8 -l 66 -s 64 -H 8 -S 512 -p 0 -m mha_x1 -T fp16 #-b 1 -l 66 -s 256 -H 12 -S 768 -p 0 -m mha_x1 -T fp16 + +#-b 64 -l 12 -H 12 -S 768 -s 128 -m bert +#-b 64 -l 24 -H 16 -S 1024 -s 128 -m bert +-b 64 -l 24 -H 16 -S 1024 -s 512 -m bert diff --git a/trc/transformer/t.config b/trc/transformer/t.config index 336dbdc28f3..4bd5994c830 100755 --- a/trc/transformer/t.config +++ b/trc/transformer/t.config @@ -1,3 +1,4 @@ [registry] #fusion_blacklists="MultiHeadAttentionFusion" -fusion_blacklists="EncoderLayerFusion","DecoderLayerFusion" +#fusion_blacklists="EncoderLayerFusion +fusion_blacklists="DecoderLayerFusion" \ No newline at end of file diff --git a/trc/transformer/train_transformer_export.py b/trc/transformer/train_transformer_export.py index 407de65b428..f8563d82814 100755 --- a/trc/transformer/train_transformer_export.py +++ b/trc/transformer/train_transformer_export.py @@ -527,30 +527,12 @@ def transformer_decoder_layer_create(): saveT(omb, name + "_weight17.fp" + suffix) saveT(opw, name + "_weight18.fp" + suffix) saveT(opb, name + "_weight19.fp" + suffix) - # # if app == 'trc': - # # saveTensorToHalf(omw, name + "_weight9.fp" + "16") - # # saveTensorToHalf(omb, name + "_weight10.fp" + "16") - # # saveTensorToHalf(opw, name + "_weight11.fp" + "16") - # # elif app == 'ch': - # saveT(omw, name + "_weight9.fp" + suffix) - # saveT(omb, name + "_weight10.fp" + suffix) - # saveT(opw, name + "_weight11.fp" + suffix) - # saveT(opb, name + "_weight12.fp" + suffix) _cell_graph_executor.compile(model, hidden_stats, decoder_mask, encoder_output, memory_mask) y = model(hidden_stats, decoder_mask, encoder_output, memory_mask) export(model, hidden_stats, decoder_mask, encoder_output, memory_mask, file_name= name + "_fwd", file_format='MINDIR') - # if app=="ch": - print('y=',y) - print(y) f_y=open(f'./{name}_output.txt','w') - # # out_name=get_output_encoder_layer(name + "_fwd.mindir") - # # print("name output:",out_name) saveCalib("output1", np.array(y), f_y)#2 dims - # # print("y.shpae",np.array(y).shape) - # # saveCalib('Default/Add-op267', y, f_y)#2 dims f_y.close() - # # saveCalib('Default/Reshape-op296', np.array(y), f_y)#2 dims - # # elif app=="trc": saveT(y, name + "_output1.fp" + suffix) def build_transformer_encoder_layer_post_ture(): @@ -589,7 +571,6 @@ def test_multihead_attention(): def saveT(t,file): x = t.asnumpy() - print('x=',x) x.tofile(file) def saveTensorToHalf(t,file): -- Gitee From b18b85611bf6ff215aff1f61723b791119405bfd Mon Sep 17 00:00:00 2001 From: batya kroizer Date: Tue, 3 Jan 2023 12:05:51 +0200 Subject: [PATCH 10/39] add ms_layers --- 
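The hunks below move the fused encoder/decoder/attention plumbing into a shared layers/ms_layers directory, and the TensorRT plugins start reading attention settings from a nested attentionParamT (params.attn, params.attn1, params.attn2) instead of flat fields on encoderParamT/decoderParamT. The new ms_layers/param.h itself is not shown in this excerpt; the block below is only a rough reconstruction of the layout those accesses assume, with field names inferred from the code in this commit (bookkeeping fields such as in_idx, h_token_num and padding_offset from the earlier headers are omitted), offered as a reading aid rather than the exact header.

    // Assumed shape of src/fastertransformer/layers/ms_layers/param.h,
    // reconstructed from the params.attn.* / params.attn1.* / params.attn2.* usages below.
    #pragma once
    #include <cuda_runtime.h>
    #include <cublas_v2.h>

    namespace fastertransformer {

    typedef struct {
      size_t batch_size, src_seq_len, tgt_seq_len;
      size_t head_num, head_size, hidden_size;
      bool qkv_bias, projection_bias, is_cross, position_bias;
      cublasHandle_t cublas_handle;
      cudaStream_t stream;
      cublasGemmAlgo_t algo;
    } attentionParamT;

    typedef struct {
      size_t batch_size, src_seq_len, tgt_seq_len;
      size_t head_num, head_size, hidden_size, ffn_hidden_size;
      bool ffn_fp16, projection_bias, layernorm_post;
      float eps1, eps2;
      cublasHandle_t cublas_handle;
      cudaStream_t stream;
      cublasGemmAlgo_t algo;
      attentionParamT attn;   // self-attention settings used by the encoder plugin
    } encoderParamT;

    typedef struct {
      size_t batch_size, src_seq_len, tgt_seq_len;
      size_t head_num, head_size, hidden_size, ffn_hidden_size;
      bool ffn_fp16, projection_bias, layernorm_post;
      float eps1, eps2, eps3;
      cublasHandle_t cublas_handle;
      cudaStream_t stream;
      cublasGemmAlgo_t algo;
      attentionParamT attn1;  // masked self-attention
      attentionParamT attn2;  // cross-attention (is_cross = true)
    } decoderParamT;

    }  // namespace fastertransformer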
.../kernel/nnacl/infer/decoder_layer_infer.c | 1 - .../plugin/device/cpu/kernel/nnacl/op_base.h | 5 +- .../delegate/tensorrt/op/decoder_tensorrt.cc | 4 +- .../delegate/tensorrt/op/decoder_tensorrt.h | 3 +- .../delegate/tensorrt/op/encoder_tensorrt.cc | 21 +- .../delegate/tensorrt/op/encoder_tensorrt.h | 6 +- .../delegate/tensorrt/op/mha_tensorrt.cc | 7 +- .../delegate/tensorrt/op/mha_tensorrt.h | 11 +- .../lite/tools/converter/anf_transform.cc | 3 + .../optimizer/fusion/decoder_layer_fusion.cc | 27 +- .../optimizer/fusion/encoder_layer_fusion.cc | 46 +- .../optimizer/fusion/encoder_layer_fusion.h | 1 + .../001-fast_transformer.patch | 3274 ++++++++++++++--- trc/transformer/MultiHeadTester.py | 1 - trc/transformer/cfg_bert.config | 2 +- trc/transformer/convert_fp32.sh | 6 +- trc/transformer/deploy.sh | 44 +- trc/transformer/ftBench.py | 33 +- trc/transformer/get_output_by_mindir.py | 0 trc/transformer/models.txt | 12 +- trc/transformer/t.config | 3 +- trc/transformer/test_tr.py | 0 trc/transformer/train_transformer_export.py | 237 +- 23 files changed, 2974 insertions(+), 773 deletions(-) mode change 100644 => 100755 trc/transformer/MultiHeadTester.py mode change 100644 => 100755 trc/transformer/get_output_by_mindir.py mode change 100644 => 100755 trc/transformer/test_tr.py diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.c index f2f9ac344fe..24336c769da 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.c +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.c @@ -20,7 +20,6 @@ int DecoderLayerInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size, OpParameter *parameter) { - printf("DecoderLayerInferShape\n" ); int check_ret = CheckAugmentWithMinSize(inputs, inputs_size, outputs, outputs_size, parameter, C23NUM, C1NUM); if (check_ret != NNACL_OK) { return check_ret; diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h index 43f9f2c55fa..f05d49be28b 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h @@ -529,8 +529,9 @@ enum PrimType { PrimType_GroupNormFusion = 211, PrimType_Log1p = 212, PrimType_TensorScatterAdd = 213, - PrimType_EncoderLayer = 214, - PrimType_DecoderLayer = 215, + PrimType_ScatterElements = 214, + PrimType_EncoderLayer = 215, + PrimType_DecoderLayer = 216, PrimType_MIN = PrimType_NONE, PrimType_MAX = PrimType_DecoderLayer + 1, diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc index 17e382435af..3dc2c8aa38f 100755 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc @@ -120,7 +120,7 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.attn1.hidden_size = params.hidden_size; params.attn1.position_bias = decoder_op->get_position_bias1(); params.attn1.qkv_bias = !params.attn1.position_bias; - params.attn1.projection_bias = !params.attn1.position_bias; + params.attn1.projection_bias = false; params.attn1.is_cross = false; params.attn1.cublas_handle=GetCublasHandle(); @@ -129,7 +129,7 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.attn2.hidden_size = params.hidden_size; 
params.attn2.position_bias = decoder_op->get_position_bias2(); params.attn2.qkv_bias = !params.attn2.position_bias; - params.attn2.projection_bias = !params.attn2.position_bias; + params.attn2.projection_bias = false; params.attn2.is_cross = true; params.attn2.cublas_handle=GetCublasHandle(); diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h index 9b1f5456193..a7006edad2d 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h @@ -22,7 +22,8 @@ #include "src/extendrt/delegate/tensorrt/op/tensorrt_op.h" #include "src/extendrt/delegate/tensorrt/op/tensorrt_plugin.h" #include "src/extendrt/delegate/tensorrt/cuda_impl/cudnn_utils.h" -#include "src/fastertransformer/layers/decoder_layers/decoder.h" +#include "src/fastertransformer/layers/ms_layers/decoder.h" +#include "src/fastertransformer/layers/ms_layers/param.h" #include "src/extendrt/delegate/tensorrt/tensorrt_utils.h" namespace mindspore::lite { class DecoderTensorRT : public TensorRTOp { diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc index 5c3664b9465..98d79cf9e2f 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc @@ -41,7 +41,7 @@ constexpr std::size_t kThree = 3; // Multi Head Attention TensorRT op int EncoderTensorRT::IsSupport(const BaseOperatorPtr &base_operator, const std::vector &in_tensors, const std::vector &out_tensors) { - if (in_tensors.size() != C14NUM) { + if (in_tensors.size() != C14NUM && in_tensors.size() != C9NUM) { MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); return RET_ERROR; } @@ -109,13 +109,19 @@ int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.eps1 = encoder_op->get_eps_layernorm1(); params.eps2 = encoder_op->get_eps_layernorm2(); params.ffn_hidden_size = encoder_op->get_ffn_hidden_size(); - params.is_cross = false; params.ffn_fp16 = is_ffn_fp16_; - params.position_bias = encoder_op->get_position_bias(); params.cublas_handle = GetCublasHandle(); - params.qkv_bias = !params.position_bias; - params.projection_bias = !params.position_bias; + params.projection_bias = !encoder_op->get_position_bias(); params.hidden_size = params.head_num * params.head_size; + + params.attn.head_num = encoder_op->get_head_num(); + params.attn.head_size = encoder_op->get_head_size(); + params.attn.cublas_handle = GetCublasHandle(); + params.attn.projection_bias = false; + params.attn.hidden_size = params.head_num * params.head_size; + params.attn.is_cross = false; + params.attn.position_bias = encoder_op->get_position_bias(); + params.attn.qkv_bias = !params.attn.position_bias; auto compute_type = runtime_->GetRuntimePrecisionMode(); if (is_ffn_fp16_) { size_t start_fp16 = (params.layernorm_post) ? 
C7NUM : C9NUM; @@ -178,6 +184,8 @@ int EncoderPlugin::RunCudaEncoder(const nvinfer1::PluginTensorDesc *inputDesc, void *const *outputs, void *workspace, cudaStream_t stream, cublasGemmAlgo_t algoId) { params_.stream = stream; params_.algo = algoId; + params_.attn.stream = stream; + params_.attn.algo = algoId; void *inputs_forward[] = { const_cast(inputs[0]), const_cast(inputs[1]), const_cast(inputs[2]), const_cast(inputs[3]), const_cast(inputs[4]), const_cast(inputs[5]), @@ -207,6 +215,9 @@ void EncoderPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, params_.batch_size = request_batch_size; params_.src_seq_len = request_src_seq_len; params_.tgt_seq_len = request_tgt_seq_len; + params_.attn.batch_size = request_batch_size; + params_.attn.src_seq_len = request_src_seq_len; + params_.attn.tgt_seq_len = request_tgt_seq_len; num_of_inputs_ = nbInputs; num_of_outputs_ = nbOutputs; } diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h index fd9b334021e..031026a98b1 100755 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h @@ -22,7 +22,9 @@ #include "src/extendrt/delegate/tensorrt/op/tensorrt_op.h" #include "src/extendrt/delegate/tensorrt/op/tensorrt_plugin.h" #include "src/extendrt/delegate/tensorrt/cuda_impl/cudnn_utils.h" -#include "src/fastertransformer/layers/encoder_layers/encoder.h" +#include "src/fastertransformer/layers/ms_layers/encoder.h" +#include "src/fastertransformer/layers/ms_layers/param.h" + namespace mindspore::lite { class EncoderTensorRT : public TensorRTOp { public: @@ -40,7 +42,7 @@ class EncoderTensorRT : public TensorRTOp { private: nvinfer1::ITensor *castTensor(TensorRTContext *ctx, const TensorInfo &ms_tensor, const std::string &op_name); - bool is_ffn_fp16_ = false; + bool is_ffn_fp16_ = true; }; constexpr auto ENCODER_PLUGIN_NAME{"EncoderPlugin"}; diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc index f5bcb1ab8cb..91e0e1c1bee 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc @@ -71,7 +71,7 @@ int MhaTensorRT::AddInnerOp(TensorRTContext *ctx) { bool is_cross = mha_op->get_cross(); bool is_position_bias = mha_op->get_position_bias(); nvinfer1::ITensor *input_tensor = input(ctx, 0).trt_tensor_; - fastertransformer::encoderParamT params; + fastertransformer::attentionParamT params; memset_s(¶ms, sizeof(params), 0, sizeof(params)); params.head_num = head_number; params.head_size = head_size; @@ -256,7 +256,6 @@ nvinfer1::DimsExprs MhaPlugin::getOutputDimensions(int32_t index, const nvinfer1 dims.d[0] = inputs[nbInputDims - 1].d[(inputs[nbInputDims - 1].nbDims) - 1]; auto hidden_size = exprBuilder.constant(head_size_ * head_number_); dims.d[1] = hidden_size; - } #endif return dims; } @@ -276,12 +275,12 @@ int MhaPlugin::initialize() noexcept { return 0; } void MhaPlugin::terminate() noexcept {} size_t MhaPlugin::getSerializationSize() const noexcept { - return sizeof(int) + sizeof(fastertransformer::encoderParamT); + return sizeof(int) + sizeof(fastertransformer::attentionParamT); } void MhaPlugin::serialize(void *buffer) const noexcept { SerializeValue(&buffer, &compute_type_, sizeof(int)); - SerializeValue(&buffer, ¶ms_, sizeof(fastertransformer::encoderParamT)); + 
SerializeValue(&buffer, ¶ms_, sizeof(fastertransformer::attentionParamT)); } REGISTER_TENSORRT_CREATOR(ops::kNameAttention, MhaTensorRT) } // namespace mindspore::lite diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.h index c539a74c5e9..4d9a21a498b 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.h @@ -22,7 +22,8 @@ #include "src/extendrt/delegate/tensorrt/op/tensorrt_op.h" #include "src/extendrt/delegate/tensorrt/op/tensorrt_plugin.h" #include "src/extendrt/delegate/tensorrt/cuda_impl/cudnn_utils.h" -#include "src/fastertransformer/layers/encoder_layers/encoder.h" +#include "src/fastertransformer/layers/ms_layers/attention.h" +#include "src/fastertransformer/layers/ms_layers/param.h" namespace mindspore::lite { class MhaTensorRT : public TensorRTOp { @@ -43,7 +44,7 @@ class MhaTensorRT : public TensorRTOp { constexpr auto MHA_PLUGIN_NAME{"AttentionPlugin"}; class MhaPlugin : public TensorRTPlugin { public: - MhaPlugin(const std::string name, int compute_type, fastertransformer::encoderParamT params, + MhaPlugin(const std::string name, int compute_type, fastertransformer::attentionParamT params, cublasLtHandle_t cublaslt_handle, uint32_t device_id) : TensorRTPlugin(name, std::string(MHA_PLUGIN_NAME), device_id), compute_type_(compute_type), @@ -54,14 +55,14 @@ class MhaPlugin : public TensorRTPlugin { : TensorRTPlugin(std::string(name), std::string(MHA_PLUGIN_NAME)) { const nvinfer1::PluginField *fields = fc->fields; compute_type_ = static_cast(fields[0].data)[0]; - params_ = static_cast(fields[1].data)[0]; + params_ = static_cast(fields[1].data)[0]; cublaslt_handle_ = static_cast(fields[2].data)[0]; } MhaPlugin(const char *name, const void *serialData, size_t serialLength) : TensorRTPlugin(std::string(name), std::string(MHA_PLUGIN_NAME)) { DeserializeValue(&serialData, &serialLength, &compute_type_, sizeof(int)); - DeserializeValue(&serialData, &serialLength, ¶ms_, sizeof(fastertransformer::encoderParamT)); + DeserializeValue(&serialData, &serialLength, ¶ms_, sizeof(fastertransformer::attentionParamT)); } MhaPlugin() = delete; @@ -92,7 +93,7 @@ class MhaPlugin : public TensorRTPlugin { const std::string layer_name_; std::string name_space_; int compute_type_; - mutable fastertransformer::encoderParamT params_; + mutable fastertransformer::attentionParamT params_; cublasLtHandle_t cublaslt_handle_; int num_of_inputs_; int num_of_outputs_; diff --git a/mindspore/lite/tools/converter/anf_transform.cc b/mindspore/lite/tools/converter/anf_transform.cc index 98e2808bde0..f3626b1cd9c 100644 --- a/mindspore/lite/tools/converter/anf_transform.cc +++ b/mindspore/lite/tools/converter/anf_transform.cc @@ -323,6 +323,7 @@ int AnfTransform::RunFusionPass(const FuncGraphPtr &old_graph, const std::shared std::make_shared(), std::make_shared(), std::make_shared()}; + #ifdef ENABLE_CLOUD_FUSION_INFERENCE//ENABLE_CLOUD_FUSION_TRANSFORMER_INFERENCE fusions.push_back(std::make_shared()); fusions.push_back(std::make_shared()); @@ -333,6 +334,8 @@ int AnfTransform::RunFusionPass(const FuncGraphPtr &old_graph, const std::shared MS_CHECK_TRUE_RET(pass_ptr != nullptr, RET_ERROR); auto pass_name = pass_ptr->name(); if (param->fusion_blacklists.find(pass_name) != param->fusion_blacklists.end()) { + std::cout<< "Disable fusion: " << pass_name; + MS_LOG(INFO) << "Disable fusion: " << pass_name; continue; } diff --git 
a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc index 6c61f05bb44..6312b579f39 100644 --- a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc @@ -144,9 +144,6 @@ VectorRef DecoderLayerFusion::DefineLayerNorm(VectorRef input, VarPtr gamma, Var VectorRef DecoderLayerFusion::DefinePatternDecoderLayer(bool post_layernorm = true, bool layernorm_fusion = false, bool is_position_bias = false) const { - std::cout << "DefinePatternDecoderLayer post=" << post_layernorm << " layernorm_fusion=" << layernorm_fusion - << std::endl; - std::cout << "attention no position bias" << std::endl; auto is_reshape1 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-decoder"); MS_CHECK_TRUE_RET(is_reshape1 != nullptr, {}); auto var1 = std::make_shared("var1-reshape"); @@ -260,7 +257,6 @@ std::unordered_map DecoderLayerFusion::DefinePatterns() } patterns[kPatternDecoderLayerPre] = DefinePatternDecoderLayer(false, false, false); patterns[kPatternDecoderLayerPost] = DefinePatternDecoderLayer(true, false, false); - // std::cout << "patterns[kPatternDecoderLayer]" << patterns[kPatternDecoderLayer].ToString() << std::endl; return patterns; } @@ -269,7 +265,7 @@ AnfNodePtr DecoderLayerFusion::Process(const std::string &pattern_name, const mi if (func_graph == nullptr || node == nullptr || equiv == nullptr) { return nullptr; } - std::cout << "found pattern " << pattern_name << std::endl; + std::cout<<"found pattern = "<< pattern_name < DecoderLayerFusion::CreatePrim(const FuncGrap CNodePtr DecoderLayerFusion::CreateMaskedDecoderLayerFusionNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv, const AnfNodePtr &node, bool post_layernorm = true) const { - std::cout << "CreateMaskedDecoderLayerFusionNode" << std::endl; MS_ASSERT(func_graph != nullptr); MS_ASSERT(equiv != nullptr); MS_ASSERT(node != nullptr); // bool is_position_bias = false; auto input = utils::cast((*equiv)[hidden_stats_]); MS_ASSERT(input != nullptr); - std::cout << "input" << std::endl; auto encoder_output = utils::cast((*equiv)[encoder_output_]); MS_ASSERT(encoder_output != nullptr); - std::cout << "encoder_output" << std::endl; AnfNodePtr position_bias, input_mask, bias_attn_o, bias_attn_qkv, beta1, beta2, bias_m, bias_p, beta3; auto weight_qkv = utils::cast((*equiv)[weight_attn_qkv_]); MS_ASSERT(weight_qkv != nullptr); bias_attn_qkv = utils::cast((*equiv)[bias_attn_qkv_]); bias_attn_o = utils::cast((*equiv)[bias_attn_o_]); MS_ASSERT(weight_qkv != nullptr); - std::cout << "CreateMaskedDecoderLayerFusionNode" << std::endl; auto weight_attn_o = utils::cast((*equiv)[weight_attn_o_]); MS_ASSERT(weight_attn_o != nullptr); - std::cout << "weight_attn_o" << std::endl; auto weight_attn_q = utils::cast((*equiv)[weight_attn_q_]); MS_ASSERT(weight_attn_q != nullptr); auto weight_attn_kv = utils::cast((*equiv)[weight_attn_kv_]); MS_ASSERT(weight_attn_kv != nullptr); auto weight_attn_cross_o = utils::cast((*equiv)[weight_attn_cross_o_]); MS_ASSERT(weight_attn_cross_o != nullptr); - std::cout << "CreateMaskedDecoderLayerFusionNode" << std::endl; auto weight_m = utils::cast((*equiv)[weight_m_]); MS_ASSERT(weight_m != nullptr); - std::cout << "weight_m" << std::endl; auto weight_p = utils::cast((*equiv)[weight_p_]); MS_ASSERT(weight_p != nullptr); - std::cout << "weight_p" << std::endl; auto bias_attn_cross_qkv = utils::cast((*equiv)[bias_attn_cross_qkv_]); 
MS_ASSERT(bias_attn_cross_qkv != nullptr); auto bias_attn_cross_o = utils::cast((*equiv)[bias_attn_cross_o_]); @@ -543,29 +531,20 @@ CNodePtr DecoderLayerFusion::CreateMaskedDecoderLayerFusionNode(const FuncGraphP MS_ASSERT(bias_p != nullptr); beta1 = utils::cast((*equiv)[beta1_]); MS_ASSERT(beta1 != nullptr); - std::cout << "beta1" << std::endl; beta2 = utils::cast((*equiv)[beta2_]); MS_ASSERT(beta2 != nullptr); - std::cout << "beta2" << std::endl; beta3 = utils::cast((*equiv)[beta3_]); MS_ASSERT(beta3 != nullptr); - std::cout << "beta3" << std::endl; auto gamma1 = utils::cast((*equiv)[gamma1_]); MS_ASSERT(gamma1 != nullptr); - std::cout << "gamma1" << std::endl; auto gamma2 = utils::cast((*equiv)[gamma2_]); MS_ASSERT(gamma2 != nullptr); - std::cout << "gamma2" << std::endl; auto gamma3 = utils::cast((*equiv)[gamma3_]); MS_ASSERT(gamma3 != nullptr); - std::cout << "gamma3" << std::endl; - input_mask = utils::cast((*equiv)[mask_]); MS_ASSERT(input_mask != nullptr); - std::cout << "input_mask" << std::endl; auto cross_mask = utils::cast((*equiv)[cross_mask_]); MS_ASSERT(cross_mask != nullptr); - std::cout << "input_mask" << std::endl; auto base_shape_ptr = weight_m->Shape(); MS_EXCEPTION_IF_NULL(base_shape_ptr); auto input_shape_ptr = base_shape_ptr->cast(); @@ -573,14 +552,12 @@ CNodePtr DecoderLayerFusion::CreateMaskedDecoderLayerFusionNode(const FuncGraphP auto input_shape = input_shape_ptr->shape(); MS_ASSERT(input_shape != nullptr); int ffn_hidden_size = (int64_t)input_shape[1]; - std::cout << ffn_hidden_size << std::endl; auto decoder_layer_prim = CreatePrim(func_graph, equiv, post_layernorm, ffn_hidden_size); MS_CHECK_TRUE_RET(decoder_layer_prim != nullptr, nullptr); auto decoder_layer_prim_c = decoder_layer_prim->GetPrim(); MS_CHECK_TRUE_RET(decoder_layer_prim_c != nullptr, nullptr); auto value_node = NewValueNode(decoder_layer_prim_c); MS_CHECK_TRUE_RET(value_node != nullptr, nullptr); - std::cout << "value_node" << std::endl; std::vector new_node_inputs = {value_node, input, gamma1, @@ -612,8 +589,6 @@ CNodePtr DecoderLayerFusion::CreateMaskedDecoderLayerFusionNode(const FuncGraphP MS_CHECK_TRUE_RET(old_node->abstract() != nullptr, nullptr); new_node->set_abstract(old_node->abstract()->Clone()); new_node->set_fullname_with_scope(node->fullname_with_scope() + "/decoder_layer"); - std::cout << new_node->ToString() << std::endl; - return new_node; } } // namespace mindspore::opt \ No newline at end of file diff --git a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc index c73d1da3d7f..93624420e25 100644 --- a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc @@ -83,7 +83,7 @@ VectorRef EncoderLayerFusion::getTuple(bool post_layernorm, bool layernorm_fusio if (post_layernorm) { return reshape1; } - if (layernorm_fusion) { + if (!layernorm_fusion) { return DefineLayerNorm(is_position_bias, reshape1, gamma1_, beta1_); } auto layer_norm = VectorRef({is_layernorm1_, reshape1, gamma1_, beta1_}); @@ -164,12 +164,12 @@ VectorRef EncoderLayerFusion::DefinePatternEncoderLayer(bool post_layernorm = tr auto is_add = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is_add"); auto add = VectorRef({is_add, reshape1, tuple}); if (layernorm_fusion) { - tuple2 = DefineLayerNorm(is_position_bias, add, gamma2_, beta2_); - } else { auto layer_norm2 = VectorRef({is_layernorm2_, add, gamma2_, beta2_}); auto is_tuple2 = 
std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_item2"); auto var_tuple2 = std::make_shared("var_tuple2"); tuple2 = VectorRef({is_tuple2, layer_norm2, var_tuple2}); + } else { + tuple2 = DefineLayerNorm(is_position_bias, add, gamma2_, beta2_); } auto is_reshape2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-encoder2"); MS_CHECK_TRUE_RET(is_reshape2 != nullptr, {}); @@ -180,7 +180,7 @@ VectorRef EncoderLayerFusion::DefinePatternEncoderLayer(bool post_layernorm = tr if (is_position_bias) { reshape2 = VectorRef({is_reshape2, add, var2}); matmul1 = VectorRef({is_matmul1, tuple2, weight_m_}); - } else if (post_layernorm || layernorm_fusion) { + } else if (post_layernorm || !layernorm_fusion) { reshape2 = VectorRef({is_reshape2, tuple2, var2}); matmul1 = VectorRef({is_matmul1, tuple2, weight_m_, bias_m_}); } else { @@ -199,7 +199,7 @@ VectorRef EncoderLayerFusion::DefinePatternEncoderLayer(bool post_layernorm = tr auto reshape3 = VectorRef({is_reshape3, matmul2, var3}); auto is_add3 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is_add3"); auto add3 = VectorRef({is_add3, reshape2, reshape3}); - if (!post_layernorm || layernorm_fusion) { + if (!post_layernorm || !layernorm_fusion) { return add3; } auto is_reshape4 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-encoder"); @@ -208,12 +208,12 @@ VectorRef EncoderLayerFusion::DefinePatternEncoderLayer(bool post_layernorm = tr MS_CHECK_TRUE_RET(var4 != nullptr, {}); auto reshape4 = VectorRef({is_reshape4, add3, var4}); if (layernorm_fusion) { - tuple3 = DefineLayerNorm(is_position_bias, reshape4, gamma1_, beta1_); - } else { auto layer_norm = VectorRef({is_layernorm1_, reshape4, gamma1_, beta1_}); auto is_tuple3 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_item3"); auto var_tuple3 = std::make_shared("var_tuple3"); tuple3 = VectorRef({is_tuple3, layer_norm, var_tuple3}); + } else { + tuple3 = DefineLayerNorm(is_position_bias, reshape4, gamma1_, beta1_); } auto is_reshape5 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-encoder"); MS_CHECK_TRUE_RET(is_reshape5 != nullptr, {}); @@ -233,7 +233,7 @@ std::unordered_map EncoderLayerFusion::DefinePatterns() patterns[kPatternTEncoderLayerPost] = DefinePatternEncoderLayer(true); patterns[kPatternTEncoderLayerPostNorm] = DefinePatternEncoderLayer(true, true); patterns[kPatternTEncoderLayerPreNorm] = DefinePatternEncoderLayer(false, true); - patterns[kPatternEncoderLayerT5] = DefinePatternEncoderLayer(false, true, true); + patterns[kPatternEncoderLayerT5] = DefinePatternEncoderLayer(false, false, true); return patterns; } @@ -242,6 +242,10 @@ AnfNodePtr EncoderLayerFusion::Process(const std::string &pattern_name, const mi if (func_graph == nullptr || node == nullptr || equiv == nullptr) { return nullptr; } + std::cout<<"found pattern = "<< pattern_name <GetAttr(ops::kPositionBias) != nullptr) { is_position_bias_ = attn_prim->get_position_bias(); } - auto layrn1_input = GetAttribute(func_graph, equiv, is_layernorm1_); - auto layrn1_prim = ops::GetOperator(layrn1_input); - if (layrn1_prim->GetAttr(ops::kEpsilon) != nullptr) { - *eps1 = layrn1_prim->get_epsilon(); - } - auto layrn2_input = GetAttribute(func_graph, equiv, is_layernorm2_); - auto layrn2_prim = ops::GetOperator(layrn2_input); - if (layrn2_prim->GetAttr(ops::kEpsilon) != nullptr) { - *eps2 = layrn2_prim->get_epsilon(); + if (is_layernorm_fusion_) { + auto layrn1_input = 
GetAttribute(func_graph, equiv, is_layernorm1_); + auto layrn1_prim = ops::GetOperator(layrn1_input); + if (layrn1_prim->GetAttr(ops::kEpsilon) != nullptr) { + *eps1 = layrn1_prim->get_epsilon(); + } + auto layrn2_input = GetAttribute(func_graph, equiv, is_layernorm2_); + auto layrn2_prim = ops::GetOperator(layrn2_input); + if (layrn2_prim->GetAttr(ops::kEpsilon) != nullptr) { + *eps2 = layrn2_prim->get_epsilon(); + } } - if (!IsActGELU(func_graph, equiv, is_act_)) { - return RET_ERROR; + if (!is_position_bias_) { + if (!IsActGELU(func_graph, equiv, is_act_)) { + return RET_ERROR; + } } return RET_OK; } diff --git a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h index 4f05e809d31..bef47951ee7 100644 --- a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h +++ b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h @@ -83,6 +83,7 @@ class EncoderLayerFusion : public MultiplePatternProcessPass { mutable VarPtr is_layernorm1_{nullptr}; mutable VarPtr is_layernorm2_{nullptr}; mutable bool is_position_bias_{false}; + mutable bool is_layernorm_fusion_{false}; mutable VarPtr is_act_{nullptr}; }; } // namespace opt diff --git a/third_party/patch/fast_transformer/001-fast_transformer.patch b/third_party/patch/fast_transformer/001-fast_transformer.patch index 020fcefb4e3..a58bdfd0c64 100644 --- a/third_party/patch/fast_transformer/001-fast_transformer.patch +++ b/third_party/patch/fast_transformer/001-fast_transformer.patch @@ -132,7 +132,7 @@ index 8707220..c9369e0 100644 target_link_libraries(trt_fused_multi_head_attention PUBLIC -lcublas -lcudart) set_property(TARGET trt_fused_multi_head_attention PROPERTY POSITION_INDEPENDENT_CODE ON) diff --git a/CMakeLists.txt b/CMakeLists.txt -index ea21014..be872d9 100644 +index ea21014..e3d61e7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,7 +14,9 @@ @@ -195,16 +195,15 @@ index ea21014..be872d9 100644 ######################################## if(BUILD_MULTI_GPU) -@@ -249,6 +256,8 @@ add_library(transformer-static STATIC +@@ -249,6 +256,7 @@ add_library(transformer-static STATIC $ $ $ -+ $ -+ $ ++ $ $ $ $ -@@ -313,8 +322,9 @@ add_library(transformer-static STATIC +@@ -313,8 +321,9 @@ add_library(transformer-static STATIC set_property(TARGET transformer-static PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET transformer-static PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(transformer-static PUBLIC -lcudart -lnccl -lmpi -lcublas -lcublasLt -lcurand) @@ -215,15 +214,14 @@ index ea21014..be872d9 100644 $ $ $ -@@ -324,29 +334,11 @@ add_library(transformer-shared SHARED +@@ -324,29 +333,10 @@ add_library(transformer-shared SHARED $ $ $ - $ - $ - $ -+ $ -+ $ ++ $ $ - $ $ @@ -247,7 +245,7 @@ index ea21014..be872d9 100644 $ $ $ -@@ -373,9 +365,7 @@ add_library(transformer-shared SHARED +@@ -373,9 +363,7 @@ add_library(transformer-shared SHARED $ $ $ @@ -257,7 +255,7 @@ index ea21014..be872d9 100644 $ $ $ -@@ -387,14 +377,22 @@ add_library(transformer-shared SHARED +@@ -387,14 +375,22 @@ add_library(transformer-shared SHARED $ $ $) @@ -282,7 +280,7 @@ index ea21014..be872d9 100644 include(CMakePackageConfigHelpers) configure_package_config_file( ${CMAKE_CURRENT_LIST_DIR}/cmake/FasterTransformerConfig.cmake.in -@@ -402,52 +400,23 @@ configure_package_config_file( +@@ -402,52 +398,23 @@ configure_package_config_file( INSTALL_DESTINATION ${INSTALL_CONFIGDIR} ) @@ -353,7 +351,7 @@ index a60983c..45b5374 100644 diff --git a/deploy.sh b/deploy.sh 
new file mode 100755 -index 0000000..8300058 +index 0000000..0e60c1a --- /dev/null +++ b/deploy.sh @@ -0,0 +1,27 @@ @@ -373,7 +371,7 @@ index 0000000..8300058 +shift +rsync -v ${file} ${server}:${file} +echo "file=${file}" -+rsync -v ${base}/../mindspore/trc/transformer/*.fp32 ${server}:${base}/build/bin ++rsync -v ${base}/../mindspore/trc/transformer/*.fp* ${server}:${base}/build/bin +rsync -v ${base}/build/lib/*.so ${server}:${base}/build/lib +# echo "cd ${base}/build/bin/" +command=$(cat <<-ENDM @@ -424,7 +422,7 @@ index cacb09e..5fec0c9 100644 else if (std::is_same::value) { diff --git a/examples/cpp/ms/CMakeLists.txt b/examples/cpp/ms/CMakeLists.txt new file mode 100644 -index 0000000..52f9a5e +index 0000000..33e562b --- /dev/null +++ b/examples/cpp/ms/CMakeLists.txt @@ -0,0 +1,22 @@ @@ -445,25 +443,23 @@ index 0000000..52f9a5e +add_executable(ms_benchmark ms.cc) +if (SPARSITY_SUPPORT) +# target_link_libraries(ms_benchmark PUBLIC -lcublas -lcublasLt -lcudart -lcusparse -lcusparseLt transformer-shared) -+target_link_libraries(ms_benchmark PUBLIC -lcublas -lcublasLt -lcudart -lcusparse -lcusparseLt GptContextAttentionLayer EncoderLayer DecoderLayer) ++target_link_libraries(ms_benchmark PUBLIC -lcublas -lcublasLt -lcudart -lcusparse -lcusparseLt GptContextAttentionLayer MSLayer) +else() +# target_link_libraries(ms_benchmark PUBLIC -lcublas -lcublasLt -lcudart transformer-shared) -+target_link_libraries(ms_benchmark PUBLIC -lcublas -lcublasLt -lcudart GptContextAttentionLayer EncoderLayer DecoderLayer) ++target_link_libraries(ms_benchmark PUBLIC -lcublas -lcublasLt -lcudart GptContextAttentionLayer MSLayer) +endif() diff --git a/examples/cpp/ms/initialize.h b/examples/cpp/ms/initialize.h new file mode 100644 -index 0000000..db057ad +index 0000000..06ec2b2 --- /dev/null +++ b/examples/cpp/ms/initialize.h -@@ -0,0 +1,783 @@ +@@ -0,0 +1,788 @@ +#pragma once + -+#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h" -+#include "src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h" -+#include "src/fastertransformer/layers/decoder_layers/DecoderLayerWeight.h" -+#include "src/fastertransformer/layers/decoder_layers/MSDecoderLayer.h" -+#include "src/fastertransformer/layers/encoder_layers/EncoderLayerWeight.h" -+#include "src/fastertransformer/layers/encoder_layers/MSEncoderLayer.h" ++#include "src/fastertransformer/layers/ms_layers/MSLayerWeight.h" ++#include "src/fastertransformer/layers/ms_layers/MSAttentionLayer.h" ++#include "src/fastertransformer/layers/ms_layers/MSDecoderLayer.h" ++#include "src/fastertransformer/layers/ms_layers/MSEncoderLayer.h" +using namespace fastertransformer; +struct opt_arg { + size_t batch_size; @@ -494,7 +490,7 @@ index 0000000..db057ad + std::vector output_tensors; // GPU + std::vector output_python_tensors; // CPU + std::vector w_tensors; -+ BaseAttentionLayer* Attn; ++ BaseMSLayer* Attn; + // +}; +template @@ -504,7 +500,7 @@ index 0000000..db057ad + std::vector output_tensors; // GPU + std::vector output_python_tensors; // CPU + std::vector w_tensors; -+ BaseEncoderLayer* Encoder; ++ BaseMSLayer* Encoder; + // +}; +template @@ -514,7 +510,7 @@ index 0000000..db057ad + std::vector output_tensors; // GPU + std::vector output_python_tensors; // CPU + std::vector w_tensors; -+ BaseDecoderLayer* Decoder; ++ BaseMSLayer* Decoder; + // +}; +typedef enum { @@ -1017,6 +1013,7 @@ index 0000000..db057ad + opt_a->post_layernorm_residual, + opt_a->position_bias1, + opt_a->position_bias2, ++ opt_a->is_ffn_fp16, + stream, + 
cublas_wrapper, + cublas_handle, @@ -1042,11 +1039,17 @@ index 0000000..db057ad + desc.input_python_tensors.push_back(Tensor{ + MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->tgt_seq_len, opt_a->seq_len}, 0}); + ++ // desc.output_tensors.push_back(Tensor{ ++ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ ++ // desc.output_python_tensors.push_back(Tensor{ ++ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ + desc.output_tensors.push_back(Tensor{ -+ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ MEMORY_GPU, getTensorType(), std::vector{640/4}, 0}); + + desc.output_python_tensors.push_back(Tensor{ -+ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ MEMORY_CPU, getTensorType(), std::vector{640/4}, 0}); + + desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ + opt_a->hidden_size}, 0}); //G1 @@ -1153,7 +1156,7 @@ index 0000000..db057ad +} + +template -+void InitWeight(opt_arg* opt_a, AttentionWeight& attn_weights, std::vector w_tensors) ++void InitWeight(opt_arg* opt_a, AttentionLayerWeight& attn_weights, std::vector w_tensors) +{ + int modelId = ModelNum(opt_a->model_name); + if (modelId == MHA_X1) { @@ -1192,10 +1195,10 @@ index 0000000..db057ad +{ + int modelId = ModelNum(opt_a->model_name); + if (modelId == TEL) { -+ encoder_weights.qkv_weight.kernel = (const T*)w_tensors[2].data; -+ encoder_weights.qkv_weight.bias = (const T*)w_tensors[3].data; -+ encoder_weights.attention_layer_output_weight.kernel = (const T*)w_tensors[4].data; -+ encoder_weights.attention_layer_output_weight.bias = (const T*)w_tensors[5].data; ++ encoder_weights.attention.query_weight.kernel = (const T*)w_tensors[2].data; ++ encoder_weights.attention.query_weight.bias = (const T*)w_tensors[3].data; ++ encoder_weights.attention.attention_output_weight.kernel = (const T*)w_tensors[4].data; ++ encoder_weights.attention.attention_output_weight.bias = (const T*)w_tensors[5].data; + encoder_weights.layernorm1.gamma = (const T*)w_tensors[0].data; + encoder_weights.layernorm1.beta = (const T*)w_tensors[1].data; + encoder_weights.layernorm2.gamma = (const T*)w_tensors[6].data; @@ -1216,18 +1219,18 @@ index 0000000..db057ad + if (modelId == TDL) { + decoder_weights.layernorm1.gamma = (const T*)w_tensors[0].data; + decoder_weights.layernorm1.beta = (const T*)w_tensors[1].data; -+ decoder_weights.attention_qkv_weight.kernel = (const T*)w_tensors[2].data; -+ decoder_weights.attention_qkv_weight.bias = (const T*)w_tensors[3].data; -+ decoder_weights.attention_layer_output_weight.kernel = (const T*)w_tensors[4].data; -+ decoder_weights.attention_layer_output_weight.bias = (const T*)w_tensors[5].data; ++ decoder_weights.attention.query_weight.kernel = (const T*)w_tensors[2].data; ++ decoder_weights.attention.query_weight.bias = (const T*)w_tensors[3].data; ++ decoder_weights.attention.attention_output_weight.kernel = (const T*)w_tensors[4].data; ++ decoder_weights.attention.attention_output_weight.bias = (const T*)w_tensors[5].data; + decoder_weights.layernorm2.gamma = (const T*)w_tensors[6].data; + decoder_weights.layernorm2.beta = (const T*)w_tensors[7].data; -+ decoder_weights.attention_cross_q_weight.kernel = (const T*)w_tensors[8].data; -+ decoder_weights.attention_cross_kv_weight.kernel = (const T*)w_tensors[9].data; -+ 
decoder_weights.attention_cross_kv_weight.bias = (const T*)w_tensors[10].data; -+ decoder_weights.attention_cross_q_weight.bias = (const T*)w_tensors[10].data; -+ decoder_weights.attention_cross_layer_output_weight.kernel = (const T*)w_tensors[11].data; -+ decoder_weights.attention_cross_layer_output_weight.bias = (const T*)w_tensors[12].data; ++ decoder_weights.cross_attention.query_weight.kernel = (const T*)w_tensors[8].data; ++ decoder_weights.cross_attention.key_weight.kernel = (const T*)w_tensors[9].data; ++ decoder_weights.cross_attention.query_weight.bias = (const T*)w_tensors[10].data; ++ decoder_weights.cross_attention.key_weight.bias = (const T*)w_tensors[10].data; ++ decoder_weights.cross_attention.attention_output_weight.kernel = (const T*)w_tensors[11].data; ++ decoder_weights.cross_attention.attention_output_weight.bias = (const T*)w_tensors[12].data; + decoder_weights.layernorm3.gamma = (const T*)w_tensors[13].data; + decoder_weights.layernorm3.beta = (const T*)w_tensors[14].data; + decoder_weights.decoder_output_mapping.kernel = (const T*)w_tensors[15].data; @@ -1241,10 +1244,10 @@ index 0000000..db057ad +} diff --git a/examples/cpp/ms/ms.cc b/examples/cpp/ms/ms.cc new file mode 100644 -index 0000000..b7992c6 +index 0000000..3f35c7b --- /dev/null +++ b/examples/cpp/ms/ms.cc -@@ -0,0 +1,686 @@ +@@ -0,0 +1,670 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. + * @@ -1261,12 +1264,10 @@ index 0000000..b7992c6 + * limitations under the License. + */ +#include "examples/cpp/ms/initialize.h" -+// #include "src/fastertransformer/layers/attention_layers/AttentionWeight.h" -+// #include "src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h" -+// #include "src/fastertransformer/layers/encoder_layers/EncoderLayerWeight.h" -+// #include "src/fastertransformer/layers/encoder_layers/MSEncoderLayer.h" -+#include "src/fastertransformer/layers/decoder_layers/DecoderLayerWeight.h" -+#include "src/fastertransformer/layers/decoder_layers/MSDecoderLayer.h" ++// #include "src/fastertransformer/layers/attention_layers/MSLayerWeight.h" ++// #include "src/fastertransformer/layers/ms_layers/MSAttentionLayer.h" ++// #include "src/fastertransformer/layers/ms_layers/MSEncoderLayer.h" ++// #include "src/fastertransformer/layers/ms_layers/MSDecoderLayer.h" +#include "src/fastertransformer/utils/logger.h" +#include +#include @@ -1696,34 +1697,20 @@ index 0000000..b7992c6 + } + if(opt_a->model_name == "transformer_decoder_layer" || opt_a->model_name == "transformer_decoder_layer_t5") { + DecriptorDecoderLayer desc; -+ InitD(opt_a, desc, stream, &cublas_wrapper, &cublas_handle, &allocator); -+ std::cout<<"input: "<(opt_a, desc, stream, &cublas_wrapper, &cublas_handle, &allocator); + int res = ReadTensors(desc.input_tensors, std::string("input"), opt_a); + FT_CHECK(!res); -+ std::cout<<"input_tensors"<(desc.input_python_tensors, std::string("input"), opt_a); + FT_CHECK(!res); -+ std::cout<<"input_python_tensors"<(desc.output_tensors, std::string("output"), opt_a, false); + FT_CHECK(!res); -+ std::cout<<"output_tensors"<(desc.output_python_tensors, std::string("output"), opt_a); + FT_CHECK(!res); -+ std::cout<<"output_python_tensors\n"; -+ + res = ReadTensors(desc.w_tensors, std::string("weight"), opt_a); + FT_CHECK(!res); -+ std::cout<<"DecoderLayerWeight\n"; -+ std::cout<<"input: "< decoder_weights; + InitWeightDecoder(opt_a, decoder_weights, desc.w_tensors); + // // test for BE !! 
-+ std::cout<<"initDecoderLayerWeight\n"; + desc.Decoder->forward(&desc.output_tensors, &desc.input_tensors, &decoder_weights); + + CompareOutput(desc.output_python_tensors, desc.output_tensors); @@ -1871,7 +1858,7 @@ index 0000000..b7992c6 + std::cout << "weights size not encoder: " << CalcTensorsSize(desc.w_tensors) << std::endl; + std::cout << "ouputs size not encoder: " << CalcTensorsSize(desc.output_tensors) << std::endl; + -+ AttentionWeight attn_weights; ++ AttentionLayerWeight attn_weights; + InitWeight(opt_a, attn_weights, desc.w_tensors); + + // test for BE !! @@ -2038,10 +2025,10 @@ index 7ff8e0f..e1be64c 100644 template void invokeAddBias(float* out, const float* bias, const int m, const int n, cudaStream_t stream); diff --git a/src/fastertransformer/kernels/add_residual_kernels.cu b/src/fastertransformer/kernels/add_residual_kernels.cu -index 4cd9f0f..fe8bdf0 100644 +index 4cd9f0f..7e4c4b3 100644 --- a/src/fastertransformer/kernels/add_residual_kernels.cu +++ b/src/fastertransformer/kernels/add_residual_kernels.cu -@@ -29,6 +29,18 @@ __global__ void addBiasResidual(T* output, const T* input, const T* bias, const +@@ -29,6 +29,30 @@ __global__ void addBiasResidual(T* output, const T* input, const T* bias, const } } @@ -2056,11 +2043,23 @@ index 4cd9f0f..fe8bdf0 100644 + (S)((T)output[blockIdx.x * n + col_index] + (T)input[blockIdx.x * n + col_index] + bias_val); + } +} ++ ++template ++__global__ void addBiasResidualSameTypeCast(U* output, const U* input, T* out, const T* bias, const int m, const int n) ++{ ++ S *out_cast = (S*)out; ++ const int col_index = blockIdx.y * blockDim.x + threadIdx.x; ++ if (col_index < n) { ++ T bias_val = (bias == nullptr) ? (T)(0.0f) : bias[col_index]; ++ out_cast[blockIdx.x * n + col_index] = ++ (S)((T)output[blockIdx.x * n + col_index] + (T)input[blockIdx.x * n + col_index] + bias_val); ++ } ++} + template void invokeAddBiasResidual(T* output, const T* input, const T* bias, const int m, const int n, cudaStream_t stream) { -@@ -38,6 +50,20 @@ void invokeAddBiasResidual(T* output, const T* input, const T* bias, const int m +@@ -38,6 +62,31 @@ void invokeAddBiasResidual(T* output, const T* input, const T* bias, const int m addBiasResidual<<>>(output, input, bias, m, n); } @@ -2073,15 +2072,26 @@ index 4cd9f0f..fe8bdf0 100644 + addBiasResidualCast<<>>(output, input, out, bias, m, n); +} + ++template ++void invokeAddBiasResidualSameTypeCast(U* output, const U* input, T* out, const T* bias, const int m, const int n, cudaStream_t stream) ++{ ++ int blocks_per_row = ceil(float(n) / 1024); ++ dim3 grid(m, blocks_per_row); ++ dim3 block(min(n, 1024)); ++ addBiasResidualSameTypeCast<<>>(output, input, out, bias, m, n); ++} ++ +template void invokeAddBiasResidualCast(half* output, const float* input, float* out, const float* bias, const int m, const int n, cudaStream_t stream); +template void invokeAddBiasResidualCast(float* output, const float* input, float* out, const float* bias, const int m, const int n, cudaStream_t stream); +template void invokeAddBiasResidualCast(float* output, const float* input, float* out, const float* bias, const int m, const int n, cudaStream_t stream); +template void invokeAddBiasResidualCast(half* output, const float* input, float* out, const float* bias, const int m, const int n, cudaStream_t stream); ++ ++template void invokeAddBiasResidualSameTypeCast(half* output, const half* input, float* out, const float* bias, const int m, const int n, cudaStream_t stream); + template __global__ void addBiasAttentionFfnResidual(T* 
block_output, const T* ffn_output, -@@ -88,12 +114,8 @@ void invokeAddBiasAttentionFfnResidual(T* block_output, +@@ -88,12 +137,8 @@ void invokeAddBiasAttentionFfnResidual(T* block_output, } } @@ -2097,15 +2107,18 @@ index 4cd9f0f..fe8bdf0 100644 template void invokeAddBiasResidual(__nv_bfloat16* output, const __nv_bfloat16* input, diff --git a/src/fastertransformer/kernels/add_residual_kernels.h b/src/fastertransformer/kernels/add_residual_kernels.h -index edd8179..7173964 100644 +index edd8179..75f26f9 100644 --- a/src/fastertransformer/kernels/add_residual_kernels.h +++ b/src/fastertransformer/kernels/add_residual_kernels.h -@@ -65,4 +65,8 @@ void invokeAddBiasResidualCol32(T* output, +@@ -65,4 +65,11 @@ void invokeAddBiasResidualCol32(T* output, const float* input1_amax_ptr, const int scale_is_vector = 0); +template +void invokeAddBiasResidualCast(U* output, const T* input, T* out, const T* bias, const int m, const int n, cudaStream_t stream); ++ ++template ++void invokeAddBiasResidualSameTypeCast(U* output, const U* input, T* out, const T* bias, const int m, const int n, cudaStream_t stream); + } // namespace fastertransformer + @@ -5594,19 +5607,18 @@ index be8b178..e9b4310 100644 + } // namespace fastertransformer diff --git a/src/fastertransformer/layers/CMakeLists.txt b/src/fastertransformer/layers/CMakeLists.txt -index cbaf4fa..a9fe6e6 100644 +index cbaf4fa..49779bf 100644 --- a/src/fastertransformer/layers/CMakeLists.txt +++ b/src/fastertransformer/layers/CMakeLists.txt -@@ -14,6 +14,8 @@ +@@ -14,6 +14,7 @@ cmake_minimum_required(VERSION 3.8) -+add_subdirectory(encoder_layers) -+add_subdirectory(decoder_layers) ++add_subdirectory(ms_layers) add_subdirectory(attention_layers) add_subdirectory(attention_layers_int8) add_subdirectory(xlnet_attention_layers) -@@ -30,15 +32,18 @@ set_property(TARGET FfnLayerINT8 PROPERTY POSITION_INDEPENDENT_CODE ON) +@@ -30,15 +31,18 @@ set_property(TARGET FfnLayerINT8 PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET FfnLayerINT8 PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(FfnLayerINT8 PUBLIC -lcublasLt -lcublas -lcudart cublasMMWrapper cublasINT8MMWrapper activation_int8_kernels memory_utils) @@ -5657,7 +5669,7 @@ index b21e3a7..746cb71 100644 cublasMMWrapper* cublas_wrapper, IAllocator* allocator, diff --git a/src/fastertransformer/layers/attention_layers/CMakeLists.txt b/src/fastertransformer/layers/attention_layers/CMakeLists.txt -index 9cef315..f9c9cde 100644 +index 9cef315..7170af4 100644 --- a/src/fastertransformer/layers/attention_layers/CMakeLists.txt +++ b/src/fastertransformer/layers/attention_layers/CMakeLists.txt @@ -42,8 +42,8 @@ target_link_libraries(DecoderSelfAttentionLayer PUBLIC -lcublas -lcudart cublasM @@ -5666,7 +5678,7 @@ index 9cef315..f9c9cde 100644 set_property(TARGET GptContextAttentionLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) -target_link_libraries(GptContextAttentionLayer PUBLIC -lcublas -lcudart cublasMMWrapper memory_utils unfused_attention_kernels) - -+target_link_libraries(GptContextAttentionLayer PUBLIC -lcublas -lcudart cublasMMWrapper memory_utils unfused_attention_kernels activation_kernels EncoderLayer) ++target_link_libraries(GptContextAttentionLayer PUBLIC -lcublas -lcudart cublasMMWrapper memory_utils unfused_attention_kernels activation_kernels) +if(EXAMPLES) add_library(TensorParallelDecoderSelfAttentionLayer STATIC TensorParallelDecoderSelfAttentionLayer.cc) set_property(TARGET TensorParallelDecoderSelfAttentionLayer PROPERTY POSITION_INDEPENDENT_CODE ON) @@ -5682,7 
+5694,7 @@ index 9cef315..f9c9cde 100644 diff --git a/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc old mode 100644 new mode 100755 -index bada640..e214b82 +index bada640..2415ac2 --- a/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc @@ -16,10 +16,39 @@ @@ -5757,151 +5769,151 @@ index bada640..e214b82 +// HAIM Playground MS-MHA + -+template -+MSMHALayer::MSMHALayer(size_t max_batch_size, -+ size_t max_src_seq_len, -+ size_t max_tgt_seq_len, -+ size_t head_num, -+ size_t size_per_head, -+ cudaStream_t stream, -+ cublasMMWrapper* cublas_wrapper, -+ IAllocator* allocator, -+ bool is_free_buffer_after_forward, -+ bool is_qk_buf_float, -+ bool is_cross, -+ bool sparse, -+ bool is_position_bias): -+ BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse) -+{ -+ cublasHandle_t cublas_handle; -+ cublasCreate(&cublas_handle); -+ cublasSetStream(cublas_handle, stream); -+ -+ params_.batch_size = max_batch_size; -+ params_.src_seq_len = max_src_seq_len; -+ params_.tgt_seq_len = max_tgt_seq_len; -+ params_.head_num = head_num; -+ params_.head_size = size_per_head; -+ params_.hidden_size = head_num * size_per_head; -+ params_.cublas_handle = cublas_handle; -+ params_.stream = stream; -+ // ctrls -+ params_.in_idx = 0; -+ params_.qkv_bias = !is_position_bias; -+ params_.projection_bias = !is_position_bias; -+ params_.is_cross = is_cross; -+ params_.position_bias = is_position_bias; -+ params_.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; -+} -+template -+void MSMHALayer::allocateBuffer() -+{ -+ if (buf_ == nullptr) { -+ size_t buff_size = GetAttnWorkspaceSize(¶ms_); -+ buf_ = reinterpret_cast(allocator_->reMalloc(buf_, buff_size, true)); -+ } -+} -+template -+void MSMHALayer::forward(std::vector* output_tensors, -+ const std::vector* input_tensors, -+ const AttentionWeight* attention_weights) -+{ -+ // input_tensors: use 1 gemm -- multi head attention -+ // input_query [batch_size * seq_len, hidden_dimension] -+ // attention_mask [batch_size, 1, seq_len, seq_len] -+ -+ // input_tensors: use 2 gemm -- cross attention -+ // input_query [batch_size * seq_len, hidden_dimension] -+ // enc_output [batch_size * tgt_len, hidden_dimension] -+ // attention_mask [batch_size, 1, seq_len, seq_len] -+ -+ // output_tensors: -+ // attention_out [batch_size * seq_len, hidden_dimension] -+ // key_cache [batch, local_head_num, size_per_head // x, max_seq_len, x] -+ // value_cache [batch, local_head_num, max_seq_len, size_per_head] -+ -+ int in_tensor_number = input_tensors->size(); -+ allocateBuffer(); // only once -+ if (params_.position_bias) -+ if (params_.is_cross) { -+ void* outputs[] = {(void*)output_tensors->at(0).data}; -+ void* inputs[] = {(void*)input_tensors->at(0).data, -+ (void*)input_tensors->at(1).data, -+ (void*)attention_weights->query_weight.kernel, -+ (void*)attention_weights->key_weight.kernel, -+ (void*)input_tensors->at(2).data, -+ (void*)input_tensors->at(3).data, -+ (void*)attention_weights->attention_output_weight.kernel}; -+ forward_attn((T**)inputs, 7, (T**)outputs, 1, ¶ms_, (void*)buf_); -+ } -+ else { -+ void* outputs[] = {(void*)output_tensors->at(0).data}; -+ void* inputs[] = { -+ (void*)input_tensors->at(0).data, -+ (void*)attention_weights->query_weight.kernel, -+ (void*)input_tensors->at(1).data, -+ (void*)input_tensors->at(2).data, -+ 
(void*)attention_weights->attention_output_weight.kernel -+ }; -+ forward_attn((T**)inputs, 5, (T**)outputs, 1, ¶ms_, (void*)buf_); -+ } -+ else { -+ if (params_.is_cross) { -+ void* outputs[] = {(void*)output_tensors->at(0).data}; -+ void* inputs[] = {(void*)input_tensors->at(0).data, -+ (void*)input_tensors->at(1).data, -+ (void*)attention_weights->query_weight.kernel, -+ (void*)attention_weights->key_weight.kernel, -+ (void*)attention_weights->query_weight.bias, -+ (void*)input_tensors->at(2).data, -+ (void*)attention_weights->attention_output_weight.kernel, -+ (void*)attention_weights->attention_output_weight.bias -+ }; -+ forward_attn((T**)inputs, 8, (T**)outputs, 1, ¶ms_, (void*)buf_); -+ } -+ else { -+ void* outputs[] = {(void*)output_tensors->at(0).data}; -+ void* inputs[] = {(void*)input_tensors->at(0).data, -+ (void*)attention_weights->query_weight.kernel, -+ (void*)attention_weights->query_weight.bias, -+ (void*)input_tensors->at(1).data, -+ (void*)attention_weights->attention_output_weight.kernel, -+ (void*)attention_weights->attention_output_weight.bias}; -+ forward_attn((T**)inputs, 6, (T**)outputs, 1, ¶ms_, (void*)buf_); -+ } -+ } -+} -+ -+ template -+ MSMHALayer::~MSMHALayer() -+ { -+ cublas_wrapper_ = nullptr; -+ freeBuffer(); -+ } -+ -+ template -+ void MSMHALayer::freeBuffer() -+ { -+ if (buf_ != nullptr) { -+ allocator_->free(buf_); -+ buf_ = nullptr; -+ } -+ } ++// template ++// MSMHALayer::MSMHALayer(size_t max_batch_size, ++// size_t max_src_seq_len, ++// size_t max_tgt_seq_len, ++// size_t head_num, ++// size_t size_per_head, ++// cudaStream_t stream, ++// cublasMMWrapper* cublas_wrapper, ++// IAllocator* allocator, ++// bool is_free_buffer_after_forward, ++// bool is_qk_buf_float, ++// bool is_cross, ++// bool sparse, ++// bool is_position_bias): ++// BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse) ++// { ++// cublasHandle_t cublas_handle; ++// cublasCreate(&cublas_handle); ++// cublasSetStream(cublas_handle, stream); ++ ++// // params_.batch_size = max_batch_size; ++// // params_.src_seq_len = max_src_seq_len; ++// // params_.tgt_seq_len = max_tgt_seq_len; ++// // params_.head_num = head_num; ++// // params_.head_size = size_per_head; ++// // params_.hidden_size = head_num * size_per_head; ++// // params_.cublas_handle = cublas_handle; ++// // params_.stream = stream; ++// // // ctrls ++// // params_.in_idx = 0; ++// // params_.qkv_bias = !is_position_bias; ++// // params_.projection_bias = !is_position_bias; ++// // params_.is_cross = is_cross; ++// // params_.position_bias = is_position_bias; ++// // params_.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; ++// } ++// template ++// void MSMHALayer::allocateBuffer() ++// { ++// if (buf_ == nullptr) { ++// // size_t buff_size = GetAttnWorkspaceSize(¶ms_); ++// // buf_ = reinterpret_cast(allocator_->reMalloc(buf_, buff_size, true)); ++// } ++// } ++// template ++// void MSMHALayer::forward(std::vector* output_tensors, ++// const std::vector* input_tensors, ++// const AttentionWeight* attention_weights) ++// { ++// // input_tensors: use 1 gemm -- multi head attention ++// // input_query [batch_size * seq_len, hidden_dimension] ++// // attention_mask [batch_size, 1, seq_len, seq_len] ++ ++// // input_tensors: use 2 gemm -- cross attention ++// // input_query [batch_size * seq_len, hidden_dimension] ++// // enc_output [batch_size * tgt_len, hidden_dimension] ++// // attention_mask [batch_size, 1, seq_len, seq_len] ++ ++// // output_tensors: ++// // attention_out [batch_size * seq_len, 
hidden_dimension] ++// // key_cache [batch, local_head_num, size_per_head // x, max_seq_len, x] ++// // value_cache [batch, local_head_num, max_seq_len, size_per_head] ++ ++// int in_tensor_number = input_tensors->size(); ++// allocateBuffer(); // only once ++// // if (params_.position_bias) ++// // if (params_.is_cross) { ++// // void* outputs[] = {(void*)output_tensors->at(0).data}; ++// // void* inputs[] = {(void*)input_tensors->at(0).data, ++// // (void*)input_tensors->at(1).data, ++// // (void*)attention_weights->query_weight.kernel, ++// // (void*)attention_weights->key_weight.kernel, ++// // (void*)input_tensors->at(2).data, ++// // (void*)input_tensors->at(3).data, ++// // (void*)attention_weights->attention_output_weight.kernel}; ++// // forward_attn((T**)inputs, 7, (T**)outputs, 1, ¶ms_, (void*)buf_); ++// // } ++// // else { ++// // void* outputs[] = {(void*)output_tensors->at(0).data}; ++// // void* inputs[] = { ++// // (void*)input_tensors->at(0).data, ++// // (void*)attention_weights->query_weight.kernel, ++// // (void*)input_tensors->at(1).data, ++// // (void*)input_tensors->at(2).data, ++// // (void*)attention_weights->attention_output_weight.kernel ++// // }; ++// // forward_attn((T**)inputs, 5, (T**)outputs, 1, ¶ms_, (void*)buf_); ++// // } ++// // else { ++// // if (params_.is_cross) { ++// // void* outputs[] = {(void*)output_tensors->at(0).data}; ++// // void* inputs[] = {(void*)input_tensors->at(0).data, ++// // (void*)input_tensors->at(1).data, ++// // (void*)attention_weights->query_weight.kernel, ++// // (void*)attention_weights->key_weight.kernel, ++// // (void*)attention_weights->query_weight.bias, ++// // (void*)input_tensors->at(2).data, ++// // (void*)attention_weights->attention_output_weight.kernel, ++// // (void*)attention_weights->attention_output_weight.bias ++// // }; ++// // forward_attn((T**)inputs, 8, (T**)outputs, 1, ¶ms_, (void*)buf_); ++// // } ++// // else { ++// // void* outputs[] = {(void*)output_tensors->at(0).data}; ++// // void* inputs[] = {(void*)input_tensors->at(0).data, ++// // (void*)attention_weights->query_weight.kernel, ++// // (void*)attention_weights->query_weight.bias, ++// // (void*)input_tensors->at(1).data, ++// // (void*)attention_weights->attention_output_weight.kernel, ++// // (void*)attention_weights->attention_output_weight.bias}; ++// // forward_attn((T**)inputs, 6, (T**)outputs, 1, ¶ms_, (void*)buf_); ++// // } ++// } ++ ++ ++// // template ++// // MSMHALayer::~MSMHALayer() ++// // { ++// // // cublas_wrapper_ = nullptr; ++// // freeBuffer(); ++// // } ++ ++// template ++// void MSMHALayer::freeBuffer() ++// { ++// if (buf_ != nullptr) { ++// allocator_->free(buf_); ++// buf_ = nullptr; ++// } ++// } + -+ template class MSMHALayer; -+ template class MSMHALayer; -+ template class MSMHALayer; -+ template class MSMHALayer; -+ template class MSMHALayer; -+ template class MSMHALayer; -+ template class MSMHALayer; -+ template class MSMHALayer; ++ // template class MSMHALayer; ++ // template class MSMHALayer; ++ // template class MSMHALayer; ++ // template class MSMHALayer; ++ // template class MSMHALayer; ++ // template class MSMHALayer; ++ // template class MSMHALayer; ++ // template class MSMHALayer; + } // namespace fastertransformer diff --git a/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h b/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h old mode 100644 new mode 100755 -index 92e2175..f7fa5ca +index 92e2175..39c49c0 --- 
a/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h +++ b/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h @@ -18,7 +18,7 @@ @@ -5909,7 +5921,7 @@ index 92e2175..f7fa5ca #include "src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h" - -+#include "src/fastertransformer/layers/encoder_layers/encoder.h" ++// #include "src/fastertransformer/layers/encoder_layers/encoder.h" namespace fastertransformer { template @@ -5919,127 +5931,127 @@ index 92e2175..f7fa5ca + +// TODO(haim): Add template according to "mix" compute type (fp32, fp16) -+template -+class MSMHALayer: public BaseAttentionLayer { -+private: -+ void allocateBuffer() override; -+ void freeBuffer() override; -+ -+ using BaseAttentionLayer::is_free_buffer_after_forward_; -+ using BaseAttentionLayer::is_allocate_buffer_; -+ using BaseAttentionLayer::cublas_wrapper_; -+ using BaseAttentionLayer::allocator_; -+ -+protected: -+ using BaseAttentionLayer::stream_; -+ using BaseAttentionLayer::sparse_; -+ T* buf_ = nullptr; -+ encoderParamT params_; -+ -+public: -+ MSMHALayer(size_t batch_size, -+ size_t src_seq_len, -+ size_t tgt_seq_len, -+ size_t head_num, -+ size_t size_per_head, -+ cudaStream_t stream, -+ cublasMMWrapper* cublas_wrapper, -+ IAllocator* allocator, -+ bool is_free_buffer_after_forward, -+ bool is_qk_buf_float, -+ bool is_cross, -+ bool sparse = false, -+ bool is_position_bias=false); -+ MSMHALayer(MSMHALayer const& attention_layer); -+ virtual ~MSMHALayer(); -+ void forward(std::vector* output_tensors, -+ const std::vector* input_tensors, -+ const AttentionWeight* attention_weights) override; -+}; ++// template ++// class MSMHALayer: public BaseAttentionLayer { ++// private: ++// void allocateBuffer() override; ++// void freeBuffer() override; ++ ++// using BaseAttentionLayer::is_free_buffer_after_forward_; ++// using BaseAttentionLayer::is_allocate_buffer_; ++// using BaseAttentionLayer::cublas_wrapper_; ++// using BaseAttentionLayer::allocator_; ++ ++// protected: ++// using BaseAttentionLayer::stream_; ++// using BaseAttentionLayer::sparse_; ++// T* buf_ = nullptr; ++// // encoderParamT params_; ++ ++// public: ++// MSMHALayer(size_t batch_size, ++// size_t src_seq_len, ++// size_t tgt_seq_len, ++// size_t head_num, ++// size_t size_per_head, ++// cudaStream_t stream, ++// cublasMMWrapper* cublas_wrapper, ++// IAllocator* allocator, ++// bool is_free_buffer_after_forward, ++// bool is_qk_buf_float, ++// bool is_cross, ++// bool sparse = false, ++// bool is_position_bias=false); ++// MSMHALayer(MSMHALayer const& attention_layer); ++// virtual ~MSMHALayer(); ++// void forward(std::vector* output_tensors, ++// const std::vector* input_tensors, ++// const AttentionWeight* attention_weights) override; ++// }; + } // namespace fastertransformer diff --git a/src/fastertransformer/layers/decoder_layers/BaseDecoderLayer.h b/src/fastertransformer/layers/decoder_layers/BaseDecoderLayer.h new file mode 100644 -index 0000000..849e137 +index 0000000..0a60835 --- /dev/null +++ b/src/fastertransformer/layers/decoder_layers/BaseDecoderLayer.h @@ -0,0 +1,76 @@ -+/* -+ * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. -+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. 
-+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+ * See the License for the specific language governing permissions and -+ * limitations under the License. -+ */ -+ -+#pragma once -+ -+#include -+#include -+ -+#include "3rdparty/trt_fused_multihead_attention/fused_multihead_attention_common.h" -+#include "src/fastertransformer/layers/BaseLayer.h" -+#include "src/fastertransformer/layers/decoder_layers/DecoderLayerWeight.h" -+#include "src/fastertransformer/utils/Tensor.h" -+#include "src/fastertransformer/utils/allocator.h" -+#include "src/fastertransformer/utils/cublasMMWrapper.h" -+#include "src/fastertransformer/utils/memory_utils.h" -+ -+namespace fastertransformer { -+ -+enum class DecoderLayerType { -+ UNFUSED_DECODER_LAYER, -+ FUSED_DECODER_LAYER -+}; -+ -+template -+DecoderLayerType getDecoderLayerType(size_t size_per_head, const int sm, const bool remove_padding, -+ const int max_seq_len, const bool is_fuse = true) { -+ if (std::is_same::value && (sm == kSM_70 || sm == kSM_86 || sm == kSM_80 || sm == kSM_75 || sm == kSM_72) -+ && size_per_head == 64 && max_seq_len <= 384 && is_fuse == true) { -+ return remove_padding ? DecoderLayerType::FUSED_DECODER_LAYER : DecoderLayerType::FUSED_DECODER_LAYER; -+ } else { -+ return remove_padding ? DecoderLayerType::FUSED_DECODER_LAYER : DecoderLayerType::FUSED_DECODER_LAYER; -+ } -+} -+ -+template -+DecoderLayerType getDecoderLayerTypeINT8(size_t size_per_head, const int sm, const bool remove_padding, -+ const int max_seq_len, const int int8_mode) { -+ if ((int8_mode == 1 || int8_mode == 2) && (sm == kSM_86 || sm == kSM_80 || sm == kSM_75) && size_per_head == 64 -+ && max_seq_len <= 384) { -+ return remove_padding ? DecoderLayerType::FUSED_DECODER_LAYER : DecoderLayerType::FUSED_DECODER_LAYER; -+ } else { -+ return remove_padding ? DecoderLayerType::FUSED_DECODER_LAYER : DecoderLayerType::FUSED_DECODER_LAYER; -+ } -+} ++// /* ++// * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. ++// * ++// * Licensed under the Apache License, Version 2.0 (the "License"); ++// * you may not use this file except in compliance with the License. ++// * You may obtain a copy of the License at ++// * ++// * http://www.apache.org/licenses/LICENSE-2.0 ++// * ++// * Unless required by applicable law or agreed to in writing, software ++// * distributed under the License is distributed on an "AS IS" BASIS, ++// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++// * See the License for the specific language governing permissions and ++// * limitations under the License. 
++// */ ++ ++// #pragma once ++ ++// #include ++// #include ++ ++// #include "3rdparty/trt_fused_multihead_attention/fused_multihead_attention_common.h" ++// #include "src/fastertransformer/layers/BaseLayer.h" ++// #include "src/fastertransformer/layers/decoder_layers/DecoderLayerWeight.h" ++// #include "src/fastertransformer/utils/Tensor.h" ++// #include "src/fastertransformer/utils/allocator.h" ++// #include "src/fastertransformer/utils/cublasMMWrapper.h" ++// #include "src/fastertransformer/utils/memory_utils.h" ++ ++// namespace fastertransformer { ++ ++// enum class DecoderLayerType { ++// UNFUSED_DECODER_LAYER, ++// FUSED_DECODER_LAYER ++// }; ++ ++// template ++// DecoderLayerType getDecoderLayerType(size_t size_per_head, const int sm, const bool remove_padding, ++// const int max_seq_len, const bool is_fuse = true) { ++// if (std::is_same::value && (sm == kSM_70 || sm == kSM_86 || sm == kSM_80 || sm == kSM_75 || sm == kSM_72) ++// && size_per_head == 64 && max_seq_len <= 384 && is_fuse == true) { ++// return remove_padding ? DecoderLayerType::FUSED_DECODER_LAYER : DecoderLayerType::FUSED_DECODER_LAYER; ++// } else { ++// return remove_padding ? DecoderLayerType::FUSED_DECODER_LAYER : DecoderLayerType::FUSED_DECODER_LAYER; ++// } ++// } + -+template -+class BaseDecoderLayer: public BaseLayer { ++// template ++// DecoderLayerType getDecoderLayerTypeINT8(size_t size_per_head, const int sm, const bool remove_padding, ++// const int max_seq_len, const int int8_mode) { ++// if ((int8_mode == 1 || int8_mode == 2) && (sm == kSM_86 || sm == kSM_80 || sm == kSM_75) && size_per_head == 64 ++// && max_seq_len <= 384) { ++// return remove_padding ? DecoderLayerType::FUSED_DECODER_LAYER : DecoderLayerType::FUSED_DECODER_LAYER; ++// } else { ++// return remove_padding ? 
DecoderLayerType::FUSED_DECODER_LAYER : DecoderLayerType::FUSED_DECODER_LAYER; ++// } ++// } + -+public: -+ virtual void forward(std::vector* output_tensors, -+ const std::vector* input_tensors, -+ const DecoderLayerWeight* decoder_layer_weights) = 0; -+ BaseDecoderLayer(cudaStream_t stream, -+ cublasMMWrapper* cublas_wrapper, -+ IAllocator* allocator, -+ bool is_free_buffer_after_forward, -+ bool sparse = false): -+ BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, nullptr, sparse) -+ { -+ } -+ virtual ~BaseDecoderLayer() = default; -+}; -+} // namespace fastertransformer ++// template ++// class BaseDecoderLayer: public BaseLayer { ++ ++// public: ++// virtual void forward(std::vector* output_tensors, ++// const std::vector* input_tensors, ++// const DecoderLayerWeight* decoder_layer_weights) = 0; ++// BaseDecoderLayer(cudaStream_t stream, ++// cublasMMWrapper* cublas_wrapper, ++// IAllocator* allocator, ++// bool is_free_buffer_after_forward, ++// bool sparse = false): ++// BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, nullptr, sparse) ++// { ++// } ++// virtual ~BaseDecoderLayer() = default; ++// }; ++// } // namespace fastertransformer diff --git a/src/fastertransformer/layers/decoder_layers/CMakeLists.txt b/src/fastertransformer/layers/decoder_layers/CMakeLists.txt new file mode 100644 index 0000000..e343db9 @@ -6069,7 +6081,7 @@ index 0000000..e343db9 + layernorm_kernels add_residual_kernels bert_preprocess_kernels) diff --git a/src/fastertransformer/layers/decoder_layers/DecoderLayerWeight.h b/src/fastertransformer/layers/decoder_layers/DecoderLayerWeight.h new file mode 100644 -index 0000000..5c73512 +index 0000000..bd31438 --- /dev/null +++ b/src/fastertransformer/layers/decoder_layers/DecoderLayerWeight.h @@ -0,0 +1,37 @@ @@ -6095,27 +6107,27 @@ index 0000000..5c73512 +#include "src/fastertransformer/kernels/layernorm_kernels.h" +namespace fastertransformer { + -+template -+struct DecoderLayerWeight { -+ DenseWeight attention_qkv_weight; -+ DenseWeight attention_layer_output_weight; -+ DenseWeight attention_cross_q_weight; -+ DenseWeight attention_cross_kv_weight; -+ DenseWeight attention_cross_layer_output_weight; -+ DenseWeight decoder_output_mapping; -+ DenseWeight decoder_output_projection; -+ LayerNormWeight layernorm1; -+ LayerNormWeight layernorm2; -+ LayerNormWeight layernorm3; -+}; ++// template ++// struct DecoderLayerWeight { ++// DenseWeight attention_qkv_weight; ++// DenseWeight attention_layer_output_weight; ++// DenseWeight attention_cross_q_weight; ++// DenseWeight attention_cross_kv_weight; ++// DenseWeight attention_cross_layer_output_weight; ++// DenseWeight decoder_output_mapping; ++// DenseWeight decoder_output_projection; ++// LayerNormWeight layernorm1; ++// LayerNormWeight layernorm2; ++// LayerNormWeight layernorm3; ++// }; + +} // namespace fastertransformer diff --git a/src/fastertransformer/layers/decoder_layers/MSDecoderLayer.cc b/src/fastertransformer/layers/decoder_layers/MSDecoderLayer.cc new file mode 100644 -index 0000000..c4ca79b +index 0000000..ae8875d --- /dev/null +++ b/src/fastertransformer/layers/decoder_layers/MSDecoderLayer.cc -@@ -0,0 +1,207 @@ +@@ -0,0 +1,208 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. 
@@ -6170,6 +6182,7 @@ index 0000000..c4ca79b + bool post_layernorm, + bool position_bias1, + bool position_bias2, ++ bool is_ffn_fp16, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + cublasHandle_t* cublas_handle, @@ -6194,7 +6207,7 @@ index 0000000..c4ca79b + // handle + params_.cublas_handle = *cublas_handle; + params_.stream = stream; -+ params_.ffn_fp16 = false; ++ params_.ffn_fp16 = is_ffn_fp16; + // ctrls + params_.in_idx = 0; + params_.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; @@ -6325,89 +6338,90 @@ index 0000000..c4ca79b +} // namespace fastertransformer diff --git a/src/fastertransformer/layers/decoder_layers/MSDecoderLayer.h b/src/fastertransformer/layers/decoder_layers/MSDecoderLayer.h new file mode 100644 -index 0000000..3f7e9cb +index 0000000..8908141 --- /dev/null +++ b/src/fastertransformer/layers/decoder_layers/MSDecoderLayer.h -@@ -0,0 +1,73 @@ -+/* -+ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -+ * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. -+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. -+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+ * See the License for the specific language governing permissions and -+ * limitations under the License. -+ */ -+ -+#pragma once -+ -+#include "src/fastertransformer/layers/decoder_layers/BaseDecoderLayer.h" -+#include "src/fastertransformer/layers/decoder_layers/decoder.h" -+ -+namespace fastertransformer { -+ -+// TODO(haim): Add template according to "mix" compute type (fp32, fp16) -+template -+class MSDLayer: public BaseDecoderLayer { -+private: -+ mutable decoderParamT params_; -+ -+ void allocateBuffer() override; -+ void freeBuffer() override; -+ void* buf_; -+ using BaseDecoderLayer::is_free_buffer_after_forward_; -+ using BaseDecoderLayer::is_allocate_buffer_; -+ using BaseDecoderLayer::cublas_wrapper_; -+ using BaseDecoderLayer::allocator_; -+ -+protected: -+ using BaseDecoderLayer::stream_; -+ using BaseDecoderLayer::sparse_; -+ -+public: -+ MSDLayer(size_t max_batch_size, -+ size_t max_src_seq_len, -+ size_t max_tgt_seq_len, -+ size_t head_num, -+ size_t size_per_head, -+ size_t ffn_hidden_size, -+ float eps1, -+ float eps2, -+ float eps3, -+ bool post_layernorm, -+ bool position_bias1, -+ bool position_bias2, -+ cudaStream_t stream, -+ cublasMMWrapper* cublas_wrapper, -+ cublasHandle_t* cublas_handle, -+ IAllocator* allocator, -+ bool is_free_buffer_after_forward, -+ bool is_qk_buf_float, -+ bool sparse); -+ -+ MSDLayer(MSDLayer const& decoder_layer); -+ -+ virtual ~MSDLayer(); -+ -+ void forward(std::vector* output_tensors, -+ const std::vector* input_tensors, -+ const DecoderLayerWeight* decoder_weights) override; -+}; -+ -+} // namespace fastertransformer +@@ -0,0 +1,74 @@ ++// /* ++// * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. ++// * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. ++// * ++// * Licensed under the Apache License, Version 2.0 (the "License"); ++// * you may not use this file except in compliance with the License. 
++// * You may obtain a copy of the License at ++// * ++// * http://www.apache.org/licenses/LICENSE-2.0 ++// * ++// * Unless required by applicable law or agreed to in writing, software ++// * distributed under the License is distributed on an "AS IS" BASIS, ++// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++// * See the License for the specific language governing permissions and ++// * limitations under the License. ++// */ ++ ++// #pragma once ++ ++// #include "src/fastertransformer/layers/decoder_layers/BaseDecoderLayer.h" ++// #include "src/fastertransformer/layers/decoder_layers/decoder.h" ++ ++// namespace fastertransformer { ++ ++// // TODO(haim): Add template according to "mix" compute type (fp32, fp16) ++// template ++// class MSDLayer: public BaseDecoderLayer { ++// private: ++// mutable decoderParamT params_; ++ ++// void allocateBuffer() override; ++// void freeBuffer() override; ++// void* buf_; ++// using BaseDecoderLayer::is_free_buffer_after_forward_; ++// using BaseDecoderLayer::is_allocate_buffer_; ++// using BaseDecoderLayer::cublas_wrapper_; ++// using BaseDecoderLayer::allocator_; ++ ++// protected: ++// using BaseDecoderLayer::stream_; ++// using BaseDecoderLayer::sparse_; ++ ++// public: ++// MSDLayer(size_t max_batch_size, ++// size_t max_src_seq_len, ++// size_t max_tgt_seq_len, ++// size_t head_num, ++// size_t size_per_head, ++// size_t ffn_hidden_size, ++// float eps1, ++// float eps2, ++// float eps3, ++// bool post_layernorm, ++// bool position_bias1, ++// bool position_bias2, ++// bool is_ffn_fp16, ++// cudaStream_t stream, ++// cublasMMWrapper* cublas_wrapper, ++// cublasHandle_t* cublas_handle, ++// IAllocator* allocator, ++// bool is_free_buffer_after_forward, ++// bool is_qk_buf_float, ++// bool sparse); ++ ++// MSDLayer(MSDLayer const& decoder_layer); ++ ++// virtual ~MSDLayer(); ++ ++// void forward(std::vector* output_tensors, ++// const std::vector* input_tensors, ++// const DecoderLayerWeight* decoder_weights) override; ++// }; ++ ++// } // namespace fastertransformer diff --git a/src/fastertransformer/layers/decoder_layers/decoder.cc b/src/fastertransformer/layers/decoder_layers/decoder.cc new file mode 100644 -index 0000000..9b20c74 +index 0000000..4d65ec8 --- /dev/null +++ b/src/fastertransformer/layers/decoder_layers/decoder.cc -@@ -0,0 +1,543 @@ +@@ -0,0 +1,506 @@ + +#include "src/fastertransformer/layers/decoder_layers/decoder.h" +#include "src/fastertransformer/kernels/activation_kernels.h" @@ -6501,7 +6515,7 @@ index 0000000..9b20c74 +template size_t GetDecoderLayerWorkspaceSize(decoderParamT* param); + +template -+void forward_ffn(T* inputs[], int in_len, T* output[], int out_len, decoderParamT* param, void* ws) ++void forward_ffn(T* inputs[], int in_len, T* output[], int out_len, ParamT* param, void* ws) +{ + size_t inter_size = param->ffn_hidden_size; + size_t h_token_num = param->batch_size * param->src_seq_len; @@ -6556,22 +6570,17 @@ index 0000000..9b20c74 +template +void forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, decoderParamT* param, void* ws) +{ -+ std::cout<<"param->layernorm_post"<layernorm_post<in_idx = 0; + size_t h_token_num = param->batch_size * param->src_seq_len; + T* from_tensor = reinterpret_cast(inputs[param->in_idx++]); + T* attn_out = reinterpret_cast(ws); + T* normed_from_tensor = reinterpret_cast(ws) + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; -+ T* normed_attn_out = (param->layernorm_post) ? 
normed_from_tensor + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE * 2 : normed_from_tensor; -+ T* attn_ws_offset = (param->layernorm_post) ? reinterpret_cast(normed_attn_out) : reinterpret_cast(normed_from_tensor); -+ T* attn_ws = attn_ws_offset + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; -+ ++ T* attn_ws = reinterpret_cast(normed_from_tensor) + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ T* normed_attn_out = normed_from_tensor; + T* attn2_out = reinterpret_cast(attn_ws) + GetAttnWorkspaceSize(param); + T* normed_from_tensor2 = reinterpret_cast(attn2_out) + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; -+ T* normed_attn2_out = (param->layernorm_post) ? normed_from_tensor2 + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE * 2 : normed_from_tensor2; -+ T* attn2_ws_offset = (param->layernorm_post) ? reinterpret_cast(normed_attn2_out) : reinterpret_cast(normed_from_tensor2); -+ T* attn2_ws = attn2_ws_offset + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; -+ ++ T* attn2_ws = reinterpret_cast(normed_from_tensor2) + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ T* normed_attn2_out = normed_from_tensor2; + T* ffn_ws = normed_attn2_out + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; + T* tmp_out = reinterpret_cast(output[0]); + if (std::is_same::value && param->ffn_fp16==true) { @@ -6590,16 +6599,14 @@ index 0000000..9b20c74 + inputs[--param->in_idx] = normed_from_tensor; + // if attention is embedded inside an decoder - fuse the bias to next layer normalization + int in_idx = param->in_idx; -+ std::cout<<"1"<(&inputs[param->in_idx]), in_len, &attn_out, 1, &(param->attn1), attn_ws); -+ std::cout<<"2"<in_idx = param->attn1.in_idx + in_idx; + if (param->projection_bias) { + T* projection_bias = reinterpret_cast(inputs[param->in_idx++]); + T* gamma2 = reinterpret_cast(inputs[param->in_idx++]); + T* beta2 = reinterpret_cast(inputs[param->in_idx++]); -+ if (param->layernorm_post == false) { -+ invokeGeneralAddBiasResidualPreLayerNorm(attn_out, ++ from_tensor = param->layernorm_post ? 
normed_from_tensor : from_tensor; ++ invokeGeneralAddBiasResidualPreLayerNorm(attn_out, + normed_attn_out, + from_tensor, + gamma2, // gamma @@ -6609,27 +6616,12 @@ index 0000000..9b20c74 + param->hidden_size, + param->stream, + param->eps2); -+ std::cout<<"3"<hidden_size, -+ param->stream, -+ param->eps2); -+ } -+ } -+ else { ++ } else { + // without projection bias + } + inputs[--param->in_idx] = normed_attn_out; + in_idx = param->in_idx; + forward_attn(reinterpret_cast(&inputs[param->in_idx]), in_len, &attn2_out, 1, &(param->attn2), attn2_ws); -+ std::cout<<"4"<in_idx = param->attn2.in_idx + in_idx; + if (param->projection_bias) { + T* projection_bias = reinterpret_cast(inputs[param->in_idx++]); @@ -6646,7 +6638,6 @@ index 0000000..9b20c74 + param->hidden_size, + param->stream, + param->eps3); -+ std::cout<<"5"<(attn2_out, @@ -6658,58 +6649,44 @@ index 0000000..9b20c74 + h_token_num, + param->hidden_size, + param->stream, -+ param->eps3); -+ -+ } -+ } -+ else { ++ param->eps3); ++ } ++ } else { + // without projection bias + } + inputs[--param->in_idx] = normed_attn2_out; + if (param->ffn_fp16==false) { + forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, param, ffn_ws); -+ std::cout<<"6"<(reinterpret_cast(inputs), in_len, &tmp_out, 1, param, ffn_ws); + } -+ if (param->layernorm_post == true) { -+ if (std::is_same::value || param->ffn_fp16==false) { -+ invokeAddBiasResidual(reinterpret_cast(tmp_out), -+ normed_attn2_out, -+ reinterpret_cast(inputs[param->in_idx++]), // FFN bias -+ h_token_num, -+ param->hidden_size, -+ param->stream); -+ } -+ else { -+ invokeAddBiasResidualCast(reinterpret_cast(tmp_out), -+ reinterpret_cast(attn2_out), -+ reinterpret_cast(output[0]), -+ reinterpret_cast(inputs[param->in_idx++]), // FFN bias -+ h_token_num, -+ param->hidden_size, -+ param->stream); -+ } ++ attn2_out = param->layernorm_post ? 
normed_attn2_out : attn2_out; ++ if (std::is_same::value || param->ffn_fp16==false) { ++ invokeAddBiasResidual(reinterpret_cast(tmp_out), ++ attn2_out, ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias ++ h_token_num, ++ param->hidden_size, ++ param->stream); + } else { -+ if (std::is_same::value || param->ffn_fp16==false) { -+ invokeAddBiasResidual(reinterpret_cast(tmp_out), -+ attn2_out, -+ reinterpret_cast(inputs[param->in_idx++]), // FFN bias -+ h_token_num, -+ param->hidden_size, -+ param->stream); -+ } -+ else { ++ if(param->layernorm_post){ ++ invokeAddBiasResidualSameTypeCast(reinterpret_cast(tmp_out), ++ reinterpret_cast(attn2_out), ++ reinterpret_cast(output[0]), ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias ++ h_token_num, ++ param->hidden_size, ++ param->stream); ++ } else{ + invokeAddBiasResidualCast(reinterpret_cast(tmp_out), -+ reinterpret_cast(attn2_out), -+ reinterpret_cast(output[0]), -+ reinterpret_cast(inputs[param->in_idx++]), // FFN bias -+ h_token_num, -+ param->hidden_size, -+ param->stream); ++ reinterpret_cast(attn2_out), ++ reinterpret_cast(output[0]), ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias ++ h_token_num, ++ param->hidden_size, ++ param->stream); + } + } -+ + return; +} + @@ -6945,18 +6922,18 @@ index 0000000..9b20c74 +forward_attn(half* inputs[], int in_len, half* output[], int out_len, attentionParamT* param, void* ws); + +template void -+forward_ffn(float* inputs[], int in_len, float* output[], int out_len, decoderParamT* param, void* ws); ++forward_ffn(float* inputs[], int in_len, float* output[], int out_len, ParamT* param, void* ws); +template void -+forward_ffn(half* inputs[], int in_len, half* output[], int out_len, decoderParamT* param, void* ws); ++forward_ffn(half* inputs[], int in_len, half* output[], int out_len, ParamT* param, void* ws); +template void -+forward_ffn(float* inputs[], int in_len, float* output[], int out_len, decoderParamT* param, void* ws); ++forward_ffn(float* inputs[], int in_len, float* output[], int out_len, ParamT* param, void* ws); +} // namespace fastertransformer diff --git a/src/fastertransformer/layers/decoder_layers/decoder.h b/src/fastertransformer/layers/decoder_layers/decoder.h new file mode 100644 -index 0000000..750b9b0 +index 0000000..c302ea8 --- /dev/null +++ b/src/fastertransformer/layers/decoder_layers/decoder.h -@@ -0,0 +1,70 @@ +@@ -0,0 +1,112 @@ +#pragma once + +#include "src/fastertransformer/kernels/activation_kernels.h" @@ -6966,64 +6943,106 @@ index 0000000..750b9b0 + +namespace fastertransformer { + -+ -+typedef struct { -+ size_t batch_size; -+ size_t src_seq_len; -+ size_t tgt_seq_len; -+ size_t head_num; -+ size_t head_size; -+ size_t hidden_size; -+ size_t h_token_num; -+ // handle -+ cublasHandle_t cublas_handle; -+ cudaStream_t stream; -+ cublasGemmAlgo_t algo; -+ // ctrls -+ int in_idx; -+ bool qkv_bias; // ture -+ bool projection_bias; // ture -+ bool is_cross; // false -+ bool position_bias; -+ int *padding_offset; -+} attentionParamT; -+ -+typedef struct { -+ size_t batch_size; -+ size_t src_seq_len; -+ size_t tgt_seq_len; -+ size_t head_num; -+ size_t head_size; -+ size_t hidden_size; -+ size_t h_token_num; -+ size_t ffn_hidden_size; // 4 * param->hidden_size; -+ bool ffn_fp16; -+ float eps1; -+ float eps2; -+ float eps3; -+ // handle -+ cublasHandle_t cublas_handle; -+ cudaStream_t stream; -+ cublasGemmAlgo_t algo; -+ // ctrls -+ bool projection_bias; // ture -+ -+ int in_idx; -+ mutable attentionParamT attn1; -+ mutable attentionParamT attn2; -+ bool 
layernorm_post; -+ int *padding_offset; -+} decoderParamT; -+ -+template -+size_t GetDecoderLayerWorkspaceSize(decoderParamT* param); -+ -+template -+size_t GetAttnWorkspaceSize(decoderParamT* param); -+template -+void forward_attn(T* inputs[], int in_len, T* output[], int out_len, decoderParamT* param, void* ws); -+template -+void forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, decoderParamT* param, void* ws); ++// typedef struct { ++// size_t batch_size; ++// size_t src_seq_len; ++// size_t tgt_seq_len; ++// size_t head_num; ++// size_t head_size; ++// size_t hidden_size; ++// size_t h_token_num; ++// // handle ++// cublasHandle_t cublas_handle; ++// cudaStream_t stream; ++// cublasGemmAlgo_t algo; ++// // ctrls ++// int in_idx; ++// bool qkv_bias; // ture ++// bool projection_bias; // ture ++// bool is_cross; // false ++// bool position_bias; ++// int *padding_offset; ++// } attentionParamT; ++ ++// typedef struct { ++// size_t batch_size; ++// size_t src_seq_len; ++// size_t tgt_seq_len; ++// size_t head_num; ++// size_t head_size; ++// size_t hidden_size; ++// size_t h_token_num; ++// size_t ffn_hidden_size; // 4 * param->hidden_size; ++// bool ffn_fp16; ++// float eps1; ++// float eps2; ++// float eps3; ++// // handle ++// cublasHandle_t cublas_handle; ++// cudaStream_t stream; ++// cublasGemmAlgo_t algo; ++// // ctrls ++// bool projection_bias; // ture ++ ++// int in_idx; ++// mutable attentionParamT attn1; ++// mutable attentionParamT attn2; ++// bool layernorm_post; ++// int *padding_offset; ++// } decoderParamT; ++// typedef struct{ ++// public: ++// size_t batch_size; ++// size_t src_seq_len; ++// size_t tgt_seq_len; ++// size_t head_num; ++// size_t head_size; ++// size_t hidden_size; ++// size_t h_token_num; ++// size_t ffn_hidden_size; ++// // handle ++// cublasHandle_t cublas_handle; ++// cudaStream_t stream; ++// cublasGemmAlgo_t algo; ++// // ctrls ++// int *padding_offset; ++// int in_idx; ++ ++// } ParamT; ++ ++// typedef struct : ParamT{ ++ ++// // ctrls ++// bool qkv_bias; // ture ++// bool projection_bias; // ture ++// bool is_cross; // false ++// bool position_bias; ++// int *padding_offset; ++// } attentionParamT; ++ ++// typedef struct : ParamT{ ++ ++// bool ffn_fp16; ++// float eps1; ++// float eps2; ++// float eps3; ++ ++// bool projection_bias; // ture ++ ++// mutable attentionParamT attn1; ++// mutable attentionParamT attn2; ++// bool layernorm_post; ++// int *padding_offset; ++// } decoderParamT; ++// template ++// size_t GetDecoderLayerWorkspaceSize(decoderParamT* param); ++ ++// template ++// size_t GetAttnWorkspaceSize(decoderParamT* param); ++// template ++// void forward_attn(T* inputs[], int in_len, T* output[], int out_len, attentionParamT* param, void* ws); ++// template ++// void forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, decoderParamT* param, void* ws); +// void forwardDecoder(std::vector > const* +// inputs); +} // namespace fastertransformer @@ -7424,10 +7443,10 @@ index 0000000..af2a82c +} // namespace fastertransformer diff --git a/src/fastertransformer/layers/encoder_layers/encoder.cc b/src/fastertransformer/layers/encoder_layers/encoder.cc new file mode 100644 -index 0000000..f33eeb2 +index 0000000..45d4dde --- /dev/null +++ b/src/fastertransformer/layers/encoder_layers/encoder.cc -@@ -0,0 +1,647 @@ +@@ -0,0 +1,646 @@ + +#include "src/fastertransformer/layers/encoder_layers/encoder.h" +#include "src/fastertransformer/kernels/activation_kernels.h" @@ -7681,7 +7700,6 @@ index 0000000..f33eeb2 
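+// forwardEncoder: fused encoder-layer forward pass over a single workspace buffer.
+// The caller flattens all operands (activations, layernorm gamma/beta, attention and
+// FFN weights/biases) into `inputs` and pre-allocates one scratch buffer `ws` sized by
+// GetEncoderLayerWorkspaceSize(); param->in_idx walks that pointer array while the
+// function slices `ws` (UP_DIV/ALIGN_SIZE aligned) into the attention output, the
+// normalized tensors and the FFN scratch, runs attention followed by the feed-forward
+// block, and writes the residual-added result to output[0].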
+void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, encoderParamT* param, void* ws) +{ + param->in_idx = 0; -+ std::cout<<"fp16"<ffn_fp16<batch_size * param->src_seq_len; + T* from_tensor = reinterpret_cast(inputs[param->in_idx++]); + T* attn_out = reinterpret_cast(ws); @@ -8129,6 +8147,2132 @@ index 0000000..0caaed1 +// void forwardEncoder(std::vector > const* +// inputs); +} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/BaseMSLayer.h b/src/fastertransformer/layers/ms_layers/BaseMSLayer.h +new file mode 100644 +index 0000000..1cf472d +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/BaseMSLayer.h +@@ -0,0 +1,76 @@ ++/* ++ * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#pragma once ++ ++#include ++#include ++ ++#include "3rdparty/trt_fused_multihead_attention/fused_multihead_attention_common.h" ++#include "src/fastertransformer/layers/BaseLayer.h" ++#include "src/fastertransformer/utils/Tensor.h" ++#include "src/fastertransformer/utils/allocator.h" ++#include "src/fastertransformer/utils/cublasMMWrapper.h" ++#include "src/fastertransformer/utils/memory_utils.h" ++#include "src/fastertransformer/layers/ms_layers/MSLayerWeight.h" ++ ++namespace fastertransformer { ++ ++enum class MSLayerType { ++ UNFUSED_MS_LAYER, ++ FUSED_MS_LAYER ++}; ++ ++template ++MSLayerType getMSLayerType(size_t size_per_head, const int sm, const bool remove_padding, ++ const int max_seq_len, const bool is_fuse = true) { ++ if (std::is_same::value && (sm == kSM_70 || sm == kSM_86 || sm == kSM_80 || sm == kSM_75 || sm == kSM_72) ++ && size_per_head == 64 && max_seq_len <= 384 && is_fuse == true) { ++ return remove_padding ? MSLayerType::FUSED_MS_LAYER : MSLayerType::FUSED_MS_LAYER; ++ } else { ++ return remove_padding ? MSLayerType::FUSED_MS_LAYER : MSLayerType::FUSED_MS_LAYER; ++ } ++} ++ ++template ++MSLayerType getMSLayerTypeINT8(size_t size_per_head, const int sm, const bool remove_padding, ++ const int max_seq_len, const int int8_mode) { ++ if ((int8_mode == 1 || int8_mode == 2) && (sm == kSM_86 || sm == kSM_80 || sm == kSM_75) && size_per_head == 64 ++ && max_seq_len <= 384) { ++ return remove_padding ? MSLayerType::FUSED_MS_LAYER : MSLayerType::FUSED_MS_LAYER; ++ } else { ++ return remove_padding ? 
MSLayerType::FUSED_MS_LAYER : MSLayerType::FUSED_MS_LAYER; ++ } ++} ++ ++template ++class BaseMSLayer: public BaseLayer { ++ ++public: ++ virtual void forward(std::vector* output_tensors, ++ const std::vector* input_tensors, ++ const MSLayerWeight* encoder_layer_weights) = 0; ++ BaseMSLayer(cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ IAllocator* allocator, ++ bool is_free_buffer_after_forward, ++ bool sparse = false): ++ BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, nullptr, sparse) ++ { ++ } ++ virtual ~BaseMSLayer() = default; ++}; ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/CMakeLists.txt b/src/fastertransformer/layers/ms_layers/CMakeLists.txt +new file mode 100644 +index 0000000..36abaf8 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/CMakeLists.txt +@@ -0,0 +1,21 @@ ++# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++ ++cmake_minimum_required(VERSION 3.8) ++ ++add_library(MSLayer STATIC MSDecoderLayer.cc MSEncoderLayer.cc MSAttentionLayer.cc decoder.cc encoder.cc ffn.cc gemm.cc attention.cc) ++set_property(TARGET MSLayer PROPERTY POSITION_INDEPENDENT_CODE ON) ++set_property(TARGET MSLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) ++target_link_libraries(MSLayer PUBLIC -lcublas -lcudart unfused_attention_kernels activation_kernels ++ layernorm_kernels add_residual_kernels bert_preprocess_kernels) +diff --git a/src/fastertransformer/layers/ms_layers/MSAttentionLayer.cc b/src/fastertransformer/layers/ms_layers/MSAttentionLayer.cc +new file mode 100755 +index 0000000..da9a1b8 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/MSAttentionLayer.cc +@@ -0,0 +1,171 @@ ++/* ++ * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. ++ * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#include "src/fastertransformer/layers/ms_layers/MSAttentionLayer.h" ++ ++namespace fastertransformer { ++ ++template ++static void printTensor(char* str, T* input, int size) ++{ ++ printf("%s ", str); ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ std::cout << input_host[k] << ","; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ } ++ ++ std::cout << std::endl; ++ ++ free(input_host); ++} ++ ++template ++MSMHALayer::MSMHALayer(size_t max_batch_size, ++ size_t max_src_seq_len, ++ size_t max_tgt_seq_len, ++ size_t head_num, ++ size_t size_per_head, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ IAllocator* allocator, ++ bool is_free_buffer_after_forward, ++ bool is_qk_buf_float, ++ bool is_cross, ++ bool sparse, ++ bool is_position_bias): ++ BaseMSLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse) ++{ ++ cublasHandle_t cublas_handle; ++ cublasCreate(&cublas_handle); ++ cublasSetStream(cublas_handle, stream); ++ ++ params_.batch_size = max_batch_size; ++ params_.src_seq_len = max_src_seq_len; ++ params_.tgt_seq_len = max_tgt_seq_len; ++ params_.head_num = head_num; ++ params_.head_size = size_per_head; ++ params_.hidden_size = head_num * size_per_head; ++ params_.cublas_handle = cublas_handle; ++ params_.stream = stream; ++ // ctrls ++ params_.in_idx = 0; ++ params_.qkv_bias = !is_position_bias; ++ params_.projection_bias = !is_position_bias; ++ params_.is_cross = is_cross; ++ params_.position_bias = is_position_bias; ++ params_.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; ++} ++template ++void MSMHALayer::allocateBuffer() ++{ ++ if (buf_ == nullptr) { ++ size_t buff_size = fastertransformer::GetAttnWorkspaceSize(¶ms_); ++ buf_ = reinterpret_cast(allocator_->reMalloc(buf_, buff_size, true)); ++ } ++} ++template ++void MSMHALayer::forward(std::vector* output_tensors, ++ const std::vector* input_tensors, ++ const MSLayerWeight* weights) ++{ ++ const AttentionLayerWeight* attention_weights = dynamic_cast*>(weights); ++ if(attention_weights == NULL){ ++ std::cout<<"cast EncoderLayerWeight not sucsses"; ++ } ++ allocateBuffer(); // only once ++ if (params_.position_bias) ++ if (params_.is_cross) { ++ void* outputs[] = {(void*)output_tensors->at(0).data}; ++ void* inputs[] = {(void*)input_tensors->at(0).data, ++ (void*)input_tensors->at(1).data, ++ (void*)attention_weights->query_weight.kernel, ++ (void*)attention_weights->key_weight.kernel, ++ (void*)input_tensors->at(2).data, ++ (void*)input_tensors->at(3).data, ++ (void*)attention_weights->attention_output_weight.kernel}; ++ fastertransformer::forward_attn((T**)inputs, 7, (T**)outputs, 1, ¶ms_, (void*)buf_); ++ } ++ else { ++ void* outputs[] = {(void*)output_tensors->at(0).data}; ++ void* inputs[] = { ++ (void*)input_tensors->at(0).data, ++ (void*)attention_weights->query_weight.kernel, ++ (void*)input_tensors->at(1).data, ++ (void*)input_tensors->at(2).data, ++ (void*)attention_weights->attention_output_weight.kernel ++ }; ++ fastertransformer::forward_attn((T**)inputs, 5, (T**)outputs, 1, ¶ms_, (void*)buf_); ++ } ++ else { ++ if (params_.is_cross) { ++ void* outputs[] = {(void*)output_tensors->at(0).data}; ++ void* inputs[] = {(void*)input_tensors->at(0).data, ++ (void*)input_tensors->at(1).data, ++ (void*)attention_weights->query_weight.kernel, ++ (void*)attention_weights->key_weight.kernel, ++ (void*)attention_weights->query_weight.bias, ++ 
(void*)input_tensors->at(2).data, ++ (void*)attention_weights->attention_output_weight.kernel, ++ (void*)attention_weights->attention_output_weight.bias ++ }; ++ fastertransformer::forward_attn((T**)inputs, 8, (T**)outputs, 1, ¶ms_, (void*)buf_); ++ } ++ else { ++ void* outputs[] = {(void*)output_tensors->at(0).data}; ++ void* inputs[] = {(void*)input_tensors->at(0).data, ++ (void*)attention_weights->query_weight.kernel, ++ (void*)attention_weights->query_weight.bias, ++ (void*)input_tensors->at(1).data, ++ (void*)attention_weights->attention_output_weight.kernel, ++ (void*)attention_weights->attention_output_weight.bias}; ++ fastertransformer::forward_attn((T**)inputs, 6, (T**)outputs, 1, ¶ms_, (void*)buf_); ++ } ++ } ++} ++ ++ template ++ MSMHALayer::~MSMHALayer() ++ { ++ cublas_wrapper_ = nullptr; ++ freeBuffer(); ++ } ++ ++ template ++ void MSMHALayer::freeBuffer() ++ { ++ if (buf_ != nullptr) { ++ allocator_->free(buf_); ++ buf_ = nullptr; ++ } ++ } ++ ++ template class MSMHALayer; ++ template class MSMHALayer; ++ template class MSMHALayer; ++ template class MSMHALayer; ++ template class MSMHALayer; ++ template class MSMHALayer; ++ template class MSMHALayer; ++ template class MSMHALayer; ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/MSAttentionLayer.h b/src/fastertransformer/layers/ms_layers/MSAttentionLayer.h +new file mode 100755 +index 0000000..299ffb6 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/MSAttentionLayer.h +@@ -0,0 +1,63 @@ ++/* ++ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. ++ * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#pragma once ++ ++#include "src/fastertransformer/layers/ms_layers/BaseMSLayer.h" ++#include "src/fastertransformer/layers/ms_layers/attention.h" ++namespace fastertransformer { ++ ++// TODO(haim): Add template according to "mix" compute type (fp32, fp16) ++template ++class MSMHALayer: public BaseMSLayer { ++private: ++ void allocateBuffer() override; ++ void freeBuffer() override; ++ ++ using BaseMSLayer::is_free_buffer_after_forward_; ++ using BaseMSLayer::is_allocate_buffer_; ++ using BaseMSLayer::cublas_wrapper_; ++ using BaseMSLayer::allocator_; ++ ++protected: ++ using BaseMSLayer::stream_; ++ using BaseMSLayer::sparse_; ++ T* buf_ = nullptr; ++ attentionParamT params_; ++ ++public: ++ MSMHALayer(size_t batch_size, ++ size_t src_seq_len, ++ size_t tgt_seq_len, ++ size_t head_num, ++ size_t size_per_head, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ IAllocator* allocator, ++ bool is_free_buffer_after_forward, ++ bool is_qk_buf_float, ++ bool is_cross, ++ bool sparse = false, ++ bool is_position_bias=false); ++ MSMHALayer(MSMHALayer const& attention_layer); ++ virtual ~MSMHALayer(); ++ void forward(std::vector* output_tensors, ++ const std::vector* input_tensors, ++ const MSLayerWeight* weights) override; ++}; ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/MSDecoderLayer.cc b/src/fastertransformer/layers/ms_layers/MSDecoderLayer.cc +new file mode 100644 +index 0000000..f5f6815 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/MSDecoderLayer.cc +@@ -0,0 +1,192 @@ ++/* ++ * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. ++ * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#include "src/fastertransformer/layers/ms_layers/MSDecoderLayer.h" ++ ++namespace fastertransformer { ++template ++void printTensor(char* str, T* input, int size) ++{ ++ printf("%s ", str); ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ ++ std::cout << input_host[k] << ","; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ } ++ ++ std::cout << std::endl; ++ ++ free(input_host); ++} ++template ++MSDLayer::MSDLayer(size_t max_batch_size, ++ size_t max_src_seq_len, ++ size_t max_tgt_seq_len, ++ size_t head_num, ++ size_t size_per_head, ++ size_t ffn_hidden_size, ++ float eps1, ++ float eps2, ++ float eps3, ++ bool post_layernorm, ++ bool position_bias1, ++ bool position_bias2, ++ bool is_ffn_fp16, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ cublasHandle_t* cublas_handle, ++ IAllocator* allocator, ++ bool is_free_buffer_after_forward, ++ bool is_qk_buf_float, ++ bool sparse): ++ ++ BaseMSLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse), buf_(nullptr) ++{ ++ params_.batch_size = max_batch_size; ++ params_.src_seq_len = max_src_seq_len; ++ params_.tgt_seq_len = max_tgt_seq_len; ++ params_.head_num = head_num; ++ params_.head_size = size_per_head; ++ params_.hidden_size = head_num * size_per_head; ++ params_.ffn_hidden_size = ffn_hidden_size; ++ params_.eps1 = eps1; ++ params_.eps2 = eps2; ++ params_.eps3 = eps3; ++ params_.layernorm_post = post_layernorm; ++ // handle ++ params_.cublas_handle = *cublas_handle; ++ params_.stream = stream; ++ params_.ffn_fp16 = is_ffn_fp16; ++ // ctrls ++ params_.in_idx = 0; ++ params_.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; ++ params_.projection_bias = true; ++ ++ params_.attn1.in_idx = 0; ++ params_.attn1.batch_size = max_batch_size; ++ params_.attn1.src_seq_len = max_src_seq_len; ++ params_.attn1.tgt_seq_len = max_tgt_seq_len; ++ params_.attn1.head_num = head_num; ++ params_.attn1.head_size = size_per_head; ++ params_.attn1.hidden_size = head_num * size_per_head; ++ params_.attn1.qkv_bias = true; ++ params_.attn1.projection_bias = false; ++ params_.attn1.is_cross = false; ++ params_.attn1.position_bias = false; ++ params_.attn1.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; ++ params_.attn1.cublas_handle = *cublas_handle; ++ params_.attn1.stream = stream; ++ ++ params_.attn2.in_idx = 0; ++ params_.attn2.batch_size = max_batch_size; ++ params_.attn2.src_seq_len = max_src_seq_len; ++ params_.attn2.tgt_seq_len = max_tgt_seq_len; ++ params_.attn2.head_num = head_num; ++ params_.attn2.head_size = size_per_head; ++ params_.attn2.hidden_size = head_num * size_per_head; ++ params_.attn2.qkv_bias = true; ++ params_.attn2.projection_bias = false; ++ params_.attn2.is_cross = true; ++ params_.attn2.position_bias = false; ++ params_.attn2.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; ++ params_.attn2.cublas_handle = *cublas_handle; ++ params_.attn2.stream = stream; ++} ++ ++template ++void MSDLayer::allocateBuffer() ++{ ++ if (buf_ == nullptr) { ++ size_t buff_size = fastertransformer::GetDecoderLayerWorkspaceSize(¶ms_); ++ buf_ = reinterpret_cast(allocator_->reMalloc(buf_, buff_size, true)); ++ } ++} ++ ++template ++void MSDLayer::freeBuffer() ++{ ++ if (buf_ != nullptr) { ++ allocator_->free(buf_); ++ buf_ = nullptr; ++ } ++} ++ ++template ++MSDLayer::~MSDLayer() ++{ ++ cublas_wrapper_ = nullptr; ++ freeBuffer(); ++} ++ ++template ++void MSDLayer::forward(std::vector* output_tensors, ++ 
const std::vector* input_tensors, ++ const MSLayerWeight* weights) ++{ ++ const DecoderLayerWeight* decoder_weights = dynamic_cast*>(weights); ++ if(weights == NULL){ ++ std::cout<<"cast EncoderLayerWeight not sucsses"; ++ return ;} ++ allocateBuffer(); // only once ++ void* outputs[] = {(void*)output_tensors->at(0).data}; ++ // std::cout<qkv_bias<< params_.attn2->qkv_bias<< !params_.attn1->position_bias<< !params_.attn2->position_bias<at(0).data, ++ (void*)decoder_weights->layernorm1.gamma, ++ (void*)decoder_weights->layernorm1.beta, ++ (void*)decoder_weights->attention.query_weight.kernel, ++ (void*)decoder_weights->attention.query_weight.bias, ++ (void*)input_tensors->at(1).data, ++ (void*)decoder_weights->attention.attention_output_weight.kernel, ++ (void*)decoder_weights->attention.attention_output_weight.bias, ++ (void*)decoder_weights->layernorm2.gamma, ++ (void*)decoder_weights->layernorm2.beta, ++ (void*)input_tensors->at(2).data, ++ (void*)decoder_weights->cross_attention.query_weight.kernel, ++ (void*)decoder_weights->cross_attention.key_weight.kernel, ++ (void*)decoder_weights->cross_attention.query_weight.bias, ++ (void*)input_tensors->at(3).data, ++ (void*)decoder_weights->cross_attention.attention_output_weight.kernel, ++ (void*)decoder_weights->cross_attention.attention_output_weight.bias, ++ (void*)decoder_weights->layernorm3.gamma, ++ (void*)decoder_weights->layernorm3.beta, ++ (void*)decoder_weights->decoder_output_mapping.kernel, ++ (void*)decoder_weights->decoder_output_mapping.bias, ++ (void*)decoder_weights->decoder_output_projection.kernel, ++ (void*)decoder_weights->decoder_output_projection.bias}; ++ fastertransformer::forwardDecoder(inputs, 23, outputs, 1, ¶ms_, buf_); ++ } ++ else{} ++ return; ++} ++ ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/MSDecoderLayer.h b/src/fastertransformer/layers/ms_layers/MSDecoderLayer.h +new file mode 100644 +index 0000000..3a31bc4 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/MSDecoderLayer.h +@@ -0,0 +1,74 @@ ++/* ++ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. ++ * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#pragma once ++ ++#include "src/fastertransformer/layers/ms_layers/BaseMSLayer.h" ++#include "src/fastertransformer/layers/ms_layers/decoder.h" ++ ++namespace fastertransformer { ++ ++// TODO(haim): Add template according to "mix" compute type (fp32, fp16) ++template ++class MSDLayer: public BaseMSLayer { ++private: ++ mutable decoderParamT params_; ++ ++ void allocateBuffer() override; ++ void freeBuffer() override; ++ void* buf_; ++ using BaseMSLayer::is_free_buffer_after_forward_; ++ using BaseMSLayer::is_allocate_buffer_; ++ using BaseMSLayer::cublas_wrapper_; ++ using BaseMSLayer::allocator_; ++ ++protected: ++ using BaseMSLayer::stream_; ++ using BaseMSLayer::sparse_; ++ ++public: ++ MSDLayer(size_t max_batch_size, ++ size_t max_src_seq_len, ++ size_t max_tgt_seq_len, ++ size_t head_num, ++ size_t size_per_head, ++ size_t ffn_hidden_size, ++ float eps1, ++ float eps2, ++ float eps3, ++ bool post_layernorm, ++ bool position_bias1, ++ bool position_bias2, ++ bool is_ffn_fp16, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ cublasHandle_t* cublas_handle, ++ IAllocator* allocator, ++ bool is_free_buffer_after_forward, ++ bool is_qk_buf_float, ++ bool sparse); ++ ++ MSDLayer(MSDLayer const& decoder_layer); ++ ++ virtual ~MSDLayer(); ++ ++ void forward(std::vector* output_tensors, ++ const std::vector* input_tensors, ++ const MSLayerWeight* weights) override; ++}; ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/MSEncoderLayer.cc b/src/fastertransformer/layers/ms_layers/MSEncoderLayer.cc +new file mode 100644 +index 0000000..e5275d3 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/MSEncoderLayer.cc +@@ -0,0 +1,180 @@ ++/* ++ * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. ++ * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#include "src/fastertransformer/layers/ms_layers/MSEncoderLayer.h" ++ ++namespace fastertransformer { ++template ++void printTensor(char* str, T* input, int size) ++{ ++ printf("%s ", str); ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ ++ std::cout << input_host[k] << ","; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ } ++ ++ std::cout << std::endl; ++ ++ free(input_host); ++} ++template ++MSELayer::MSELayer(size_t max_batch_size, ++ size_t max_src_seq_len, ++ size_t max_tgt_seq_len, ++ size_t head_num, ++ size_t size_per_head, ++ size_t ffn_hidden_size, ++ float eps1, ++ float eps2, ++ bool post_layernorm, ++ bool is_ffn_fp16, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ cublasHandle_t* cublas_handle, ++ IAllocator* allocator, ++ bool is_free_buffer_after_forward, ++ bool is_qk_buf_float, ++ bool sparse): ++ ++ BaseMSLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse), buf_(nullptr) ++{ ++ params_.batch_size = max_batch_size; ++ params_.src_seq_len = max_src_seq_len; ++ params_.tgt_seq_len = max_tgt_seq_len; ++ params_.head_num = head_num; ++ params_.head_size = size_per_head; ++ params_.hidden_size = head_num * size_per_head; ++ params_.ffn_hidden_size = ffn_hidden_size; ++ params_.eps1 = eps1; ++ params_.eps2 = eps2; ++ params_.layernorm_post = post_layernorm; ++ // handle ++ params_.cublas_handle = *cublas_handle; ++ params_.stream = stream; ++ params_.ffn_fp16 = is_ffn_fp16; ++ // ctrls ++ params_.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; ++ ++ params_.attn.in_idx = 0; ++ params_.attn.batch_size = max_batch_size; ++ params_.attn.src_seq_len = max_src_seq_len; ++ params_.attn.tgt_seq_len = max_tgt_seq_len; ++ params_.attn.head_num = head_num; ++ params_.attn.head_size = size_per_head; ++ params_.attn.hidden_size = head_num * size_per_head; ++ params_.attn.qkv_bias = true; ++ params_.attn.projection_bias = false; ++ params_.attn.is_cross = false; ++ params_.attn.position_bias = false; ++ params_.attn.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; ++ params_.attn.cublas_handle = *cublas_handle; ++ params_.attn.stream = stream; ++} ++ ++template ++void MSELayer::allocateBuffer() ++{ ++ if (buf_ == nullptr) { ++ size_t buff_size = fastertransformer::GetEncoderLayerWorkspaceSize(¶ms_); ++ buf_ = reinterpret_cast(allocator_->reMalloc(buf_, sizeof(T) * buff_size, true)); ++ } ++} ++ ++template ++void MSELayer::freeBuffer() ++{ ++ if (buf_ != nullptr) { ++ allocator_->free(buf_); ++ buf_ = nullptr; ++ } ++} ++ ++template ++MSELayer::~MSELayer() ++{ ++ cublas_wrapper_ = nullptr; ++ freeBuffer(); ++} ++ ++template ++void MSELayer::forward(std::vector* output_tensors, ++ const std::vector* input_tensors, ++ const MSLayerWeight* weights) ++{ ++ const EncoderLayerWeight* encoder_weights = dynamic_cast*>(weights); ++ // EncoderLayerWeight* encoder_weights = dynamic_cast*>(const_cast*>(weights)); ++ // const EncoderLayerWeight* encoder_weights = dynamic_cast*>(weights); ++ if(encoder_weights == NULL){ ++ std::cout<<"cast EncoderLayerWeight not sucsses"; ++ return ;} ++ allocateBuffer(); // only once ++ void* outputs[] = {(void*)output_tensors->at(0).data}; ++ if (!params_.layernorm_post) { ++ void* inputs[] = {(void*)input_tensors->at(0).data, ++ (void*)encoder_weights->layernorm1.gamma, ++ (void*)encoder_weights->layernorm1.beta, ++ (void*)encoder_weights->attention.query_weight.kernel, ++ 
(void*)encoder_weights->attention.query_weight.bias, ++ (void*)input_tensors->at(1).data, ++ (void*)encoder_weights->attention.attention_output_weight.kernel, ++ (void*)encoder_weights->attention.attention_output_weight.bias, ++ (void*)encoder_weights->layernorm2.gamma, ++ (void*)encoder_weights->layernorm2.beta, ++ (void*)encoder_weights->encoder_output_mapping.kernel, ++ (void*)encoder_weights->encoder_output_mapping.bias, ++ (void*)encoder_weights->encoder_output_projection.kernel, ++ (void*)encoder_weights->encoder_output_projection.bias}; ++ fastertransformer::forwardEncoder(inputs, 14, outputs, 1, ¶ms_, buf_); ++ } ++ else { ++ void* inputs[] = {(void*)input_tensors->at(0).data, ++ (void*)encoder_weights->attention.query_weight.kernel, ++ (void*)encoder_weights->attention.query_weight.bias, ++ (void*)input_tensors->at(1).data, ++ (void*)encoder_weights->attention.attention_output_weight.kernel, ++ (void*)encoder_weights->attention.attention_output_weight.bias, ++ (void*)encoder_weights->layernorm1.gamma, ++ (void*)encoder_weights->layernorm1.beta, ++ (void*)encoder_weights->encoder_output_mapping.kernel, ++ (void*)encoder_weights->encoder_output_mapping.bias, ++ (void*)encoder_weights->encoder_output_projection.kernel, ++ (void*)encoder_weights->encoder_output_projection.bias, ++ (void*)encoder_weights->layernorm2.gamma, ++ (void*)encoder_weights->layernorm2.beta}; ++ fastertransformer::forwardEncoder(inputs, 3, outputs, 1, ¶ms_, buf_); ++ } ++ ++ return; ++} ++ ++template class MSELayer; ++template class MSELayer; ++template class MSELayer; ++template class MSELayer; ++template class MSELayer; ++template class MSELayer; ++template class MSELayer; ++template class MSELayer; ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/MSEncoderLayer.h b/src/fastertransformer/layers/ms_layers/MSEncoderLayer.h +new file mode 100644 +index 0000000..df0a5ab +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/MSEncoderLayer.h +@@ -0,0 +1,70 @@ ++/* ++ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. ++ * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#pragma once ++ ++#include "src/fastertransformer/layers/ms_layers/BaseMSLayer.h" ++#include "src/fastertransformer/layers/ms_layers/encoder.h" ++ ++namespace fastertransformer { ++ ++// TODO(haim): Add template according to "mix" compute type (fp32, fp16) ++template ++class MSELayer: public BaseMSLayer { ++private: ++ encoderParamT params_; ++ void allocateBuffer() override; ++ void freeBuffer() override; ++ void* buf_; ++ using BaseMSLayer::is_free_buffer_after_forward_; ++ using BaseMSLayer::is_allocate_buffer_; ++ using BaseMSLayer::cublas_wrapper_; ++ using BaseMSLayer::allocator_; ++ ++protected: ++ using BaseMSLayer::stream_; ++ using BaseMSLayer::sparse_; ++ ++public: ++ MSELayer(size_t max_batch_size, ++ size_t max_src_seq_len, ++ size_t max_tgt_seq_len, ++ size_t head_num, ++ size_t size_per_head, ++ size_t ffn_hidden_size, ++ float eps1, ++ float eps2, ++ bool post_layernorm, ++ bool is_ffn_fp16, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ cublasHandle_t* cublas_handle, ++ IAllocator* allocator, ++ bool is_free_buffer_after_forward, ++ bool is_qk_buf_float, ++ bool sparse); ++ ++ MSELayer(MSELayer const& encoder_layer); ++ ++ virtual ~MSELayer(); ++ ++ void forward(std::vector* output_tensors, ++ const std::vector* input_tensors, ++ const MSLayerWeight* weights) override; ++}; ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/MSLayerWeight.h b/src/fastertransformer/layers/ms_layers/MSLayerWeight.h +new file mode 100644 +index 0000000..8915136 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/MSLayerWeight.h +@@ -0,0 +1,62 @@ ++/* ++ * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#pragma once ++ ++#include "src/fastertransformer/layers/DenseWeight.h" ++#include "src/fastertransformer/kernels/layernorm_kernels.h" ++namespace fastertransformer { ++ ++template ++ struct MSLayerWeight{ ++ virtual ~MSLayerWeight() {} ++}; ++template ++struct AttentionLayerWeight:MSLayerWeight{ ++ DenseWeight query_weight; ++ DenseWeight key_weight; ++ DenseWeight value_weight; ++ DenseWeight attention_output_weight; ++}; ++template ++struct DecoderLayerWeight:MSLayerWeight{ ++ AttentionLayerWeight attention; ++ AttentionLayerWeight cross_attention; ++ // DenseWeight attention_qkv_weight; ++ // DenseWeight attention_layer_output_weight; ++ // DenseWeight attention_cross_q_weight; ++ // DenseWeight attention_cross_kv_weight; ++ // DenseWeight attention_cross_layer_output_weight; ++ DenseWeight decoder_output_mapping; ++ DenseWeight decoder_output_projection; ++ LayerNormWeight layernorm1; ++ LayerNormWeight layernorm2; ++ LayerNormWeight layernorm3; ++}; ++ ++template ++struct EncoderLayerWeight:MSLayerWeight{ ++ AttentionLayerWeight attention; ++ // DenseWeight qkv_weight; ++ // DenseWeight attention_layer_output_weight; ++ DenseWeight encoder_output_mapping; ++ DenseWeight encoder_output_projection; ++ LayerNormWeight layernorm1; ++ LayerNormWeight layernorm2; ++}; ++ ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/attention.cc b/src/fastertransformer/layers/ms_layers/attention.cc +new file mode 100644 +index 0000000..01d3bf3 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/attention.cc +@@ -0,0 +1,305 @@ ++ ++#include "src/fastertransformer/layers/ms_layers/attention.h" ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/kernels/add_residual_kernels.h" ++#include "src/fastertransformer/kernels/unfused_attention_kernels.h" ++#include ++namespace fastertransformer { ++ ++#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) ++// #define UP_DIV(x, y) (x) ++#define ALIGN_SIZE 16 ++ ++template ++void printTensor(char* str, T* input, int size) { ++ printf("%s ",str); ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ ++ std::cout << input_host[k] << ","; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ } ++ ++ std::cout << std::endl; ++ ++ free(input_host); ++} ++ ++template ++void isNan(char* str, T* input, int size) ++{ ++ std::cout << str << " " << " size is " << size; ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ if (std::isnan((float)input_host[k]) || std ::isinf((float)input_host[k])) { ++ std::cout << "found NAN or INF"; ++ break; ++ } ++ } ++ ++ std::cout << std::endl; ++ free(input_host); ++} ++ ++ ++template ++size_t GetAttnWorkspaceSize(attentionParamT* param) ++{ ++ size_t size_q = UP_DIV((param->batch_size * param->src_seq_len * param->hidden_size), ALIGN_SIZE) * ALIGN_SIZE; ++ size_t size_k = UP_DIV((param->batch_size * param->tgt_seq_len * param->hidden_size), ALIGN_SIZE) * ALIGN_SIZE; ++ size_t size_v = size_k; ++ size_t qkv_len = size_q + size_k + size_v; ++ size_t q_buf_2_len = size_q; ++ size_t qk_buf_len = ++ UP_DIV(param->batch_size * param->head_num * param->src_seq_len * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t 
qkv_buf_2_len = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t qkv_buf_3_len = qkv_buf_2_len; ++ size_t attn_out_size = ++ UP_DIV(param->batch_size * param->head_num * param->head_size * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; ++ return (qkv_len + q_buf_2_len + qk_buf_len + qkv_buf_2_len + qkv_buf_3_len + 2 * attn_out_size) * sizeof(T); ++ ++} ++ ++template size_t GetAttnWorkspaceSize(attentionParamT* param); ++template size_t GetAttnWorkspaceSize(attentionParamT* param); ++ ++template ++void forward_attn(T* inputs[], int in_len, T* output[], int out_len, attentionParamT* param, void* ws) ++{ ++ param->in_idx = 0; ++ auto extra_tmp_size = ++ UP_DIV(param->batch_size * param->head_num * param->head_size * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t size_q = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t size_k = UP_DIV(param->batch_size * param->tgt_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t size_v = size_k; ++ ++ size_t qkv_len = size_q + size_k + size_v; ++ size_t q_buf_2_len = size_q; ++ size_t qk_buf_len = ++ UP_DIV(param->batch_size * param->head_num * param->src_seq_len * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t qkv_buf_2_len = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t qkv_buf_3_len = qkv_buf_2_len; ++ auto buff_size = qkv_len + q_buf_2_len + qk_buf_len + qkv_buf_2_len + qkv_buf_3_len; ++ T* qkv_buf = (T*)ws; ++ T* q_buf_2 = static_cast(qkv_buf) + qkv_len; ++ T* qk_buf = static_cast(q_buf_2) + q_buf_2_len; ++ T* qkv_buf_2 = static_cast(qk_buf) + qk_buf_len; ++ T* qkv_buf_3 = static_cast(qkv_buf_2) + qkv_buf_2_len; ++ T* output1 = static_cast(ws) + buff_size; ++ T* output2 = static_cast(output1) + extra_tmp_size; ++ int gemm_dims[] = { ++ 3 * (int)param->hidden_size, (int)param->batch_size * (int)param->src_seq_len, (int)param->hidden_size}; ++ int gemm_lds[] = {3 * (int)param->hidden_size, (int)param->hidden_size, 3 * (int)param->hidden_size}; ++ T* from_tensor = reinterpret_cast(inputs[param->in_idx++]); ++ cublasOperation_t gemm_ops[] = {CUBLAS_OP_N, CUBLAS_OP_N}; ++ cudaDataType gemm_data_types[] = {CUDA_R_32F, CUDA_R_32F, CUDA_R_32F}; ++ if (std::is_same::value) { ++ gemm_data_types[0] = CUDA_R_16F; ++ gemm_data_types[1] = CUDA_R_16F; ++ gemm_data_types[2] = CUDA_R_16F; ++ } ++ T alpha = 1.0f; ++ T beta = 0.0f; ++ ++ if (param->is_cross) { ++ gemm_dims[0] = param->hidden_size; ++ gemm_dims[1] = param->batch_size * param->src_seq_len; ++ gemm_dims[2] = param->hidden_size; ++ gemm_lds[0] = param->hidden_size; ++ gemm_lds[1] = param->hidden_size; ++ gemm_lds[2] = param->hidden_size; ++ T* encoder_output = reinterpret_cast(inputs[param->in_idx++]); ++ T* weight_q = reinterpret_cast(inputs[param->in_idx++]); ++ ++ fastertransformer::CublasGemmWrapper(weight_q, ++ from_tensor, ++ qkv_buf, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_data_types, ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++ gemm_dims[0] = 2 * param->hidden_size; ++ gemm_dims[1] = param->batch_size * param->tgt_seq_len; ++ gemm_lds[0] = 2 * param->hidden_size; ++ gemm_lds[2] = 2 * param->hidden_size; ++ T* weight_kv = reinterpret_cast(inputs[param->in_idx++]); ++ ++ fastertransformer::CublasGemmWrapper(weight_kv, ++ encoder_output, ++ qkv_buf + (param->batch_size * param->src_seq_len) * param->hidden_size, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_data_types, ++ 
&alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++ ++ T* bias_qkv = (param->qkv_bias) ? reinterpret_cast(inputs[param->in_idx++]) : nullptr; ++ invokeCrossAddFusedQKVBiasTranspose(q_buf_2, ++ output1, ++ output2, ++ qkv_buf, ++ bias_qkv, ++ param->batch_size, ++ param->src_seq_len, ++ param->tgt_seq_len, ++ param->head_num, ++ param->head_size, ++ param->stream); ++ } ++ else { ++ T* weight_qkv = reinterpret_cast(inputs[param->in_idx++]); ++ fastertransformer::CublasGemmWrapper(weight_qkv, ++ from_tensor, ++ qkv_buf, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ const_cast(gemm_data_types), ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++ T* bias_qkv = (param->qkv_bias) ? reinterpret_cast(inputs[param->in_idx++]) : nullptr; ++ fastertransformer::invokeAddFusedQKVBiasTranspose(static_cast(q_buf_2), ++ static_cast(output1), ++ static_cast(output2), ++ static_cast(qkv_buf), ++ bias_qkv, ++ param->batch_size, ++ param->src_seq_len, ++ param->head_num, ++ param->head_size, ++ 0, ++ param->stream); ++ } ++ gemm_ops[0] = CUBLAS_OP_T; ++ ++ gemm_lds[0] = param->head_size; ++ gemm_lds[1] = param->head_size; ++ gemm_lds[2] = param->tgt_seq_len; ++ ++ int gemm_strides[] = {(int)(param->tgt_seq_len * param->head_size), ++ (int)(param->src_seq_len * param->head_size), ++ (int)(param->src_seq_len * param->tgt_seq_len)}; ++ ++ gemm_dims[0] = param->tgt_seq_len; ++ gemm_dims[1] = param->src_seq_len; ++ gemm_dims[2] = param->head_size; ++ ++ fastertransformer::CublasGemmStridedBatchedWrapper(output1, ++ q_buf_2, ++ qk_buf, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_strides, ++ const_cast(gemm_data_types), ++ &alpha, ++ &beta, ++ param->batch_size * param->head_num, ++ param->cublas_handle, ++ param->algo); ++ ++ T* attention_mask = reinterpret_cast(inputs[param->in_idx++]); ++ T* position_bias = nullptr; ++ if (param->position_bias) { ++ position_bias = reinterpret_cast(inputs[param->in_idx++]); ++ } ++ T scalar = static_cast(1.0f / sqrtf(param->head_size * 1.0f)); ++ fastertransformer::invokeMixMaskedSoftMax(static_cast(qk_buf), ++ attention_mask, ++ position_bias, ++ param->batch_size, ++ param->src_seq_len, ++ param->tgt_seq_len, ++ param->head_num, ++ scalar, ++ param->stream); ++ ++ gemm_ops[0] = CUBLAS_OP_N; ++ gemm_ops[1] = CUBLAS_OP_N; ++ gemm_dims[0] = param->head_size; ++ gemm_dims[1] = param->src_seq_len; ++ gemm_dims[2] = param->tgt_seq_len; ++ ++ gemm_lds[0] = param->head_size; ++ gemm_lds[1] = param->tgt_seq_len; ++ gemm_lds[2] = param->head_size; ++ ++ gemm_strides[0] = param->tgt_seq_len * param->head_size; ++ gemm_strides[1] = param->src_seq_len * param->tgt_seq_len; ++ gemm_strides[2] = param->src_seq_len * param->head_size; ++ fastertransformer::CublasGemmStridedBatchedWrapper(output2, ++ qk_buf, ++ qkv_buf_2, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_strides, ++ const_cast(gemm_data_types), ++ &alpha, ++ &beta, ++ param->batch_size * param->head_num, ++ param->cublas_handle, ++ param->algo); ++ ++ invokeTransposeQKV(static_cast(qkv_buf_3), ++ static_cast(qkv_buf_2), ++ param->batch_size, ++ param->src_seq_len, ++ param->head_num, ++ param->head_size, ++ param->stream); ++ gemm_ops[0] = CUBLAS_OP_N; ++ gemm_ops[1] = CUBLAS_OP_N; ++ gemm_dims[0] = param->hidden_size; ++ gemm_dims[1] = param->batch_size * param->src_seq_len; ++ gemm_dims[2] = param->hidden_size; ++ ++ gemm_lds[0] = param->hidden_size; ++ gemm_lds[1] = param->hidden_size; ++ gemm_lds[2] = param->hidden_size; ++ 
fastertransformer::CublasGemmWrapper(reinterpret_cast(inputs[param->in_idx++]), ++ qkv_buf_3, ++ static_cast(output[0]), ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ const_cast(gemm_data_types), ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++ ++ if (param->projection_bias) { ++ int len = param->batch_size * param->src_seq_len; ++ invokeAddBias( ++ static_cast(output[0]), (const T*)(inputs[param->in_idx++]), len, param->hidden_size, param->stream); ++ } ++ return; ++} ++ ++template void ++forward_attn(float* inputs[], int in_len, float* output[], int out_len, attentionParamT* param, void* ws); ++template void ++forward_attn(half* inputs[], int in_len, half* output[], int out_len, attentionParamT* param, void* ws); ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/attention.h b/src/fastertransformer/layers/ms_layers/attention.h +new file mode 100644 +index 0000000..36ac74b +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/attention.h +@@ -0,0 +1,19 @@ ++#pragma once ++ ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/layers/ms_layers/BaseMSLayer.h" ++#include "src/fastertransformer/layers/ms_layers/param.h" ++#include "src/fastertransformer/layers/ms_layers/gemm.h" ++ ++#include ++#include ++ ++namespace fastertransformer { ++ ++ ++template ++size_t GetAttnWorkspaceSize(attentionParamT* param); ++ ++template ++void forward_attn(T* inputs[], int in_len, T* output[], int out_len, attentionParamT* param, void* ws); ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/decoder.cc b/src/fastertransformer/layers/ms_layers/decoder.cc +new file mode 100644 +index 0000000..8db6035 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/decoder.cc +@@ -0,0 +1,202 @@ ++ ++#include "src/fastertransformer/layers/decoder_layers/decoder.h" ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/kernels/add_residual_kernels.h" ++#include "src/fastertransformer/kernels/layernorm_kernels.h" ++#include "src/fastertransformer/kernels/unfused_attention_kernels.h" ++#include "src/fastertransformer/layers/ms_layers/attention.h" ++#include "src/fastertransformer/layers/ms_layers/ffn.h" ++ ++#include ++namespace fastertransformer { ++ ++#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) ++// #define UP_DIV(x, y) (x) ++#define ALIGN_SIZE 16 ++ ++template ++void printTensor(char* str, T* input, int size) { ++ printf("%s ",str); ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ ++ std::cout << input_host[k] << ","; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ } ++ ++ std::cout << std::endl; ++ ++ free(input_host); ++} ++ ++template ++void isNan(char* str, T* input, int size) ++{ ++ std::cout << str << " " << " size is " << size; ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ if (std::isnan((float)input_host[k]) || std ::isinf((float)input_host[k])) { ++ std::cout << "found NAN or INF"; ++ break; ++ } ++ } ++ ++ std::cout << std::endl; ++ free(input_host); ++} ++ ++template ++size_t GetDecoderLayerWorkspaceSize(decoderParamT* param) ++{ ++ size_t attn_out = UP_DIV(param->batch_size * 
param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE;; ++ size_t attn2_out = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE;; ++ ++ size_t ffn = UP_DIV(param->batch_size * param->src_seq_len * param->ffn_hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t ffn_size = (param->layernorm_post) ? ffn : (attn_out + ffn); ++ size_t out_size = (param->layernorm_post) ? attn_out + attn2_out : attn_out * 2 + attn2_out * 2; ++ return (std::max(fastertransformer::GetAttnWorkspaceSize(&(param->attn1)) * 2, ffn_size * sizeof(T)) + out_size * sizeof(T)); ++} ++ ++template size_t GetDecoderLayerWorkspaceSize(decoderParamT* param); ++template size_t GetDecoderLayerWorkspaceSize(decoderParamT* param); ++ ++template ++void forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, decoderParamT* param, void* ws) ++{ ++ param->in_idx = 0; ++ size_t h_token_num = param->batch_size * param->src_seq_len; ++ T* from_tensor = reinterpret_cast(inputs[param->in_idx++]); ++ T* attn_out = reinterpret_cast(ws); ++ T* normed_from_tensor = reinterpret_cast(ws) + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ T* attn_ws = reinterpret_cast(normed_from_tensor) + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ T* normed_attn_out = normed_from_tensor; ++ T* attn2_out = reinterpret_cast(attn_ws) + fastertransformer::GetAttnWorkspaceSize(&(param->attn1)); ++ T* normed_from_tensor2 = reinterpret_cast(attn2_out) + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ T* attn2_ws = reinterpret_cast(normed_from_tensor2) + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ T* normed_attn2_out = normed_from_tensor2; ++ T* ffn_ws = attn2_ws + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ T* tmp_out = reinterpret_cast(output[0]); ++ if (std::is_same::value && param->ffn_fp16==true) { ++ tmp_out = ffn_ws + UP_DIV(param->batch_size * param->src_seq_len * param->ffn_hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ } ++ T* gamma1 = reinterpret_cast(inputs[param->in_idx++]); ++ T* beta1 = reinterpret_cast(inputs[param->in_idx++]); ++ invokeGeneralLayerNorm(normed_from_tensor, ++ reinterpret_cast(from_tensor), // from tensor ++ gamma1, // Gamma ++ beta1, // Beta ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps1); ++ inputs[--param->in_idx] = normed_from_tensor; ++ // if attention is embedded inside an decoder - fuse the bias to next layer normalization ++ int in_idx = param->in_idx; ++ fastertransformer::forward_attn(reinterpret_cast(&inputs[param->in_idx]), in_len, &attn_out, 1, &(param->attn1), attn_ws); ++ param->in_idx = param->attn1.in_idx + in_idx; ++ if (param->projection_bias) { ++ T* projection_bias = reinterpret_cast(inputs[param->in_idx++]); ++ T* gamma2 = reinterpret_cast(inputs[param->in_idx++]); ++ T* beta2 = reinterpret_cast(inputs[param->in_idx++]); ++ from_tensor = param->layernorm_post ? 
normed_from_tensor : from_tensor; ++ invokeGeneralAddBiasResidualPreLayerNorm(attn_out, ++ normed_attn_out, ++ from_tensor, ++ gamma2, // gamma ++ beta2, // beta ++ projection_bias, ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps2); ++ } else { ++ // without projection bias ++ } ++ inputs[--param->in_idx] = normed_attn_out; ++ in_idx = param->in_idx; ++ fastertransformer::forward_attn(reinterpret_cast(&inputs[param->in_idx]), in_len, &attn2_out, 1, &(param->attn2), attn2_ws); ++ param->in_idx = param->attn2.in_idx + in_idx; ++ if (param->projection_bias) { ++ T* projection_bias = reinterpret_cast(inputs[param->in_idx++]); ++ T* gamma3 = reinterpret_cast(inputs[param->in_idx++]); ++ T* beta3 = reinterpret_cast(inputs[param->in_idx++]); ++ if (std::is_same::value || param->ffn_fp16==false) { ++ invokeGeneralAddBiasResidualPreLayerNorm(attn2_out, ++ normed_attn2_out, ++ attn_out, ++ gamma3, // gamma ++ beta3, // beta ++ projection_bias, ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps3); ++ ++ } else { ++ invokeGeneralAddBiasResidualPreLayerNormCast(attn2_out, ++ reinterpret_cast(normed_attn2_out), ++ attn_out, ++ gamma3, // gamma ++ beta3, // beta ++ projection_bias, ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps3); ++ } ++ } else { ++ // without projection bias ++ } ++ inputs[--param->in_idx] = normed_attn2_out; ++ if (param->ffn_fp16==false) { ++ fastertransformer::forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, param, ffn_ws); ++ } else { ++ fastertransformer::forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, param, ffn_ws); ++ } ++ attn2_out = param->layernorm_post ? normed_attn2_out : attn2_out; ++ if (std::is_same::value || param->ffn_fp16==false) { ++ invokeAddBiasResidual(reinterpret_cast(tmp_out), ++ attn2_out, ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias ++ h_token_num, ++ param->hidden_size, ++ param->stream); ++ } else { ++ if(param->layernorm_post){ ++ invokeAddBiasResidualSameTypeCast(reinterpret_cast(tmp_out), ++ reinterpret_cast(attn2_out), ++ reinterpret_cast(output[0]), ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias ++ h_token_num, ++ param->hidden_size, ++ param->stream); ++ } else{ ++ invokeAddBiasResidualCast(reinterpret_cast(tmp_out), ++ reinterpret_cast(attn2_out), ++ reinterpret_cast(output[0]), ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias ++ h_token_num, ++ param->hidden_size, ++ param->stream); ++ } ++ } ++ return; ++} ++ ++template void ++forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, decoderParamT* param, void* ws); ++template void ++forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, decoderParamT* param, void* ws); ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/decoder.h b/src/fastertransformer/layers/ms_layers/decoder.h +new file mode 100644 +index 0000000..7c2ea9e +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/decoder.h +@@ -0,0 +1,17 @@ ++#pragma once ++ ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/layers/ms_layers/param.h" ++ ++#include "src/fastertransformer/layers/decoder_layers/BaseDecoderLayer.h" ++#include ++#include ++ ++namespace fastertransformer { ++ ++template ++size_t GetDecoderLayerWorkspaceSize(decoderParamT* param); ++ ++template ++void forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, decoderParamT* param, void* ws); ++} // namespace 
fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/encoder.cc b/src/fastertransformer/layers/ms_layers/encoder.cc +new file mode 100644 +index 0000000..466d09e +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/encoder.cc +@@ -0,0 +1,229 @@ ++ ++#include "src/fastertransformer/layers/ms_layers/encoder.h" ++#include "src/fastertransformer/layers/ms_layers/attention.h" ++#include "src/fastertransformer/layers/ms_layers/ffn.h" ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/kernels/add_residual_kernels.h" ++#include "src/fastertransformer/kernels/layernorm_kernels.h" ++#include "src/fastertransformer/kernels/unfused_attention_kernels.h" ++#include ++namespace fastertransformer { ++ ++#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) ++// #define UP_DIV(x, y) (x) ++#define ALIGN_SIZE 16 ++ ++template ++void printTensor(char* str, T* input, int size) { ++ printf("%s ",str); ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ ++ std::cout << input_host[k] << ","; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ } ++ ++ std::cout << std::endl; ++ ++ free(input_host); ++} ++ ++template ++void isNan(char* str, T* input, int size) ++{ ++ std::cout << str << " " << " size is " << size; ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ if (std::isnan((float)input_host[k]) || std ::isinf((float)input_host[k])) { ++ std::cout << "found NAN or INF"; ++ break; ++ } ++ } ++ ++ std::cout << std::endl; ++ free(input_host); ++} ++ ++template ++size_t GetEncoderLayerWorkspaceSize(encoderParamT* param) ++{ ++ size_t attn_out = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE;; ++ size_t ffn = UP_DIV(param->batch_size * param->src_seq_len * param->ffn_hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t ffn_size = (param->layernorm_post) ? ffn : (attn_out + ffn); ++ size_t out_size = (param->layernorm_post) ? attn_out : attn_out * 2; ++ return (std::max(fastertransformer::GetAttnWorkspaceSize(&(param->attn)), ffn_size * sizeof(T)) + out_size * sizeof(T)); ++} ++ ++template size_t GetEncoderLayerWorkspaceSize(encoderParamT* param); ++template size_t GetEncoderLayerWorkspaceSize(encoderParamT* param); ++ ++template ++void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, encoderParamT* param, void* ws) ++{ ++ param->in_idx = 0; ++ size_t h_token_num = param->batch_size * param->src_seq_len; ++ T* from_tensor = reinterpret_cast(inputs[param->in_idx++]); ++ T* attn_out = reinterpret_cast(ws); ++ T* normed_from_tensor = reinterpret_cast(ws) + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ T* attn_ws_offset = (param->layernorm_post) ? 
reinterpret_cast(ws) : reinterpret_cast(normed_from_tensor); ++ T* attn_ws = attn_ws_offset + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ T* normed_attn_out = normed_from_tensor; ++ T* ffn_ws = normed_attn_out + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ ++ T* tmp_out = reinterpret_cast(output[0]); ++ if (std::is_same::value && param->ffn_fp16==true) { ++ tmp_out = ffn_ws + UP_DIV(param->batch_size * param->src_seq_len * param->ffn_hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ } ++ if (param->layernorm_post == false) { ++ T* gamma1 = reinterpret_cast(inputs[param->in_idx++]); ++ T* beta1 = reinterpret_cast(inputs[param->in_idx++]); ++ ++ invokeGeneralLayerNorm(normed_from_tensor, ++ reinterpret_cast(from_tensor), // from tensor ++ gamma1, // Gamma ++ beta1, // Beta ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps1); ++ } ++ else { ++ normed_from_tensor = from_tensor; ++ } ++ inputs[--param->in_idx] = normed_from_tensor; ++ // if attention is embedded inside an encoder - fuse the bias to next layer normalization ++ int in_idx = param->in_idx; ++ fastertransformer::forward_attn(reinterpret_cast(&inputs[param->in_idx]), in_len, &attn_out, 1, &(param->attn), attn_ws); ++ param->in_idx = param->attn.in_idx + in_idx; ++ if (param->projection_bias) { ++ T* projection_bias = reinterpret_cast(inputs[param->in_idx++]); ++ T* gamma2 = reinterpret_cast(inputs[param->in_idx++]); ++ T* beta2 = reinterpret_cast(inputs[param->in_idx++]); ++ if (param->layernorm_post == false) { ++ if (std::is_same::value || param->ffn_fp16==false) { ++ invokeGeneralAddBiasResidualPreLayerNorm(attn_out, ++ normed_attn_out, ++ from_tensor, ++ gamma2, // gamma ++ beta2, // beta ++ projection_bias, ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps2); ++ } else { ++ invokeGeneralAddBiasResidualPreLayerNormCast(attn_out, ++ reinterpret_cast(normed_attn_out), ++ from_tensor, ++ gamma2, // gamma ++ beta2, // beta ++ projection_bias, ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps2); ++ } ++ } else { ++ if (std::is_same::value || param->ffn_fp16==false) { ++ invokeAddBiasResidualLayerNorm( ++ attn_out, ++ from_tensor, ++ projection_bias, ++ gamma2, // gamma ++ beta2, // beta ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps1); ++ normed_attn_out = attn_out; ++ } else { ++ invokeAddBiasResidualLayerNormCast( ++ reinterpret_cast(attn_out), ++ reinterpret_cast(normed_attn_out), ++ reinterpret_cast(from_tensor), ++ projection_bias, ++ gamma2, // gamma ++ beta2, // beta ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps1); ++ // isNan((char*)"LN 1 model", (half*)attn_out, h_token_num * param->hidden_size); ++ } ++ } ++ } ++ else { ++ // without projection bias ++ } ++ // forward ffn ++ // simulate attention inputs ++ inputs[--param->in_idx] = normed_attn_out; ++ if (param->ffn_fp16==false) { ++ fastertransformer::forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, param, ffn_ws); ++ } else { ++ fastertransformer::forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, param, ffn_ws); ++ } ++ if (param->layernorm_post == true) { ++ if (std::is_same::value || param->ffn_fp16==false) { ++ invokeAddBiasResidualLayerNorm(reinterpret_cast(tmp_out), ++ attn_out, ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias, ++ reinterpret_cast(inputs[param->in_idx++]), // Gamma ++ reinterpret_cast(inputs[param->in_idx++]), // Beta 
++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps2); ++ ++ } else { ++ invokeAddBiasResidualLayerNormCast( ++ reinterpret_cast(tmp_out), ++ reinterpret_cast(output[0]), ++ reinterpret_cast(normed_attn_out), ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias, ++ reinterpret_cast(inputs[param->in_idx++]), // Gamma ++ reinterpret_cast(inputs[param->in_idx++]), // Beta ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps2); ++ } ++ } else { ++ if (std::is_same::value || param->ffn_fp16==false) { ++ invokeAddBiasResidual(reinterpret_cast(tmp_out), ++ attn_out, ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias ++ h_token_num, ++ param->hidden_size, ++ param->stream); ++ } ++ else { ++ invokeAddBiasResidualCast(reinterpret_cast(tmp_out), ++ reinterpret_cast(attn_out), ++ reinterpret_cast(output[0]), ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias ++ h_token_num, ++ param->hidden_size, ++ param->stream); ++ } ++ } ++ ++ return; ++} ++ ++template void ++forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, encoderParamT* param, void* ws); ++template void ++forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, encoderParamT* param, void* ws); ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/encoder.h b/src/fastertransformer/layers/ms_layers/encoder.h +new file mode 100644 +index 0000000..081ef49 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/encoder.h +@@ -0,0 +1,16 @@ ++#pragma once ++ ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/layers/ms_layers/BaseMSLayer.h" ++#include "src/fastertransformer/layers/ms_layers/param.h" ++#include ++#include ++ ++namespace fastertransformer { ++ ++template ++size_t GetEncoderLayerWorkspaceSize(encoderParamT* param); ++ ++template ++void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, encoderParamT* param, void* ws); ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/ffn.cc b/src/fastertransformer/layers/ms_layers/ffn.cc +new file mode 100644 +index 0000000..e76d6fd +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/ffn.cc +@@ -0,0 +1,113 @@ ++ ++#include "src/fastertransformer/layers/ms_layers/ffn.h" ++#include "src/fastertransformer/layers/ms_layers/gemm.h" ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/kernels/add_residual_kernels.h" ++#include "src/fastertransformer/kernels/layernorm_kernels.h" ++#include "src/fastertransformer/kernels/unfused_attention_kernels.h" ++#include ++namespace fastertransformer { ++ ++template ++void printTensor(char* str, T* input, int size) { ++ printf("%s ",str); ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ ++ std::cout << input_host[k] << ","; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ } ++ ++ std::cout << std::endl; ++ ++ free(input_host); ++} ++ ++template ++void isNan(char* str, T* input, int size) ++{ ++ std::cout << str << " " << " size is " << size; ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ if (std::isnan((float)input_host[k]) || std 
::isinf((float)input_host[k])) { ++ std::cout << "found NAN or INF"; ++ break; ++ } ++ } ++ ++ std::cout << std::endl; ++ free(input_host); ++} ++ ++template ++void forward_ffn(T* inputs[], int in_len, T* output[], int out_len, ParamT* param, void* ws) ++{ ++ size_t inter_size = param->ffn_hidden_size; ++ size_t h_token_num = param->batch_size * param->src_seq_len; ++ cublasOperation_t gemm_ops[] = {CUBLAS_OP_N, CUBLAS_OP_N}; ++ cudaDataType gemm_data_types[] = {CUDA_R_32F, CUDA_R_32F, CUDA_R_32F}; ++ if ((std::is_same::value) || (std::is_same::value)) { ++ gemm_data_types[0] = CUDA_R_16F; ++ gemm_data_types[1] = CUDA_R_16F; ++ gemm_data_types[2] = CUDA_R_16F; ++ } ++ S alpha = 1.0f; ++ S beta = 0.0f; ++ ++ int gemm_dims[] = {(int)inter_size, (int)h_token_num, (int)param->hidden_size}; ++ int gemm_lds[] = {(int)inter_size, (int)param->hidden_size, (int)inter_size}; ++ T* normed_attn_out = reinterpret_cast(inputs[param->in_idx++]); ++ fastertransformer::CublasGemmWrapper(inputs[param->in_idx++], ++ normed_attn_out, ++ ws, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_data_types, ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++ invokeAddBiasGelu(reinterpret_cast(ws), ++ reinterpret_cast(inputs[param->in_idx++]), ++ h_token_num, ++ inter_size, ++ param->stream); ++ gemm_dims[0] = param->hidden_size; ++ gemm_dims[1] = h_token_num; ++ gemm_dims[2] = inter_size; ++ gemm_lds[0] = param->hidden_size; ++ gemm_lds[1] = inter_size; ++ gemm_lds[2] = param->hidden_size; ++ fastertransformer::CublasGemmWrapper(inputs[param->in_idx++], ++ ws, ++ output[0], ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_data_types, ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++} ++ ++ ++template void ++forward_ffn(float* inputs[], int in_len, float* output[], int out_len, ParamT* param, void* ws); ++template void ++forward_ffn(half* inputs[], int in_len, half* output[], int out_len, ParamT* param, void* ws); ++template void ++forward_ffn(float* inputs[], int in_len, float* output[], int out_len, ParamT* param, void* ws); ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/ffn.h b/src/fastertransformer/layers/ms_layers/ffn.h +new file mode 100644 +index 0000000..196b5a1 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/ffn.h +@@ -0,0 +1,14 @@ ++#pragma once ++ ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/layers/ms_layers/BaseMSLayer.h" ++#include "src/fastertransformer/layers/ms_layers/param.h" ++ ++#include ++#include ++ ++namespace fastertransformer { ++ ++template ++void forward_ffn(T* inputs[], int in_len, T* output[], int out_len, ParamT* param, void* ws); ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/gemm.cc b/src/fastertransformer/layers/ms_layers/gemm.cc +new file mode 100644 +index 0000000..aabafb7 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/gemm.cc +@@ -0,0 +1,117 @@ ++ ++#include "src/fastertransformer/layers/ms_layers/gemm.h" ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/kernels/unfused_attention_kernels.h" ++#include ++namespace fastertransformer { ++ ++void CublasGemmWrapper(const void* a_addr, ++ const void* b_addr, ++ void* c_addr, ++ const int* params, ++ const int* lds, ++ const cublasOperation_t* operations, ++ const cudaDataType* data_types, ++ void* alpha, ++ void* beta, ++ cublasHandle_t cublas_handle, ++ cublasGemmAlgo_t algo) ++{ ++ const int m = 
params[0]; ++ const int n = params[1]; ++ const int k = params[2]; ++ cublasOperation_t trans_a = operations[0]; ++ cublasOperation_t trans_b = operations[1]; ++ const int lda = lds[0]; ++ const int ldb = lds[1]; ++ const int ldc = lds[2]; ++ cudaDataType type_a = data_types[0]; ++ cudaDataType type_b = data_types[1]; ++ cudaDataType type_c = data_types[2]; ++ cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F_FAST_TF32; ++ // cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F_FAST_16F; ++ if ((type_a == CUDA_R_16F) && (type_b == CUDA_R_16F) && (type_c == CUDA_R_16F)) { ++ compute_type = CUBLAS_COMPUTE_16F; ++ } ++ cublasGemmEx(cublas_handle, ++ trans_a, ++ trans_b, ++ m, ++ n, ++ k, ++ alpha, ++ a_addr, ++ type_a, ++ lda, ++ b_addr, ++ type_b, ++ ldb, ++ beta, ++ c_addr, ++ type_c, ++ ldc, ++ compute_type, ++ algo); ++} ++ ++void CublasGemmStridedBatchedWrapper(const void* a_addr, ++ const void* b_addr, ++ void* c_addr, ++ const int* params, ++ const int* lds, ++ const cublasOperation_t* operations, ++ const int* strides, ++ const cudaDataType* data_types, ++ void* alpha, ++ void* beta, ++ int batch, ++ cublasHandle_t cublas_handle, ++ cublasGemmAlgo_t algo) ++{ ++ const int m = params[0]; ++ const int n = params[1]; ++ const int k = params[2]; ++ cublasOperation_t trans_a = operations[0]; ++ cublasOperation_t trans_b = operations[1]; ++ const int lda = lds[0]; ++ const int ldb = lds[1]; ++ const int ldc = lds[2]; ++ cudaDataType type_a = data_types[0]; ++ cudaDataType type_b = data_types[1]; ++ cudaDataType type_c = data_types[2]; ++ cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F_FAST_TF32; ++ // cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F_FAST_16F; ++ ++ if ((type_a == CUDA_R_16F) && (type_b == CUDA_R_16F) && (type_c == CUDA_R_16F)) { ++ compute_type = CUBLAS_COMPUTE_16F; ++ } ++ const int stride_a = strides[0]; ++ const int stride_b = strides[1]; ++ const int stride_c = strides[2]; ++ cublasGemmStridedBatchedEx(cublas_handle, ++ trans_a, ++ trans_b, ++ m, ++ n, ++ k, ++ alpha, ++ a_addr, ++ type_a, ++ lda, ++ stride_a, ++ b_addr, ++ type_b, ++ ldb, ++ stride_b, ++ beta, ++ c_addr, ++ type_c, ++ ldc, ++ stride_c, ++ batch, ++ compute_type, ++ algo); ++} ++ ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/gemm.h b/src/fastertransformer/layers/ms_layers/gemm.h +new file mode 100644 +index 0000000..a0f6698 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/gemm.h +@@ -0,0 +1,13 @@ ++#pragma once ++ ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/layers/ms_layers/BaseMSLayer.h" ++#include ++#include ++ ++namespace fastertransformer { ++ ++void CublasGemmWrapper(const void* a_addr, const void* b_addr, void* c_addr, const int* params, const int* lds, const cublasOperation_t* operations, const cudaDataType* data_types, void* alpha, void* beta, cublasHandle_t cublas_handle, cublasGemmAlgo_t algo); ++void CublasGemmStridedBatchedWrapper(const void* a_addr, const void* b_addr, void* c_addr, const int* params, const int* lds, const cublasOperation_t* operations, const int* strides, const cudaDataType* data_types, void* alpha, void* beta, int batch, cublasHandle_t cublas_handle, cublasGemmAlgo_t algo); ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/param.h b/src/fastertransformer/layers/ms_layers/param.h +new file mode 100644 +index 0000000..a76004c +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/param.h +@@ -0,0 +1,52 
@@ ++#pragma once ++typedef struct{ ++ public: ++ size_t batch_size; ++ size_t src_seq_len; ++ size_t tgt_seq_len; ++ size_t head_num; ++ size_t head_size; ++ size_t hidden_size; ++ size_t h_token_num; ++ size_t ffn_hidden_size; ++ // handle ++ cublasHandle_t cublas_handle; ++ cudaStream_t stream; ++ cublasGemmAlgo_t algo; ++ // ctrls ++ int *padding_offset; ++ int in_idx; ++ ++} ParamT; ++ ++typedef struct : ParamT{ ++ bool qkv_bias; // ture ++ bool projection_bias; // ture ++ bool is_cross; // false ++ bool position_bias; ++ int *padding_offset; ++} attentionParamT; ++ ++typedef struct : ParamT{ ++ ++ bool ffn_fp16; ++ float eps1; ++ float eps2; ++ float eps3; ++ bool projection_bias; // ture ++ mutable attentionParamT attn1; ++ mutable attentionParamT attn2; ++ bool layernorm_post; ++ int *padding_offset; ++} decoderParamT; ++ ++typedef struct : ParamT{ ++ ++ bool ffn_fp16; ++ float eps1; ++ float eps2; ++ bool projection_bias; // ture ++ mutable attentionParamT attn; ++ bool layernorm_post; ++ int *padding_offset; ++} encoderParamT; diff --git a/src/fastertransformer/models/CMakeLists.txt b/src/fastertransformer/models/CMakeLists.txt index af33e76..97fc471 100644 --- a/src/fastertransformer/models/CMakeLists.txt diff --git a/trc/transformer/MultiHeadTester.py b/trc/transformer/MultiHeadTester.py old mode 100644 new mode 100755 index d04f3bfa842..97feeecc433 --- a/trc/transformer/MultiHeadTester.py +++ b/trc/transformer/MultiHeadTester.py @@ -1773,7 +1773,6 @@ class TransformerDecoderLayerX(Cell): cross_attn_output = None if encoder_output is not None: middle_output = self.cross_attention_layernorm(x) - middle_output = F.cast(middle_output, self.dtype) encoder_output = F.cast(encoder_output, self.dtype) cross_attn_output, cross_layer_present = self.cross_attention(middle_output, encoder_output, diff --git a/trc/transformer/cfg_bert.config b/trc/transformer/cfg_bert.config index dfedd52066a..99e4f5bd9ab 100755 --- a/trc/transformer/cfg_bert.config +++ b/trc/transformer/cfg_bert.config @@ -1,2 +1,2 @@ [gpu_context] -input_shape=input_ids:[1,512];token_type_ids:[1,512];input_mask:[1,512] +input_shape=input_ids:[transformer_encoder_layer,128];token_type_ids:[transformer_encoder_layer,128];input_mask:[transformer_encoder_layer,128] diff --git a/trc/transformer/convert_fp32.sh b/trc/transformer/convert_fp32.sh index af5b5b1d851..054adffc284 100755 --- a/trc/transformer/convert_fp32.sh +++ b/trc/transformer/convert_fp32.sh @@ -2,7 +2,7 @@ base=`git rev-parse --show-toplevel` version=$(cat ${base}/version.txt) file_name=$(basename $1) file_name="${file_name%.*}" -#dbg="gdb --args " +# dbg="gdb --args " #GLOG_v=0 \ lib_base=${base}/trc/system_test/release/ubuntu_x86/mindspore-lite-${version}-linux-x64 @@ -14,6 +14,7 @@ ${dbg} ${base}/trc/system_test/release/ubuntu_x86/mindspore-lite-${version}-linu --outputFile=${base}/trc/transformer/convv_${file_name} \ --configFile=${base}/trc/transformer/t.config \ --encryption=false \ + --optimizeTransformer=true \ --exportMindIR=MINDIR if [ "${dbg}" = "" ] @@ -25,5 +26,6 @@ ${base}/trc/system_test/release/ubuntu_x86/mindspore-lite-${version}-linux-x64/t --modelFile=$1 \ --outputFile=${base}/trc/transformer/convv_${file_name} \ --configFile=${base}/trc/transformer/t.config \ - --encryption=false + --encryption=false \ + --optimizeTransformer=true fi diff --git a/trc/transformer/deploy.sh b/trc/transformer/deploy.sh index d82d9a7b26c..6a3cda3ef60 100755 --- a/trc/transformer/deploy.sh +++ b/trc/transformer/deploy.sh @@ -1,38 +1,29 @@ #!/bin/bash + base=`git 
rev-parse --show-toplevel` version=$(cat ${base}/version.txt) system=${base}/trc/system_test/release/ubuntu_x86/mindspore-lite-${version}-linux-x64 benchmark=${system}/tools/benchmark/benchmark server=caspi gpu_id=2 -while getopts "c" opt ; do - case "${opt}" in - c) - compress="_compress" ;; - *) - echo "Unknown option ${opt}!" ;; - esac - done -shift $(($OPTIND - 1)) # move files to caspi model=${1%.mindir} model=${model#convv_} model=$(echo ${model}| sed 's/_fwd//') -model=$(echo ${model}| sed 's/_graph//') batch_size=$(echo ${model}| sed 's/bert//') echo "model=${model}" - model_name=$(echo ${model}) +model_name=$(echo ${model}) if [[ "$batch_size" != "${model}" ]];then - model_name='bert' + model_name='bert' fi -if [ "${batch_size}" == "" ] +if [ "${batch_size}" == "" ] then - batch_size=$(echo "1") + batch_size=$(echo "1") fi echo "batch_size=${batch_size}" echo "model_name=${model_name}" -dir1=$(dirname $(realpath $1)) -ssh ${server} "mkdir -p ${dir1}" +dir=$(dirname $(realpath $1)) +ssh ${server} "mkdir -p ${dir}" dir=$(dirname ${benchmark}) ssh ${server} "mkdir -p ${dir}" dir=${system}/runtime/lib @@ -43,15 +34,13 @@ rsync -v $1 ${server}:$(realpath $1) rsync -v ${benchmark} ${server}:${benchmark} rsync -vl ${system}/runtime/lib/* ${server}:${system}/runtime/lib/ rsync -vl ${system}/tools/converter/lib/* ${server}:${system}/tools/converter/lib/ -rsync -vr ${dir1}/convv_${model}_fwd_graph_variables ${server}:${dir1} -#echo -e "[gpu_context]\ninput_shape=input_ids:[${batch_size},128];token_type_ids:[${batch_size},128];input_mask:[${batch_size},128]" > ./cfg_bert.config -rsync -v cfg_${model}.config ${server}:$(realpath "cfg_${model}.config") - +echo -e "[gpu_context]\ninput_shape=input_ids:[${batch_size},128];token_type_ids:[${batch_size},128];input_mask:[${batch_size},128]" > ./cfg_bert.config +rsync -v cfg_${model_name}.config ${server}:$(realpath "cfg_${model_name}.config") # this should be more general ! + # output_files=$(find . -maxdepth 1 -name ${model}_compress_output"*.txt*" | sort -n) -output_files=$(find . -maxdepth 1 -name ${model}${compress}_output"*.txt*" | sort -n) +output_files=$(find . -maxdepth 1 -name ${model}_output"*.txt*" | sort -n) input_files=$(find . 
-maxdepth 1 -name ${model}_input"*.fp32" | sort -n) - rsync -v ${input_files} ${output_files} ${server}:${PWD} echo ${server}:${PWD} echo ${input_files} @@ -62,11 +51,12 @@ command+="LD_LIBRARY_PATH=${system}/runtime/lib:${system}/tools/converter/lib CU command+="${benchmark} --modelFile=$1 --numThreads=1 --warmUpLoopCount=10 --loopCount=1000 --modelType=MindIR " if [ "$2" == "" ] then - command+="--inDataFile=\"${input_files}\"" - command+=" --benchmarkDataFile=\"${output_files}\" " + command+="--inDataFile=\"${input_files}\"" + command+=" --benchmarkDataFile=\"${output_files}\" " fi -if [ -f cfg_${model}.config ]; then - command+="--configFile=cfg_${model}.config " + +if [ -f cfg_${model_name}.config ]; then + command+="--configFile=cfg_${model_name}.config " fi command+="--device=GPU " #command+="--enableFp16=true" @@ -74,3 +64,5 @@ echo command=${command} echo ${command} > execute.sh rsync -v execute.sh ${server}:${PWD} ssh ${server} ${command} + + \ No newline at end of file diff --git a/trc/transformer/ftBench.py b/trc/transformer/ftBench.py index ff3f76b309b..8e80cbdf911 100755 --- a/trc/transformer/ftBench.py +++ b/trc/transformer/ftBench.py @@ -19,7 +19,7 @@ suffix="fp32" usage='enter the correct parameters: app=ch\\trc, act=runtime\\be, loop count=int>=0, server=local\\num of server\nif app=trc and act=be loop count must be 1' app='ch' act='be' -cuda_visible_dev=3 +cuda_visible_dev=2 loop_count=1 if len(sys.argv)>2 or len(sys.argv)==1: parameters=sys.argv[1:] @@ -53,20 +53,25 @@ for i in range(len(parameters)) : print('loop count=',loop_count) inputs_file = open("models.txt") models_arg = inputs_file.readlines() -# import subprocess -def find_output_name(mindir_model, output_file): - output_name = os.popen(f"../readers/mindir/readir {mindir_model} -O").read() - print(output_name) - output_name = output_name[:-1] - print(output_name) +def find_output_name(ms_model, output_file): + os.system(f"../readers/flatbuf/readfb {ms_model} > readmodel.txt") + file = open('readmodel.txt', 'r') + lines = file.readlines() + file.close() + line_of_output = [i for i,s in enumerate(lines) if "outputs:#" in s][0] + outputs = lines[line_of_output+1].split() + outpus_name=[] + for out in outputs: + output = [i for i,s in enumerate(lines) if "tensor #"+out in s][0] + output_name = lines[output+2].split()[2] + outpus_name.append(output_name) with open(output_file, 'r') as file: data = file.read() - for i,out in enumerate([output_name]): - print(out) - data = data.replace('output'+str(i+1), out) + for i,out1 in enumerate(outpus_name): + data = data.replace('output'+str(i+1), out1) with open(output_file, 'w') as file: file.write(data) - print(output_name) + print(outpus_name) for line_model_arg in models_arg: if line_model_arg[0] == '#' or line_model_arg == '\n': continue line_model_arg=line_model_arg[:-1] @@ -91,7 +96,7 @@ for line_model_arg in models_arg: if model_name=="bert": if batch_size!='1': model_name+=batch_size - os.system(f"rm -rf {base}/trc/transformer/{model_name}* {base}/trc/transformer/convv_{model_name}*") + os.system(f"rm -f {base}/trc/transformer/{model_name}* {base}/trc/transformer/convv_{model_name}*") ret = os.system(f"docker run --user \"$(id -u):$(id -g)\" -w {base}/trc/transformer --runtime=nvidia -v {base}/../:{base}/../ -v /opt/share:/opt/share --privileged=true {image} python {base}/trc/transformer/train_transformer_export.py {line_model_arg} " ) ret=0 if ret != 0: exit() @@ -108,7 +113,7 @@ for line_model_arg in models_arg: output_file=f'{work_dir}/{model_name}_output.txt' 
benchmark_args=f'--modelFile={work_dir}/convv_{model_name}_fwd.mindir --loopCount={loop_count} --modelType=MindIR --inDataFile="{input_files}" --benchmarkDataFile={output_file} --device=GPU --enableFp16={enable_fp16}' os.system(f"./convert_fp32.sh {model_name}_fwd.mindir") - find_output_name(f'convv_{model_name}_fwd.mindir', f'{model_name}_output.txt') + find_output_name(f'convv_{model_name}_fwd.ms', f'{model_name}_output.txt') if server == 'local': os.system('./trc/release.sh x86') os.system(f"cd {benchmark} && CUDA_VISIBLE_DEVICES={cuda_visible_dev} LD_LIBRARY_PATH={system}/runtime/lib:{system}/tools/converter/lib ./benchmark {benchmark_args}" ) @@ -125,7 +130,7 @@ for line_model_arg in models_arg: os.system(f"ssh {server} 'cd {system}/.. && tar -xzf {system}/../mindspore-lite-{version}-linux-x64.tar.gz'") os.system(f"rsync -v {base}/trc/transformer/*{model_name}* {server}:{base}/trc/transformer/") os.system(f"./deploy.sh convv_{model_name}_fwd.mindir") - os.system(f"ssh {server} 'cd {benchmark} && CUDA_VISIBLE_DEVICES={cuda_visible_dev} LD_LIBRARY_PATH={system}/runtime/lib:{system}/tools/converter/lib ./benchmark {benchmark_args}'" ) + # os.system(f"ssh {server} 'cd {benchmark} && CUDA_VISIBLE_DEVICES={cuda_visible_dev} LD_LIBRARY_PATH={system}/runtime/lib:{system}/tools/converter/lib ./benchmark {benchmark_args}'" ) elif app=='trc': #if loop count =1 app=be else app = runtime diff --git a/trc/transformer/get_output_by_mindir.py b/trc/transformer/get_output_by_mindir.py old mode 100644 new mode 100755 diff --git a/trc/transformer/models.txt b/trc/transformer/models.txt index aa455b6e0a6..6446fefc967 100755 --- a/trc/transformer/models.txt +++ b/trc/transformer/models.txt @@ -11,20 +11,24 @@ #-b 32 -l 12 -H 12 -S 768 -s 128 -P 0 -f 3072 -m bert #-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -f 3072 -m bert --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -m transformer_decoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -m transformer_decoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer #-b 1 -l 66 -s 20 -H 3 -S 15 -p 0 -m mha_x1 #-b 1 -l 24 -H 16 -S 1024 -s 128 -P 1 -m bert #-b 8 -l 24 -H 16 -S 1024 -s 128 -P 1 -m bert -#-b 16 -l 24 -H 16 -S 1024 -s 128 -P 1 -m bert +#-b 16 -l 24 -H 16 -S 1024 -s 128 -P 0 -m bert #-b 32 -l 24 -H 16 -S 1024 -s 128 -P 1 -m bert #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_encoder_layer + + #-b 1 -l 66 -s 1 -H 8 -S 512 -p 0 -m mha_x1 #-b 3 -l 66 -s 20 -H 3 -S 15 -p -m mha_x2 #-b 3 -l 66 -s 20 -t 40 -H 3 -S 15 -p 0 -m mha_x1 #-b 1 -l 66 -s 128 -H 4 -S 1024 -p 0 -m mha_x1 -#-b 1 -l 6 -s 128 -H 8 -S 1024 -m T5 +#-b 1 -l 2 -s 128 -H 2 -S 8 -m T5 #-b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -m transformer_encoder_layer #-b 1 -l 12 -H 4 -S 512 -s 128 -f 3072 -P 1 -m transformer_encoder_layer @@ -113,4 +117,4 @@ #-b 64 -l 12 -H 12 -S 768 -s 128 -m bert #-b 64 -l 24 -H 16 -S 1024 -s 128 -m bert --b 64 -l 24 -H 16 -S 1024 -s 512 -m bert +#-b 64 -l 24 -H 16 -S 1024 -s 512 -m bert diff --git a/trc/transformer/t.config b/trc/transformer/t.config index f26391171a8..0ecc92cc5ec 100755 --- a/trc/transformer/t.config +++ b/trc/transformer/t.config @@ -1,3 +1,4 @@ [registry] #fusion_blacklists="MultiHeadAttentionFusion" -fusion_blacklists="EncoderLayerFusion", "DecoderLayerFusion" +#fusion_blacklists="EncoderLayerFusion" +#fusion_blacklists="DecoderLayerFusion" diff --git a/trc/transformer/test_tr.py 
b/trc/transformer/test_tr.py old mode 100644 new mode 100755 diff --git a/trc/transformer/train_transformer_export.py b/trc/transformer/train_transformer_export.py index 16a3b11bae2..c388c7c8211 100755 --- a/trc/transformer/train_transformer_export.py +++ b/trc/transformer/train_transformer_export.py @@ -328,12 +328,13 @@ def transformer_encoder_layer_create(): out_name='output1' print("name output:",out_name) saveCalib(out_name, np.array(y), f_y) - if compress: - f_cy=open(f'./{name}_compress_output.txt','w') - y_num = y.asnumpy() - y_num[:,actual_seq:,:] = 0 - y = M.Tensor.from_numpy(y_num) - saveCalib(out_name, np.array(y), f_cy) #2 dims + f_y.close() + # if compress: + # f_cy=open(f'./{name}_compress_output.txt','w') + # y_num = y.asnumpy() + # y_num[:,actual_seq:,:] = 0 + # y = M.Tensor.from_numpy(y_num) + # saveCalib(out_name, np.array(y), f_cy) #2 dims print("y.shape",np.array(y).shape) # saveCalib('Default/Add-op267', np.array(y), f_y)#2 dims @@ -341,6 +342,78 @@ def transformer_encoder_layer_create(): saveT(y, name + "_output1.fp" + suffix) +def transformer_encoder_layer_T5_create(): + post_layernorm=False + name = "transformer_encoder_layer_T5" + model = T5_TF.TransformerEncoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, seq_length=seq, + num_heads=head_num, post_layernorm_residual=post_layernorm, has_bias=False) + encoder_input_value = M.Tensor(np.random.normal(0., 0.5, (batch, seq, hid_size)), M.float32) + encoder_input_mask = M.Tensor(np.random.normal(0., 0.5, (batch, seq, seq)), M.float32) + pos = M.Tensor(np.random.normal(0., 0.5, (batch, head_num, seq, tgt_seq_len)), M.float32) + + # q = model.attention.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size + # k = model.attention.dense2.weight.asnumpy()#.transpose() + # v = model.attention.dense3.weight.asnumpy()#.transpose() + + # w = np.concatenate((q, k, v)) # 3xhid_size x hid_size + # w = w.transpose() # hid_size x 3xhid_size + # wt = M.Tensor(w, w_compute_type) + # bq = model.attention.dense1.bias.asnumpy() + # bk = model.attention.dense2.bias.asnumpy() + # bv = model.attention.dense3.bias.asnumpy() + # bw = np.concatenate((bq, bk, bv)) #(3xhid) X 1 + # bt =M.Tensor(bw, w_compute_type) + # wp = model.attention.projection.weight + # bp = model.attention.projection.bias + # omw = model.output.mapping.weight + # opw = model.output.projection.weight + # omb = model.output.mapping.bias + # opb = model.output.projection.bias + # gl1 = model.layernorm1.gamma + # bl1 = model.layernorm1.beta + # gl2 = model.layernorm2.gamma + # bl2 = model.layernorm2.beta + + suffix = str(compute_type) + suffix = suffix[-2:] + saveT(encoder_input_value, name + "_input1.fp" + suffix) + saveT(encoder_input_mask, name + "_input2.fp" + suffix) + saveT(pos, name + "_input3.fp" + suffix) + # saveT(gl1, name + "_weight1.fp" + suffix) + # saveT(bl1, name + "_weight2.fp" + suffix) + # saveT(wt, name + "_weight3.fp" + suffix) + # saveT(bt, name + "_weight4.fp" + suffix) + # saveT(wp, name + "_weight5.fp" + suffix) + # saveT(bp, name + "_weight6.fp" + suffix) + # saveT(gl2, name + "_weight7.fp" + suffix) + # saveT(bl2, name + "_weight8.fp" + suffix) + # if ffn_fp16 == True: + # saveTensorToHalf(omw, name + "_weight9.fp" + "16") + # saveTensorToHalf(omb, name + "_weight10.fp" + "16") + # saveTensorToHalf(opw, name + "_weight11.fp" + "16") + # else: + # saveT(omw, name + "_weight9.fp" + suffix) + # saveT(omb, name + "_weight10.fp" + suffix) + # saveT(opw, name + "_weight11.fp" + suffix) + # saveT(opb, name + 
"_weight12.fp" + suffix) + _cell_graph_executor.compile(model, + encoder_input_value, + encoder_input_mask, + pos) + y = model(encoder_input_value, encoder_input_mask, position_bias = pos) + + export(model, encoder_input_value, encoder_input_mask, pos, file_name= name + "_fwd", file_format='MINDIR') + if app=="ch": + f_y=open(f'./{name}_output.txt','w') + out_name='output1' + print("name output:",out_name) + saveCalib(out_name, np.array(y), f_y) + print("y.shape",np.array(y).shape) + # saveCalib('Default/Add-op267', np.array(y), f_y)#2 dims + + elif app=="trc": + saveT(y, name + "_output1.fp" + suffix) + def transformer_decoder_layer_t5_create(): @@ -348,62 +421,63 @@ def transformer_decoder_layer_t5_create(): if (post_layernorm): print("post_layernorm true") model = T5_TF.TransformerDecoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, - tgt_seq_length=tgt_seq_len,num_heads=head_num, post_layernorm_residual=True, use_past=False) + tgt_seq_length=tgt_seq_len,num_heads=head_num, post_layernorm_residual=True, use_past=False, has_bias=False) else: print("post_layernorm false") model = T5_TF.TransformerDecoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, - tgt_seq_length=tgt_seq_len,num_heads=head_num,use_past=False) + tgt_seq_length=tgt_seq_len,num_heads=head_num,use_past=False, has_bias=False) hidden_stats = M.Tensor(np.random.normal(0., 0.5, (batch, tgt_seq_len, hid_size)), M.float32) decoder_mask = M.Tensor(np.random.normal(0., 0.5, (batch, seq, seq)), M.float32) encoder_output = M.Tensor(np.random.normal(0., 0.5, (batch, seq, hid_size)), M.float32) memory_mask = M.Tensor(np.random.normal(0., 0.5, (batch, tgt_seq_len,seq)), M.float32) pos = M.Tensor(np.random.normal(0., 0.5, (batch, head_num, seq, tgt_seq_len)), M.float32) encoder_pos = M.Tensor(np.random.normal(0., 0.5, (batch, head_num, seq, tgt_seq_len)), M.float32) - q = model.attention.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size - k = model.attention.dense2.weight.asnumpy()#.transpose() - v = model.attention.dense3.weight.asnumpy()#.transpose() + + # encoder_pos = M.Tensor(np.random.normal(0., 0.5, (batch, head_num, seq, tgt_seq_len)), M.float32) + # q = model.attention.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size + # k = model.attention.dense2.weight.asnumpy()#.transpose() + # v = model.attention.dense3.weight.asnumpy()#.transpose() - w = np.concatenate((q, k, v)) # 3xhid_size x hid_size - w = w.transpose() # hid_size x 3xhid_size - wt = M.Tensor(w, w_compute_type) - bq = model.attention.dense1.bias.asnumpy() - bk = model.attention.dense2.bias.asnumpy() - bv = model.attention.dense3.bias.asnumpy() - bw = np.concatenate((bq, bk, bv)) #(3xhid) X 1 - bt =M.Tensor(bw, w_compute_type) - print('encoder_output=',encoder_output) - wp = model.attention.projection.weight - bp = model.attention.projection.bias + # w = np.concatenate((q, k, v)) # 3xhid_size x hid_size + # w = w.transpose() # hid_size x 3xhid_size + # wt = M.Tensor(w, w_compute_type) + # bq = model.attention.dense1.bias.asnumpy() + # bk = model.attention.dense2.bias.asnumpy() + # bv = model.attention.dense3.bias.asnumpy() + # bw = np.concatenate((bq, bk, bv)) #(3xhid) X 1 + # bt =M.Tensor(bw, w_compute_type) + # print('encoder_output=',encoder_output) + # wp = model.attention.projection.weight + # bp = model.attention.projection.bias - qt2 = model.cross_attention.dense1.weight#.transpose() # hid_size x hid_size - k2 = 
model.cross_attention.dense2.weight.asnumpy()#.transpose() - v2 = model.cross_attention.dense3.weight.asnumpy()#.transpose() + # qt2 = model.cross_attention.dense1.weight#.transpose() # hid_size x hid_size + # k2 = model.cross_attention.dense2.weight.asnumpy()#.transpose() + # v2 = model.cross_attention.dense3.weight.asnumpy()#.transpose() - w2 = np.concatenate((k2, v2)) # 3xhid_size x hid_size - w2 = w.transpose() # hid_size x 3xhid_size - wt2 = M.Tensor(w2, w_compute_type) - bq2 = model.cross_attention.dense1.bias.asnumpy() - bk2 = model.cross_attention.dense2.bias.asnumpy() - bv2 = model.cross_attention.dense3.bias.asnumpy() - bw2 = np.concatenate((bq2, bk2, bv2)) #(3xhid) X 1 - bt2 =M.Tensor(bw2, w_compute_type) - wp2 = model.cross_attention.projection.weight - bp2 = model.cross_attention.projection.bias - omw = model.output.mapping.weight - opw = model.output.projection.weight - omb = model.output.mapping.bias - opb = model.output.projection.bias - - gl1 = model.layernorm1.gamma - bl1 = model.layernorm1.beta - gl2 = model.layernorm2.gamma - bl2 = model.layernorm2.beta - gl3 = model.cross_attention_layernorm.gamma - bl3 = model.cross_attention_layernorm.beta - suffix = str(compute_type) - suffix = suffix[-2:] + # w2 = np.concatenate((k2, v2)) # 3xhid_size x hid_size + # w2 = w.transpose() # hid_size x 3xhid_size + # wt2 = M.Tensor(w2, w_compute_type) + # bq2 = model.cross_attention.dense1.bias.asnumpy() + # bk2 = model.cross_attention.dense2.bias.asnumpy() + # bv2 = model.cross_attention.dense3.bias.asnumpy() + # bw2 = np.concatenate((bq2, bk2, bv2)) #(3xhid) X 1 + # bt2 =M.Tensor(bw2, w_compute_type) + # wp2 = model.cross_attention.projection.weight + # bp2 = model.cross_attention.projection.bias + # omw = model.output.mapping.weight + # opw = model.output.projection.weight + # omb = model.output.mapping.bias + # opb = model.output.projection.bias + + # gl1 = model.layernorm1.gamma + # bl1 = model.layernorm1.beta + # gl2 = model.layernorm2.gamma + # bl2 = model.layernorm2.beta + # gl3 = model.cross_attention_layernorm.gamma + # bl3 = model.cross_attention_layernorm.beta + # suffix = str(compute_type) + # suffix = suffix[-2:] - print('qt2=',qt2[0]) # saveT(gl1, name + "_weight1.fp" + suffix) # saveT(bl1, name + "_weight2.fp" + suffix) # saveT(wt, name + "_weight3.fp" + suffix) @@ -430,36 +504,16 @@ def transformer_decoder_layer_t5_create(): saveT(decoder_mask, name + "_input2.fp" + suffix) saveT(encoder_output, name + "_input3.fp" + suffix) saveT(memory_mask, name + "_input4.fp" + suffix) - - saveT(gl1, name + "_weight1.fp" + suffix) - saveT(bl1, name + "_weight2.fp" + suffix) - saveT(wt, name + "_weight3.fp" + suffix) - saveT(bt, name + "_weight4.fp" + suffix) - saveT(wp, name + "_weight5.fp" + suffix) - saveT(bp, name + "_weight6.fp" + suffix) - saveT(gl2, name + "_weight7.fp" + suffix) - saveT(bl2, name + "_weight8.fp" + suffix) - saveT(qt2, name + "_weight9.fp" + suffix) - saveT(wt2, name + "_weight10.fp" + suffix) - saveT(bt2, name + "_weight11.fp" + suffix) - saveT(wp2, name + "_weight12.fp" + suffix) - saveT(bp2, name + "_weight13.fp" + suffix) - saveT(gl3, name + "_weight14.fp" + suffix) - saveT(bl3, name + "_weight15.fp" + suffix) - if(ffn_fp16): - saveTensorToHalf(omw, name + "_weight16.fp" + "16") - saveTensorToHalf(omb, name + "_weight17.fp" + "16") - saveTensorToHalf(opw, name + "_weight18.fp" + "16") - else: - saveT(omw, name + "_weight16.fp" + suffix) - saveT(omb, name + "_weight17.fp" + suffix) - saveT(opw, name + "_weight18.fp" + suffix) - saveT(opb, name + 
"_weight19.fp" + suffix) - # # if app == 'trc': - # # saveTensorToHalf(omw, name + "_weight9.fp" + "16") - # # saveTensorToHalf(omb, name + "_weight10.fp" + "16") - # # saveTensorToHalf(opw, name + "_weight11.fp" + "16") - # # elif app == 'ch': + saveT(pos, name + "_input5.fp" + suffix) + saveT(encoder_pos, name + "_input6.fp" + suffix) + # saveT(gl1, name + "_weight1.fp" + suffix) + # saveT(bl1, name + "_weight2.fp" + suffix) + # saveT(wt, name + "_weight3.fp" + suffix) + # saveT(bt, name + "_weight4.fp" + suffix) + # saveT(wp, name + "_weight5.fp" + suffix) + # saveT(bp, name + "_weight6.fp" + suffix) + # saveT(gl2, name + "_weight7.fp" + suffix) + # saveT(bl2, name + "_weight8.fp" + suffix) # saveT(qt2, name + "_weight9.fp" + suffix) # saveT(wt2, name + "_weight10.fp" + suffix) # saveT(bt2, name + "_weight11.fp" + suffix) @@ -467,23 +521,22 @@ def transformer_decoder_layer_t5_create(): # saveT(bp2, name + "_weight13.fp" + suffix) # saveT(gl3, name + "_weight14.fp" + suffix) # saveT(bl3, name + "_weight15.fp" + suffix) - # saveT(omw, name + "_weight16.fp" + suffix) - # saveT(omb, name + "_weight17.fp" + suffix) - # saveT(opw, name + "_weight18.fp" + suffix) + # if(ffn_fp16): + # saveTensorToHalf(omw, name + "_weight16.fp" + "16") + # saveTensorToHalf(omb, name + "_weight17.fp" + "16") + # saveTensorToHalf(opw, name + "_weight18.fp" + "16") + # else: + # saveT(omw, name + "_weight16.fp" + suffix) + # saveT(omb, name + "_weight17.fp" + suffix) + # saveT(opw, name + "_weight18.fp" + suffix) # saveT(opb, name + "_weight19.fp" + suffix) - _cell_graph_executor.compile(model, hidden_stats, decoder_mask, encoder_output, memory_mask)#, pos, encoder_pos) - y = model(hidden_stats, decoder_mask, encoder_output, memory_mask)#, position_bias=pos, encoder_decoder_position_bias = encoder_pos) - export(model, hidden_stats, decoder_mask, encoder_output, memory_mask, file_name= name + "_fwd", file_format='MINDIR') - # if app=="ch": + + _cell_graph_executor.compile(model, hidden_stats, decoder_mask, encoder_output, memory_mask, pos, encoder_pos) + y = model(hidden_stats, decoder_mask, encoder_output, memory_mask , position_bias=pos, encoder_decoder_position_bias = encoder_pos) + export(model, hidden_stats, decoder_mask, encoder_output, memory_mask, pos, encoder_pos, file_name= name + "_fwd", file_format='MINDIR') f_y=open(f'./{name}_output.txt','w') - # # out_name=get_output_encoder_layer(name + "_fwd.mindir") - # # print("name output:",out_name) saveCalib("output1", np.array(y), f_y)#2 dims - # # print("y.shpae",np.array(y).shape) - # # saveCalib('Default/Add-op267', y, f_y)#2 dims f_y.close() - # # saveCalib('Default/Reshape-op296', np.array(y), f_y)#2 dims - # # elif app=="trc": saveT(y, name + "_output1.fp" + suffix) -- Gitee From 27d9eb39db49961a7b60435b986bfc541c8be587 Mon Sep 17 00:00:00 2001 From: shira zaloshinki Date: Tue, 3 Jan 2023 12:35:27 +0200 Subject: [PATCH 11/39] encoder & decoder T5 --- .../kernel/nnacl/infer/decoder_layer_infer.c | 1 - .../plugin/device/cpu/kernel/nnacl/op_base.h | 1 + .../delegate/tensorrt/op/decoder_tensorrt.cc | 23 +- .../delegate/tensorrt/op/encoder_tensorrt.cc | 16 +- .../optimizer/fusion/decoder_layer_fusion.cc | 224 +++++++++--------- .../optimizer/fusion/decoder_layer_fusion.h | 2 + .../optimizer/fusion/encoder_layer_fusion.cc | 44 ++-- .../optimizer/fusion/encoder_layer_fusion.h | 1 + trc/transformer/MultiHeadTester.py | 0 trc/transformer/ftBench.py | 2 +- trc/transformer/get_output_by_mindir.py | 0 trc/transformer/models.txt | 6 +- trc/transformer/t.config | 3 
+-
 trc/transformer/test_tr.py                   |   0
 trc/transformer/train_transformer_export.py  | 192 ++++++++-------
 15 files changed, 260 insertions(+), 255 deletions(-)
 mode change 100644 => 100755 trc/transformer/MultiHeadTester.py
 mode change 100644 => 100755 trc/transformer/get_output_by_mindir.py
 mode change 100644 => 100755 trc/transformer/test_tr.py

diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.c
index f2f9ac344fe..24336c769da 100644
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.c
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.c
@@ -20,7 +20,6 @@ int DecoderLayerInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs,
                            size_t outputs_size, OpParameter *parameter) {
-  printf("DecoderLayerInferShape\n" );
   int check_ret = CheckAugmentWithMinSize(inputs, inputs_size, outputs, outputs_size, parameter, C23NUM, C1NUM);
   if (check_ret != NNACL_OK) {
     return check_ret;
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h
index 43f9f2c55fa..461af29199b 100644
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h
@@ -43,6 +43,7 @@
 #define C14NUM 14
 #define C15NUM 15
 #define C16NUM 16
+#define C17NUM 17
 #define C18NUM 18
 #define C19NUM 19
 #define C20NUM 20
diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc
index 17e382435af..c00a4de9ebf 100644
--- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc
+++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc
@@ -40,7 +40,7 @@ constexpr std::size_t kThree = 3;
 int DecoderTensorRT::IsSupport(const BaseOperatorPtr &base_operator, const std::vector<TensorInfo> &in_tensors,
                                const std::vector<TensorInfo> &out_tensors) {
-  if (in_tensors.size() != C23NUM) {
+  if (in_tensors.size() != C23NUM && in_tensors.size() != C16NUM) {
     MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size();
     return RET_ERROR;
   }
@@ -112,7 +112,7 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) {
   params.ffn_hidden_size = decoder_op->get_ffn_hidden_size();
   params.ffn_fp16 = is_ffn_fp16_;
   params.cublas_handle = GetCublasHandle();
-  params.projection_bias = true;
+  params.projection_bias = !decoder_op->get_position_bias1();
   params.attn1.head_num = params.head_num;
@@ -121,7 +121,7 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) {
   params.attn1.position_bias = decoder_op->get_position_bias1();
   params.attn1.qkv_bias = !params.attn1.position_bias;
   params.attn1.projection_bias = !params.attn1.position_bias;
-  params.attn1.is_cross = false;
+  params.attn1.is_cross = !params.attn1.position_bias;
   params.attn1.cublas_handle = GetCublasHandle();
   params.attn2.head_num = params.head_num;
@@ -135,8 +135,8 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) {
   auto compute_type = runtime_->GetRuntimePrecisionMode();
   if (is_ffn_fp16_) {
-    size_t start_fp16 = C15NUM;
-    size_t end_fp16 = C19NUM;
+    size_t start_fp16 = (params.attn1.position_bias) ? C14NUM : C19NUM;
+    size_t end_fp16 = (params.attn1.position_bias) ? 
C17NUM :C23NUM for (size_t i = 0; i < in_tensors_.size(); i++) { auto in_tensor = input(ctx, i); if (in_tensors_[i].IsConst() || in_tensor.trt_tensor_ == nullptr) { @@ -197,14 +197,11 @@ int DecoderPlugin::RunCudaDecoder(const nvinfer1::PluginTensorDesc *inputDesc, params_.attn1.algo = algoId; params_.attn2.stream = stream; params_.attn2.algo = algoId; - void *inputs_forward[] = { - const_cast(inputs[0]), const_cast(inputs[1]), const_cast(inputs[2]), - const_cast(inputs[3]), const_cast(inputs[4]), const_cast(inputs[5]), - const_cast(inputs[6]), const_cast(inputs[7]), const_cast(inputs[8]), - const_cast(inputs[9]), const_cast(inputs[10]), const_cast(inputs[11]), - const_cast(inputs[12]), const_cast(inputs[13]), const_cast(inputs[14]), const_cast(inputs[15]), - const_cast(inputs[16]), const_cast(inputs[17]), const_cast(inputs[18]), - const_cast(inputs[19]),const_cast(inputs[20]), const_cast(inputs[21]), const_cast(inputs[22])}; + void *inputs_forward[]; + void *inputs_forward[num_of_inputs_]; + for (int i=0; i < num_of_inputs_; i++){ + inputs_forward[i]=const_cast(inputs[i]); + } void *outputs_forward[] = {outputs[0]}; fastertransformer::forwardDecoder(inputs_forward, num_of_inputs_, outputs_forward, num_of_outputs_, ¶ms_, workspace); diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc index 5c3664b9465..514829b187f 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc @@ -41,7 +41,7 @@ constexpr std::size_t kThree = 3; // Multi Head Attention TensorRT op int EncoderTensorRT::IsSupport(const BaseOperatorPtr &base_operator, const std::vector &in_tensors, const std::vector &out_tensors) { - if (in_tensors.size() != C14NUM) { + if (in_tensors.size() != C14NUM || in_tensors.size() != C9NUM) { MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); return RET_ERROR; } @@ -120,6 +120,10 @@ int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { if (is_ffn_fp16_) { size_t start_fp16 = (params.layernorm_post) ? C7NUM : C9NUM; size_t end_fp16 = (params.layernorm_post) ? 
C11NUM : C13NUM; + if (params.position_bias) { + start_fp16 = C5NUM; + end_fp16 = C8NUM; + } for (size_t i = 0; i < in_tensors_.size(); i++) { auto in_tensor = input(ctx, i); if (in_tensors_[i].IsConst() || in_tensor.trt_tensor_ == nullptr) { @@ -178,12 +182,10 @@ int EncoderPlugin::RunCudaEncoder(const nvinfer1::PluginTensorDesc *inputDesc, void *const *outputs, void *workspace, cudaStream_t stream, cublasGemmAlgo_t algoId) { params_.stream = stream; params_.algo = algoId; - void *inputs_forward[] = { - const_cast(inputs[0]), const_cast(inputs[1]), const_cast(inputs[2]), - const_cast(inputs[3]), const_cast(inputs[4]), const_cast(inputs[5]), - const_cast(inputs[6]), const_cast(inputs[7]), const_cast(inputs[8]), - const_cast(inputs[9]), const_cast(inputs[10]), const_cast(inputs[11]), - const_cast(inputs[12]), const_cast(inputs[13])}; + void *inputs_forward[num_of_inputs_]; + for (int i=0; i < num_of_inputs_; i++){ + inputs_forward[i]=const_cast(inputs[i]); + } void *outputs_forward[] = {outputs[0]}; fastertransformer::forwardEncoder(inputs_forward, num_of_inputs_, outputs_forward, num_of_outputs_, ¶ms_, workspace); diff --git a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc index 6c61f05bb44..8c3baa3fb19 100644 --- a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc @@ -32,6 +32,7 @@ const auto &p1 = std::placeholders::_1; const size_t kWeightShapeSize = 2; const int kDecoderLayerOutputs = 1; } // namespace + bool DecoderLayerFusion::Init() const { hidden_stats_ = std::make_shared("input"); MS_CHECK_TRUE_RET(hidden_stats_ != nullptr, false); @@ -106,7 +107,7 @@ VectorRef DecoderLayerFusion::getTuple(bool post_layernorm, bool layernorm_fusio MS_CHECK_TRUE_RET(var1 != nullptr, {}); auto reshape1 = VectorRef({is_reshape1, hidden_stats_, var1}); VectorRef layer_norm, tuple; - if (layernorm_fusion) { + if (!layernorm_fusion) { return DefineLayerNorm(reshape1, gamma1_, beta1_); } layer_norm = VectorRef({is_layernorm1_, reshape1, gamma1_, beta1_}); @@ -144,44 +145,50 @@ VectorRef DecoderLayerFusion::DefineLayerNorm(VectorRef input, VarPtr gamma, Var VectorRef DecoderLayerFusion::DefinePatternDecoderLayer(bool post_layernorm = true, bool layernorm_fusion = false, bool is_position_bias = false) const { - std::cout << "DefinePatternDecoderLayer post=" << post_layernorm << " layernorm_fusion=" << layernorm_fusion - << std::endl; - std::cout << "attention no position bias" << std::endl; auto is_reshape1 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-decoder"); MS_CHECK_TRUE_RET(is_reshape1 != nullptr, {}); auto var1 = std::make_shared("var1-reshape"); MS_CHECK_TRUE_RET(var1 != nullptr, {}); auto reshape1 = VectorRef({is_reshape1, hidden_stats_, var1}); - auto attention = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), - getTuple(post_layernorm, layernorm_fusion, is_position_bias), - getTuple(post_layernorm, layernorm_fusion, is_position_bias), weight_attn_qkv_, - weight_attn_o_, bias_attn_qkv_, bias_attn_o_, mask_}); + VectorRef attention, attention_cross, add2, tuple2, tuple3, layer_norm3, add3, reshape4, matmul2; + if (is_position_bias) { + attention = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), + getTuple(post_layernorm, layernorm_fusion, is_position_bias), + getTuple(post_layernorm, layernorm_fusion, is_position_bias), 
weight_attn_qkv_, + weight_attn_o_, position_bias_,mask_}); + } else { + attention = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), + getTuple(post_layernorm, layernorm_fusion, is_position_bias), + getTuple(post_layernorm, layernorm_fusion, is_position_bias), weight_attn_qkv_, + weight_attn_o_, bias_attn_qkv_, bias_attn_o_, mask_}); + } auto is_tuple4 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_item4"); auto var_tuple4 = std::make_shared("var_tuple4"); auto tuple4 = VectorRef({is_tuple4, attention, var_tuple4}); auto is_add2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is_add2"); - VectorRef add2, layer_norm2, tuple2, tuple3, layer_norm3, add3, reshape4; if (post_layernorm) { add2 = VectorRef({is_add2, getTuple(post_layernorm, layernorm_fusion, is_position_bias), tuple4}); } else { add2 = VectorRef({is_add2, reshape1, tuple4}); } if (layernorm_fusion) { - layer_norm2 = DefineLayerNorm(add2, gamma2_, beta2_); - tuple2 = layer_norm2; - } else { - layer_norm2 = VectorRef({is_layernorm2_, add2, gamma2_, beta2_}); + auto layer_norm2 = VectorRef({is_layernorm2_, add2, gamma2_, beta2_}); auto is_tuple2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_item2"); auto var_tuple2 = std::make_shared("var_tuple2"); tuple2 = VectorRef({is_tuple2, layer_norm2, var_tuple2}); + } else { + tuple2 = DefineLayerNorm(add2, gamma2_, beta2_); } auto is_reshape2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-decoder2"); MS_CHECK_TRUE_RET(is_reshape2 != nullptr, {}); auto var2 = std::make_shared("var2"); MS_CHECK_TRUE_RET(var2 != nullptr, {}); auto reshape2 = VectorRef({is_reshape2, encoder_output_, var2}); - auto attention_cross = VectorRef({is_attention_cross_, tuple2, reshape2, reshape2, weight_attn_q_, weight_attn_kv_, - weight_attn_cross_o_, bias_attn_cross_qkv_, bias_attn_cross_o_, cross_mask_}); + if (is_position_bias) { + } else { + attention_cross = VectorRef({is_attention_cross_, tuple2, reshape2, reshape2, weight_attn_q_, weight_attn_kv_, + weight_attn_cross_o_, cross_mask_, position_bias_cross_}); + } auto is_tuple5 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_item5"); auto var_tuple5 = std::make_shared("var_tuple5"); auto tuple5 = VectorRef({is_tuple5, attention_cross, var_tuple5}); @@ -193,21 +200,26 @@ VectorRef DecoderLayerFusion::DefinePatternDecoderLayer(bool post_layernorm = tr add3 = VectorRef({is_add3, add2, tuple5}); } if (layernorm_fusion) { - layer_norm3 = DefineLayerNorm(add3, gamma3_, beta3_); - tuple3 = layer_norm3; - } else { - layer_norm3 = VectorRef({is_layernorm3_, add3, gamma3_, beta3_}); + auto layer_norm3 = VectorRef({is_layernorm3_, add3, gamma3_, beta3_}); auto is_tuple3 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_item3"); auto var_tuple3 = std::make_shared("var_tuple3"); tuple3 = VectorRef({is_tuple3, layer_norm3, var_tuple3}); + } else { + tuple3 = DefineLayerNorm(add3, gamma3_, beta3_); } auto is_matmul1 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimMatMulFusion), "is_matmul1"); MS_CHECK_TRUE_RET(is_matmul1 != nullptr, {}); - auto matmul1 = VectorRef({is_matmul1, tuple3, weight_m_, bias_m_}); - auto act = VectorRef({is_act_, matmul1}); auto is_matmul2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimMatMulFusion), "is_matmul2"); MS_CHECK_TRUE_RET(is_matmul2 != nullptr, {}); - auto matmul2 = VectorRef({is_matmul2, act, weight_p_, bias_p_}); + 
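// --- Editorial aside (illustration only, not part of the patch) ----------------------
// The branch below builds the FFN tail of the fusion pattern as matmul1 -> activation ->
// matmul2; in the position-bias (T5) case both MatMuls are bias-free. As a reference, a
// minimal standalone sketch of the computation that fused FFN performs is given here.
// The tanh-approximated GELU, the row-major layouts, and all names are assumptions for
// illustration only, not the MindSpore Lite API.
#include <cmath>
#include <vector>

static float GeluApprox(float x) {
  // tanh approximation of GELU, the activation the non-T5 pattern checks for via IsActGELU
  return 0.5f * x * (1.0f + std::tanh(0.7978845608f * (x + 0.044715f * x * x * x)));
}

// y = act(x * W1 (+ b1)) * W2 (+ b2); biases are skipped for the bias-free T5 variant.
// x: tokens x hidden, w1: hidden x ffn_hidden, w2: ffn_hidden x hidden (row-major).
std::vector<float> FfnReference(const std::vector<float> &x, int tokens, int hidden,
                                int ffn_hidden, const std::vector<float> &w1,
                                const std::vector<float> &b1, const std::vector<float> &w2,
                                const std::vector<float> &b2, bool has_bias) {
  std::vector<float> mid(tokens * ffn_hidden), out(tokens * hidden);
  for (int t = 0; t < tokens; ++t) {
    for (int j = 0; j < ffn_hidden; ++j) {
      float acc = has_bias ? b1[j] : 0.0f;
      for (int k = 0; k < hidden; ++k) acc += x[t * hidden + k] * w1[k * ffn_hidden + j];
      mid[t * ffn_hidden + j] = GeluApprox(acc);
    }
    for (int j = 0; j < hidden; ++j) {
      float acc = has_bias ? b2[j] : 0.0f;
      for (int k = 0; k < ffn_hidden; ++k) acc += mid[t * ffn_hidden + k] * w2[k * hidden + j];
      out[t * hidden + j] = acc;
    }
  }
  return out;
}
// --------------------------------------------------------------------------------------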
if (!is_position_bias) { + auto matmul1 = VectorRef({is_matmul1, tuple3, weight_m_, bias_m_}); + auto act = VectorRef({is_act_, matmul1}); + matmul2 = VectorRef({is_matmul2, act, weight_p_, bias_p_}); + } else { + auto matmul1 = VectorRef({is_matmul1, tuple3, weight_m_}); + auto act = VectorRef({is_act_, matmul1}); + matmul2 = VectorRef({is_matmul2, act, weight_p_}); + } auto is_reshape3 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-decoder3"); MS_CHECK_TRUE_RET(is_reshape3 != nullptr, {}); auto var3 = std::make_shared("var3"); @@ -258,9 +270,9 @@ std::unordered_map DecoderLayerFusion::DefinePatterns() MS_LOG(ERROR) << "initial member failed."; return patterns; } - patterns[kPatternDecoderLayerPre] = DefinePatternDecoderLayer(false, false, false); + patterns[kPatternDecoderLayerPre] = DefinePatternDecoderLayer(false, true, false); patterns[kPatternDecoderLayerPost] = DefinePatternDecoderLayer(true, false, false); - // std::cout << "patterns[kPatternDecoderLayer]" << patterns[kPatternDecoderLayer].ToString() << std::endl; + patterns[kPatternDecoderT5] = DefinePatternDecoderLayer(false, true, true); return patterns; } @@ -269,8 +281,10 @@ AnfNodePtr DecoderLayerFusion::Process(const std::string &pattern_name, const mi if (func_graph == nullptr || node == nullptr || equiv == nullptr) { return nullptr; } - std::cout << "found pattern " << pattern_name << std::endl; - if (pattern_name == kPatternDecoderLayerPre) { + if (pattern_name == kPatternDecoderT5) { + is_position_bias_ = true; + } + if (pattern_name == kPatternDecoderLayerPre || pattern_name == kPatternDecoderT5) { return CreateMaskedDecoderLayerFusionNode(func_graph, equiv, node, false); } else if (pattern_name == kPatternDecoderLayerPost) { return CreateMaskedDecoderLayerFusionNode(func_graph, equiv, node, true); @@ -313,10 +327,10 @@ AnfNodePtr DecoderLayerFusion::Process(const std::string &pattern_name, const mi // return true; // } STATUS DecoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const EquivPtr &equiv, int *head_num, - int *head_size, float *eps1, float *eps2, float *eps3, bool - *is_position_bias1, bool *is_position_bias2) const { - if ((*equiv)[is_attention_] == nullptr || !utils::isa((*equiv)[is_attention_])) { - printf("is_attention_ is not AnfNodePtr"); + int *head_size, float *eps1, float *eps2, float *eps3, bool *is_position_bias1, + bool *is_position_bias2) const { + if ((*equiv)[is_attention_] == nullptr || !utils::isa((*equiv)[is_attention_])) { + MS_LOG(ERROR) << "is_attention_ is not AnfNodePtr"; return RET_ERROR; } AnfNodePtr node = utils::cast((*equiv)[is_attention_]); @@ -351,7 +365,7 @@ STATUS DecoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const Eq *is_position_bias1 = attn_prim->get_position_bias(); } if ((*equiv)[is_attention_] == nullptr || !utils::isa((*equiv)[is_attention_])) { - printf("is_attention_ is not AnfNodePtr"); + MS_LOG(ERROR) << "is_attention_ is not AnfNodePtr"; return RET_ERROR; } AnfNodePtr cross_node = utils::cast((*equiv)[is_attention_]); @@ -381,7 +395,7 @@ STATUS DecoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const Eq } if ((*equiv)[is_layernorm1_] != nullptr) { if ((*equiv)[is_layernorm1_] == nullptr || !utils::isa((*equiv)[is_layernorm1_])) { - printf("is_layernorm1_ is not AnfNodePtr"); + MS_LOG(ERROR) << "is_layernorm1_ is not AnfNodePtr"; return RET_ERROR; } AnfNodePtr node_layrn1 = utils::cast((*equiv)[is_layernorm1_]); @@ -412,7 +426,7 @@ STATUS DecoderLayerFusion::CheckPattern(const FuncGraphPtr 
&func_graph, const Eq } if ((*equiv)[is_layernorm2_] != nullptr) { if ((*equiv)[is_layernorm2_] == nullptr || !utils::isa((*equiv)[is_layernorm2_])) { - printf("is_layernorm2_ is not AnfNodePtr"); + MS_LOG(ERROR) << "is_layernorm2_ is not AnfNodePtr"; return RET_ERROR; } AnfNodePtr node_layrn2 = utils::cast((*equiv)[is_layernorm2_]); @@ -470,7 +484,7 @@ STATUS DecoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const Eq // return false; // } return RET_OK; - } +} std::shared_ptr DecoderLayerFusion::CreatePrim(const FuncGraphPtr &func_graph, const EquivPtr &equiv, bool post_layernorm, int64_t ffn_hidden_size) const { @@ -491,7 +505,7 @@ std::shared_ptr DecoderLayerFusion::CreatePrim(const FuncGrap return nullptr; } // add eps3 - decoder_layer_prim->Init(head_num, head_size, eps1, eps2, eps3,ffn_hidden_size, is_position_bias1, is_position_bias2, + decoder_layer_prim->Init(head_num, head_size, eps1, eps2, eps3, ffn_hidden_size, is_position_bias1, is_position_bias2, post_layernorm); return decoder_layer_prim; } @@ -499,73 +513,42 @@ std::shared_ptr DecoderLayerFusion::CreatePrim(const FuncGrap CNodePtr DecoderLayerFusion::CreateMaskedDecoderLayerFusionNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv, const AnfNodePtr &node, bool post_layernorm = true) const { - std::cout << "CreateMaskedDecoderLayerFusionNode" << std::endl; MS_ASSERT(func_graph != nullptr); MS_ASSERT(equiv != nullptr); MS_ASSERT(node != nullptr); // bool is_position_bias = false; auto input = utils::cast((*equiv)[hidden_stats_]); MS_ASSERT(input != nullptr); - std::cout << "input" << std::endl; auto encoder_output = utils::cast((*equiv)[encoder_output_]); MS_ASSERT(encoder_output != nullptr); - std::cout << "encoder_output" << std::endl; - AnfNodePtr position_bias, input_mask, bias_attn_o, bias_attn_qkv, beta1, beta2, bias_m, bias_p, beta3; + AnfNodePtr position_bias, input_mask, bias_attn_o, bias_attn_qkv, beta1, beta2, bias_m, bias_p, beta3, + bias_attn_cross_qkv, bias_attn_cross_o, position_bias_cross; auto weight_qkv = utils::cast((*equiv)[weight_attn_qkv_]); - MS_ASSERT(weight_qkv != nullptr); - bias_attn_qkv = utils::cast((*equiv)[bias_attn_qkv_]); - bias_attn_o = utils::cast((*equiv)[bias_attn_o_]); - MS_ASSERT(weight_qkv != nullptr); - std::cout << "CreateMaskedDecoderLayerFusionNode" << std::endl; auto weight_attn_o = utils::cast((*equiv)[weight_attn_o_]); - MS_ASSERT(weight_attn_o != nullptr); - std::cout << "weight_attn_o" << std::endl; auto weight_attn_q = utils::cast((*equiv)[weight_attn_q_]); - MS_ASSERT(weight_attn_q != nullptr); auto weight_attn_kv = utils::cast((*equiv)[weight_attn_kv_]); - MS_ASSERT(weight_attn_kv != nullptr); auto weight_attn_cross_o = utils::cast((*equiv)[weight_attn_cross_o_]); - MS_ASSERT(weight_attn_cross_o != nullptr); - std::cout << "CreateMaskedDecoderLayerFusionNode" << std::endl; auto weight_m = utils::cast((*equiv)[weight_m_]); - MS_ASSERT(weight_m != nullptr); - std::cout << "weight_m" << std::endl; auto weight_p = utils::cast((*equiv)[weight_p_]); - MS_ASSERT(weight_p != nullptr); - std::cout << "weight_p" << std::endl; - auto bias_attn_cross_qkv = utils::cast((*equiv)[bias_attn_cross_qkv_]); - MS_ASSERT(bias_attn_cross_qkv != nullptr); - auto bias_attn_cross_o = utils::cast((*equiv)[bias_attn_cross_o_]); - MS_ASSERT(bias_attn_cross_o != nullptr); - bias_m = utils::cast((*equiv)[bias_m_]); - MS_ASSERT(bias_m != nullptr); - bias_p = utils::cast((*equiv)[bias_p_]); - MS_ASSERT(bias_p != nullptr); - beta1 = utils::cast((*equiv)[beta1_]); - MS_ASSERT(beta1 != 
nullptr); - std::cout << "beta1" << std::endl; - beta2 = utils::cast((*equiv)[beta2_]); - MS_ASSERT(beta2 != nullptr); - std::cout << "beta2" << std::endl; - beta3 = utils::cast((*equiv)[beta3_]); - MS_ASSERT(beta3 != nullptr); - std::cout << "beta3" << std::endl; + if (is_position_bias_) { + position_bias = utils::cast((*equiv)[position_bias_]); + position_bias_cross = utils::cast((*equiv)[position_bias_cross_]); + } else { + bias_attn_o = utils::cast((*equiv)[bias_attn_o_]); + bias_attn_qkv = utils::cast((*equiv)[bias_attn_qkv_]); + bias_attn_cross_qkv = utils::cast((*equiv)[bias_attn_cross_qkv_]); + bias_attn_cross_o = utils::cast((*equiv)[bias_attn_cross_o_]); + bias_m = utils::cast((*equiv)[bias_m_]); + bias_p = utils::cast((*equiv)[bias_p_]); + beta1 = utils::cast((*equiv)[beta1_]); + beta2 = utils::cast((*equiv)[beta2_]); + beta3 = utils::cast((*equiv)[beta3_]); + } auto gamma1 = utils::cast((*equiv)[gamma1_]); - MS_ASSERT(gamma1 != nullptr); - std::cout << "gamma1" << std::endl; auto gamma2 = utils::cast((*equiv)[gamma2_]); - MS_ASSERT(gamma2 != nullptr); - std::cout << "gamma2" << std::endl; auto gamma3 = utils::cast((*equiv)[gamma3_]); - MS_ASSERT(gamma3 != nullptr); - std::cout << "gamma3" << std::endl; - input_mask = utils::cast((*equiv)[mask_]); - MS_ASSERT(input_mask != nullptr); - std::cout << "input_mask" << std::endl; auto cross_mask = utils::cast((*equiv)[cross_mask_]); - MS_ASSERT(cross_mask != nullptr); - std::cout << "input_mask" << std::endl; auto base_shape_ptr = weight_m->Shape(); MS_EXCEPTION_IF_NULL(base_shape_ptr); auto input_shape_ptr = base_shape_ptr->cast(); @@ -573,47 +556,52 @@ CNodePtr DecoderLayerFusion::CreateMaskedDecoderLayerFusionNode(const FuncGraphP auto input_shape = input_shape_ptr->shape(); MS_ASSERT(input_shape != nullptr); int ffn_hidden_size = (int64_t)input_shape[1]; - std::cout << ffn_hidden_size << std::endl; auto decoder_layer_prim = CreatePrim(func_graph, equiv, post_layernorm, ffn_hidden_size); MS_CHECK_TRUE_RET(decoder_layer_prim != nullptr, nullptr); auto decoder_layer_prim_c = decoder_layer_prim->GetPrim(); MS_CHECK_TRUE_RET(decoder_layer_prim_c != nullptr, nullptr); auto value_node = NewValueNode(decoder_layer_prim_c); MS_CHECK_TRUE_RET(value_node != nullptr, nullptr); - std::cout << "value_node" << std::endl; - std::vector new_node_inputs = {value_node, - input, - gamma1, - beta1, - weight_qkv, - bias_attn_qkv, - input_mask, - weight_attn_o, - bias_attn_o, - gamma2, - beta2, - encoder_output, - weight_attn_q, - weight_attn_kv, - bias_attn_cross_qkv, - cross_mask, - weight_attn_cross_o, - bias_attn_cross_o, - gamma3, - beta3, - weight_m, - bias_m, - weight_p, - bias_p}; - - auto new_node = func_graph->NewCNode(new_node_inputs); - MS_CHECK_TRUE_RET(new_node != nullptr, nullptr); - auto old_node = node->cast(); - MS_CHECK_TRUE_RET(old_node->abstract() != nullptr, nullptr); - new_node->set_abstract(old_node->abstract()->Clone()); - new_node->set_fullname_with_scope(node->fullname_with_scope() + "/decoder_layer"); - std::cout << new_node->ToString() << std::endl; - - return new_node; + std::vector new_node_inputs; + if (is_position_bias_) { + new_node_inputs = { value_node, input, gamma1, weight_qkv, position_bias, + input_mask, weight_attn_o, gamma2, encoder_output, weight_attn_q, weight_attn_kv, position_bias_cross, cross_mask, + weight_attn_cross_o, gamma3, weight_m, weight_p + }; } +else { + new_node_inputs = {value_node, + input, + gamma1, + beta1, + weight_qkv, + bias_attn_qkv, + input_mask, + weight_attn_o, + bias_attn_o, + gamma2, 
+ beta2, + encoder_output, + weight_attn_q, + weight_attn_kv, + bias_attn_cross_qkv, + cross_mask, + weight_attn_cross_o, + bias_attn_cross_o, + gamma3, + beta3, + weight_m, + bias_m, + weight_p, + bias_p}; +} +auto new_node = func_graph->NewCNode(new_node_inputs); +MS_CHECK_TRUE_RET(new_node != nullptr, nullptr); +auto old_node = node->cast(); +MS_CHECK_TRUE_RET(old_node->abstract() != nullptr, nullptr); +new_node->set_abstract(old_node->abstract()->Clone()); +new_node->set_fullname_with_scope(node->fullname_with_scope() + "/decoder_layer"); + +return new_node; +} // namespace mindspore::opt } // namespace mindspore::opt \ No newline at end of file diff --git a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h index a09b7b7823f..070b72eb49d 100644 --- a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h +++ b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h @@ -56,6 +56,7 @@ protected: protected: const std::string kPatternDecoderLayerPre = "PatternDecoderLayerPre"; const std::string kPatternDecoderLayerPost = "PatternDecoderLayerPost"; + const std::string kPatternDecoderT5 = "PatternDecoderT5"; mutable VarPtr hidden_stats_{nullptr}; mutable VarPtr encoder_output_{nullptr}; mutable VarPtr position_bias_{nullptr}; @@ -90,6 +91,7 @@ protected: mutable VarPtr is_layernorm2_{nullptr}; mutable VarPtr is_layernorm3_{nullptr}; mutable VarPtr is_act_{nullptr}; + mutable bool is_position_bias_{nullptr}; }; } // namespace opt } // namespace mindspore diff --git a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc index c73d1da3d7f..0bb89fe03a7 100644 --- a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc @@ -83,7 +83,7 @@ VectorRef EncoderLayerFusion::getTuple(bool post_layernorm, bool layernorm_fusio if (post_layernorm) { return reshape1; } - if (layernorm_fusion) { + if (!layernorm_fusion) { return DefineLayerNorm(is_position_bias, reshape1, gamma1_, beta1_); } auto layer_norm = VectorRef({is_layernorm1_, reshape1, gamma1_, beta1_}); @@ -164,12 +164,12 @@ VectorRef EncoderLayerFusion::DefinePatternEncoderLayer(bool post_layernorm = tr auto is_add = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is_add"); auto add = VectorRef({is_add, reshape1, tuple}); if (layernorm_fusion) { - tuple2 = DefineLayerNorm(is_position_bias, add, gamma2_, beta2_); - } else { auto layer_norm2 = VectorRef({is_layernorm2_, add, gamma2_, beta2_}); auto is_tuple2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_item2"); auto var_tuple2 = std::make_shared("var_tuple2"); tuple2 = VectorRef({is_tuple2, layer_norm2, var_tuple2}); + } else { + tuple2 = DefineLayerNorm(is_position_bias, add, gamma2_, beta2_); } auto is_reshape2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-encoder2"); MS_CHECK_TRUE_RET(is_reshape2 != nullptr, {}); @@ -180,7 +180,7 @@ VectorRef EncoderLayerFusion::DefinePatternEncoderLayer(bool post_layernorm = tr if (is_position_bias) { reshape2 = VectorRef({is_reshape2, add, var2}); matmul1 = VectorRef({is_matmul1, tuple2, weight_m_}); - } else if (post_layernorm || layernorm_fusion) { + } else if (post_layernorm || !layernorm_fusion) { reshape2 = VectorRef({is_reshape2, tuple2, var2}); matmul1 = VectorRef({is_matmul1, tuple2, weight_m_, bias_m_}); } else { @@ -199,7 +199,7 @@ VectorRef 
EncoderLayerFusion::DefinePatternEncoderLayer(bool post_layernorm = tr auto reshape3 = VectorRef({is_reshape3, matmul2, var3}); auto is_add3 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is_add3"); auto add3 = VectorRef({is_add3, reshape2, reshape3}); - if (!post_layernorm || layernorm_fusion) { + if (!post_layernorm || !layernorm_fusion) { return add3; } auto is_reshape4 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-encoder"); @@ -208,12 +208,12 @@ VectorRef EncoderLayerFusion::DefinePatternEncoderLayer(bool post_layernorm = tr MS_CHECK_TRUE_RET(var4 != nullptr, {}); auto reshape4 = VectorRef({is_reshape4, add3, var4}); if (layernorm_fusion) { - tuple3 = DefineLayerNorm(is_position_bias, reshape4, gamma1_, beta1_); - } else { auto layer_norm = VectorRef({is_layernorm1_, reshape4, gamma1_, beta1_}); auto is_tuple3 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_item3"); auto var_tuple3 = std::make_shared("var_tuple3"); tuple3 = VectorRef({is_tuple3, layer_norm, var_tuple3}); + } else { + tuple3 = DefineLayerNorm(is_position_bias, reshape4, gamma1_, beta1_); } auto is_reshape5 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-encoder"); MS_CHECK_TRUE_RET(is_reshape5 != nullptr, {}); @@ -233,7 +233,7 @@ std::unordered_map EncoderLayerFusion::DefinePatterns() patterns[kPatternTEncoderLayerPost] = DefinePatternEncoderLayer(true); patterns[kPatternTEncoderLayerPostNorm] = DefinePatternEncoderLayer(true, true); patterns[kPatternTEncoderLayerPreNorm] = DefinePatternEncoderLayer(false, true); - patterns[kPatternEncoderLayerT5] = DefinePatternEncoderLayer(false, true, true); + patterns[kPatternEncoderLayerT5] = DefinePatternEncoderLayer(false, false, true); return patterns; } @@ -242,6 +242,8 @@ AnfNodePtr EncoderLayerFusion::Process(const std::string &pattern_name, const mi if (func_graph == nullptr || node == nullptr || equiv == nullptr) { return nullptr; } + if (pattern_name == kPatternTEncoderLayerPostNorm || pattern_name == kPatternTEncoderLayerPreNorm) + is_layernorm_fusion_ = true; if (pattern_name == kPatternTEncoderLayerPost || pattern_name == kPatternTEncoderLayerPostNorm) { return CreateMaskedEncoderLayerFusionNode(func_graph, equiv, node, true); } else if (pattern_name == kPatternTEncoderLayerPre || pattern_name == kPatternTEncoderLayerPreNorm) { @@ -308,18 +310,22 @@ STATUS EncoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const Eq if (attn_prim->GetAttr(ops::kPositionBias) != nullptr) { is_position_bias_ = attn_prim->get_position_bias(); } - auto layrn1_input = GetAttribute(func_graph, equiv, is_layernorm1_); - auto layrn1_prim = ops::GetOperator(layrn1_input); - if (layrn1_prim->GetAttr(ops::kEpsilon) != nullptr) { - *eps1 = layrn1_prim->get_epsilon(); - } - auto layrn2_input = GetAttribute(func_graph, equiv, is_layernorm2_); - auto layrn2_prim = ops::GetOperator(layrn2_input); - if (layrn2_prim->GetAttr(ops::kEpsilon) != nullptr) { - *eps2 = layrn2_prim->get_epsilon(); + if (is_layernorm_fusion_) { + auto layrn1_input = GetAttribute(func_graph, equiv, is_layernorm1_); + auto layrn1_prim = ops::GetOperator(layrn1_input); + if (layrn1_prim->GetAttr(ops::kEpsilon) != nullptr) { + *eps1 = layrn1_prim->get_epsilon(); + } + auto layrn2_input = GetAttribute(func_graph, equiv, is_layernorm2_); + auto layrn2_prim = ops::GetOperator(layrn2_input); + if (layrn2_prim->GetAttr(ops::kEpsilon) != nullptr) { + *eps2 = layrn2_prim->get_epsilon(); + } } - if (!IsActGELU(func_graph, 
equiv, is_act_)) { - return RET_ERROR; + if (!is_position_bias_) { + if (!IsActGELU(func_graph, equiv, is_act_)) { + return RET_ERROR; + } } return RET_OK; } diff --git a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h index 4f05e809d31..bef47951ee7 100644 --- a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h +++ b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h @@ -83,6 +83,7 @@ class EncoderLayerFusion : public MultiplePatternProcessPass { mutable VarPtr is_layernorm1_{nullptr}; mutable VarPtr is_layernorm2_{nullptr}; mutable bool is_position_bias_{false}; + mutable bool is_layernorm_fusion_{false}; mutable VarPtr is_act_{nullptr}; }; } // namespace opt diff --git a/trc/transformer/MultiHeadTester.py b/trc/transformer/MultiHeadTester.py old mode 100644 new mode 100755 diff --git a/trc/transformer/ftBench.py b/trc/transformer/ftBench.py index f80c7c47d93..c376fc63678 100755 --- a/trc/transformer/ftBench.py +++ b/trc/transformer/ftBench.py @@ -125,7 +125,7 @@ for line_model_arg in models_arg: os.system(f"ssh {server} 'cd {system}/.. && tar -xzf {system}/../mindspore-lite-{version}-linux-x64.tar.gz'") os.system(f"rsync -v {base}/trc/transformer/*{model_name}* {server}:{base}/trc/transformer/") os.system(f"./deploy.sh convv_{model_name}_fwd.mindir") - os.system(f"ssh {server} 'cd {benchmark} && CUDA_VISIBLE_DEVICES={cuda_visible_dev} LD_LIBRARY_PATH={system}/runtime/lib:{system}/tools/converter/lib ./benchmark {benchmark_args}'" ) + # os.system(f"ssh {server} 'cd {benchmark} && CUDA_VISIBLE_DEVICES={cuda_visible_dev} LD_LIBRARY_PATH={system}/runtime/lib:{system}/tools/converter/lib ./benchmark {benchmark_args}'" ) elif app=='trc': #if loop count =1 app=be else app = runtime diff --git a/trc/transformer/get_output_by_mindir.py b/trc/transformer/get_output_by_mindir.py old mode 100644 new mode 100755 diff --git a/trc/transformer/models.txt b/trc/transformer/models.txt index dfbfe3a50c4..78d4825695e 100755 --- a/trc/transformer/models.txt +++ b/trc/transformer/models.txt @@ -10,9 +10,9 @@ #-b 8 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer #-b 32 -l 12 -H 12 -S 768 -s 128 -P 0 -f 3072 -m bert #-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -f 3072 -m bert - --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -m transformer_decoder_layer - +#-b 64 -l 24 -H 16 -S 1024 -s 512 -m bert +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -m transformer_encoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -m transformer_decoder_layer #-b 1 -l 66 -s 20 -H 3 -S 15 -p 0 -m mha_x1 #-b 1 -l 24 -H 16 -S 1024 -s 128 -P 1 -m bert #-b 8 -l 24 -H 16 -S 1024 -s 128 -P 1 -m bert diff --git a/trc/transformer/t.config b/trc/transformer/t.config index f26391171a8..0ecc92cc5ec 100755 --- a/trc/transformer/t.config +++ b/trc/transformer/t.config @@ -1,3 +1,4 @@ [registry] #fusion_blacklists="MultiHeadAttentionFusion" -fusion_blacklists="EncoderLayerFusion", "DecoderLayerFusion" +#fusion_blacklists="EncoderLayerFusion" +#fusion_blacklists="DecoderLayerFusion" diff --git a/trc/transformer/test_tr.py b/trc/transformer/test_tr.py old mode 100644 new mode 100755 diff --git a/trc/transformer/train_transformer_export.py b/trc/transformer/train_transformer_export.py index 7d4d1ef2c54..4625aa28edf 100755 --- a/trc/transformer/train_transformer_export.py +++ b/trc/transformer/train_transformer_export.py @@ -340,6 +340,78 @@ def transformer_encoder_layer_create(): elif app=="trc": saveT(y, name + "_output1.fp" + suffix) +def 
transformer_encoder_layer_t5_create(): + post_layernorm=False + name = "transformer_encoder_layer_T5" + model = T5_TF.TransformerEncoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, seq_length=seq, + num_heads=head_num, post_layernorm_residual=post_layernorm, has_bias=False) + encoder_input_value = M.Tensor(np.random.normal(0., 0.5, (batch, seq, hid_size)), M.float32) + encoder_input_mask = M.Tensor(np.random.normal(0., 0.5, (batch, seq, seq)), M.float32) + pos = M.Tensor(np.random.normal(0., 0.5, (batch, head_num, seq, tgt_seq_len)), M.float32) + + # q = model.attention.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size + # k = model.attention.dense2.weight.asnumpy()#.transpose() + # v = model.attention.dense3.weight.asnumpy()#.transpose() + + # w = np.concatenate((q, k, v)) # 3xhid_size x hid_size + # w = w.transpose() # hid_size x 3xhid_size + # wt = M.Tensor(w, w_compute_type) + # bq = model.attention.dense1.bias.asnumpy() + # bk = model.attention.dense2.bias.asnumpy() + # bv = model.attention.dense3.bias.asnumpy() + # bw = np.concatenate((bq, bk, bv)) #(3xhid) X 1 + # bt =M.Tensor(bw, w_compute_type) + # wp = model.attention.projection.weight + # bp = model.attention.projection.bias + # omw = model.output.mapping.weight + # opw = model.output.projection.weight + # omb = model.output.mapping.bias + # opb = model.output.projection.bias + # gl1 = model.layernorm1.gamma + # bl1 = model.layernorm1.beta + # gl2 = model.layernorm2.gamma + # bl2 = model.layernorm2.beta + + suffix = str(compute_type) + suffix = suffix[-2:] + saveT(encoder_input_value, name + "_input1.fp" + suffix) + saveT(encoder_input_mask, name + "_input2.fp" + suffix) + saveT(pos, name + "_input3.fp" + suffix) + # saveT(gl1, name + "_weight1.fp" + suffix) + # saveT(bl1, name + "_weight2.fp" + suffix) + # saveT(wt, name + "_weight3.fp" + suffix) + # saveT(bt, name + "_weight4.fp" + suffix) + # saveT(wp, name + "_weight5.fp" + suffix) + # saveT(bp, name + "_weight6.fp" + suffix) + # saveT(gl2, name + "_weight7.fp" + suffix) + # saveT(bl2, name + "_weight8.fp" + suffix) + # if ffn_fp16 == True: + # saveTensorToHalf(omw, name + "_weight9.fp" + "16") + # saveTensorToHalf(omb, name + "_weight10.fp" + "16") + # saveTensorToHalf(opw, name + "_weight11.fp" + "16") + # else: + # saveT(omw, name + "_weight9.fp" + suffix) + # saveT(omb, name + "_weight10.fp" + suffix) + # saveT(opw, name + "_weight11.fp" + suffix) + # saveT(opb, name + "_weight12.fp" + suffix) + _cell_graph_executor.compile(model, + encoder_input_value, + encoder_input_mask, + pos) + y = model(encoder_input_value, encoder_input_mask, position_bias = pos) + + export(model, encoder_input_value, encoder_input_mask, pos, file_name= name + "_fwd", file_format='MINDIR') + if app=="ch": + f_y=open(f'./{name}_output.txt','w') + out_name='output1' + print("name output:",out_name) + saveCalib(out_name, np.array(y), f_y) + print("y.shape",np.array(y).shape) + # saveCalib('Default/Add-op267', np.array(y), f_y)#2 dims + + elif app=="trc": + saveT(y, name + "_output1.fp" + suffix) + @@ -348,17 +420,18 @@ def transformer_decoder_layer_t5_create(): if (post_layernorm): print("post_layernorm true") model = T5_TF.TransformerDecoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, - tgt_seq_length=tgt_seq_len,num_heads=head_num, post_layernorm_residual=True, use_past=False) + tgt_seq_length=tgt_seq_len,num_heads=head_num, post_layernorm_residual=True, use_past=False, has_bias=False) else: 
print("post_layernorm false") model = T5_TF.TransformerDecoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, - tgt_seq_length=tgt_seq_len,num_heads=head_num,use_past=False) + tgt_seq_length=tgt_seq_len,num_heads=head_num,use_past=False, has_bias=False) hidden_stats = M.Tensor(np.random.normal(0., 0.5, (batch, tgt_seq_len, hid_size)), M.float32) decoder_mask = M.Tensor(np.random.normal(0., 0.5, (batch, seq, seq)), M.float32) encoder_output = M.Tensor(np.random.normal(0., 0.5, (batch, seq, hid_size)), M.float32) memory_mask = M.Tensor(np.random.normal(0., 0.5, (batch, tgt_seq_len,seq)), M.float32) pos = M.Tensor(np.random.normal(0., 0.5, (batch, head_num, seq, tgt_seq_len)), M.float32) encoder_pos = M.Tensor(np.random.normal(0., 0.5, (batch, head_num, seq, tgt_seq_len)), M.float32) + q = model.attention.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size k = model.attention.dense2.weight.asnumpy()#.transpose() v = model.attention.dense3.weight.asnumpy()#.transpose() @@ -366,12 +439,6 @@ def transformer_decoder_layer_t5_create(): w = np.concatenate((q, k, v)) # 3xhid_size x hid_size w = w.transpose() # hid_size x 3xhid_size wt = M.Tensor(w, w_compute_type) - bq = model.attention.dense1.bias.asnumpy() - bk = model.attention.dense2.bias.asnumpy() - bv = model.attention.dense3.bias.asnumpy() - bw = np.concatenate((bq, bk, bv)) #(3xhid) X 1 - bt =M.Tensor(bw, w_compute_type) - print('encoder_output=',encoder_output) wp = model.attention.projection.weight bp = model.attention.projection.bias @@ -382,111 +449,51 @@ def transformer_decoder_layer_t5_create(): w2 = np.concatenate((k2, v2)) # 3xhid_size x hid_size w2 = w.transpose() # hid_size x 3xhid_size wt2 = M.Tensor(w2, w_compute_type) - bq2 = model.cross_attention.dense1.bias.asnumpy() - bk2 = model.cross_attention.dense2.bias.asnumpy() - bv2 = model.cross_attention.dense3.bias.asnumpy() - bw2 = np.concatenate((bq2, bk2, bv2)) #(3xhid) X 1 - bt2 =M.Tensor(bw2, w_compute_type) wp2 = model.cross_attention.projection.weight - bp2 = model.cross_attention.projection.bias omw = model.output.mapping.weight opw = model.output.projection.weight - omb = model.output.mapping.bias - opb = model.output.projection.bias gl1 = model.layernorm1.gamma - bl1 = model.layernorm1.beta + # bl1 = model.layernorm1.beta gl2 = model.layernorm2.gamma - bl2 = model.layernorm2.beta + # bl2 = model.layernorm2.beta gl3 = model.cross_attention_layernorm.gamma - bl3 = model.cross_attention_layernorm.beta - suffix = str(compute_type) - suffix = suffix[-2:] + # bl3 = model.cross_attention_layernorm.beta + # suffix = str(compute_type) + # suffix = suffix[-2:] - print('qt2=',qt2[0]) - # saveT(gl1, name + "_weight1.fp" + suffix) + saveT(gl1, name + "_weight1.fp" + suffix) # saveT(bl1, name + "_weight2.fp" + suffix) - # saveT(wt, name + "_weight3.fp" + suffix) - # saveT(bt, name + "_weight4.fp" + suffix) - # saveT(wp, name + "_weight5.fp" + suffix) - # saveT(bp, name + "_weight6.fp" + suffix) - # saveT(gl2, name + "_weight7.fp" + suffix) + saveT(wt, name + "_weight2.fp" + suffix) + saveT(wp, name + "_weight3.fp" + suffix) + saveT(gl2, name + "_weight4.fp" + suffix) # saveT(bl2, name + "_weight8.fp" + suffix) - # saveT(qt2, name + "_weight9.fp" + suffix) - # saveT(wt2, name + "_weight10.fp" + suffix) - # saveT(bt2, name + "_weight11.fp" + suffix) - # saveT(wp2, name + "_weight12.fp" + suffix) - # saveT(bp2, name + "_weight13.fp" + suffix) - # saveT(gl3, name + "_weight14.fp" + suffix) - # saveT(bl3, name + "_weight15.fp" + 
suffix) - # saveT(omw, name + "_weight16.fp" + suffix) - # saveT(omb, name + "_weight17.fp" + suffix) - # saveT(opw, name + "_weight18.fp" + suffix) - # saveT(opb, name + "_weight19.fp" + suffix) - + saveT(qt2, name + "_weight5.fp" + suffix) + saveT(wt2, name + "_weight6.fp" + suffix) + saveT(wp2, name + "_weight7.fp" + suffix) + saveT(gl3, name + "_weight8.fp" + suffix) + if(ffn_fp16): + saveT(omw, name + "_weight9.fp" + "16") + saveT(opw, name + "_weight10.fp" + "16") + else: + saveT(omw, name + "_weight9.fp" + suffix) + saveT(opw, name + "_weight10.fp" + suffix) suffix = str(compute_type) suffix = suffix[-2:] saveT(hidden_stats, name + "_input1.fp" + suffix) saveT(decoder_mask, name + "_input2.fp" + suffix) saveT(encoder_output, name + "_input3.fp" + suffix) saveT(memory_mask, name + "_input4.fp" + suffix) - - saveT(gl1, name + "_weight1.fp" + suffix) - saveT(bl1, name + "_weight2.fp" + suffix) - saveT(wt, name + "_weight3.fp" + suffix) - saveT(bt, name + "_weight4.fp" + suffix) - saveT(wp, name + "_weight5.fp" + suffix) - saveT(bp, name + "_weight6.fp" + suffix) - saveT(gl2, name + "_weight7.fp" + suffix) - saveT(bl2, name + "_weight8.fp" + suffix) - saveT(qt2, name + "_weight9.fp" + suffix) - saveT(wt2, name + "_weight10.fp" + suffix) - saveT(bt2, name + "_weight11.fp" + suffix) - saveT(wp2, name + "_weight12.fp" + suffix) - saveT(bp2, name + "_weight13.fp" + suffix) - saveT(gl3, name + "_weight14.fp" + suffix) - saveT(bl3, name + "_weight15.fp" + suffix) - if(ffn_fp16): - saveTensorToHalf(omw, name + "_weight16.fp" + "16") - saveTensorToHalf(omb, name + "_weight17.fp" + "16") - saveTensorToHalf(opw, name + "_weight18.fp" + "16") - else: - saveT(omw, name + "_weight16.fp" + suffix) - saveT(omb, name + "_weight17.fp" + suffix) - saveT(opw, name + "_weight18.fp" + suffix) - saveT(opb, name + "_weight19.fp" + suffix) - # # if app == 'trc': - # # saveTensorToHalf(omw, name + "_weight9.fp" + "16") - # # saveTensorToHalf(omb, name + "_weight10.fp" + "16") - # # saveTensorToHalf(opw, name + "_weight11.fp" + "16") - # # elif app == 'ch': - # saveT(qt2, name + "_weight9.fp" + suffix) - # saveT(wt2, name + "_weight10.fp" + suffix) - # saveT(bt2, name + "_weight11.fp" + suffix) - # saveT(wp2, name + "_weight12.fp" + suffix) - # saveT(bp2, name + "_weight13.fp" + suffix) - # saveT(gl3, name + "_weight14.fp" + suffix) - # saveT(bl3, name + "_weight15.fp" + suffix) - # saveT(omw, name + "_weight16.fp" + suffix) - # saveT(omb, name + "_weight17.fp" + suffix) - # saveT(opw, name + "_weight18.fp" + suffix) - # saveT(opb, name + "_weight19.fp" + suffix) - _cell_graph_executor.compile(model, hidden_stats, decoder_mask, encoder_output, memory_mask)#, pos, encoder_pos) - y = model(hidden_stats, decoder_mask, encoder_output, memory_mask)#, position_bias=pos, encoder_decoder_position_bias = encoder_pos) - export(model, hidden_stats, decoder_mask, encoder_output, memory_mask, file_name= name + "_fwd", file_format='MINDIR') - # if app=="ch": + saveT(pos, name + "_input5.fp" + suffix) + saveT(encoder_pos, name + "_input6.fp" + suffix) + _cell_graph_executor.compile(model, hidden_stats, decoder_mask, encoder_output, memory_mask, pos, encoder_pos) + y = model(hidden_stats, decoder_mask, encoder_output, memory_mask , position_bias=pos, encoder_decoder_position_bias = encoder_pos) + export(model, hidden_stats, decoder_mask, encoder_output, memory_mask, pos, encoder_pos, file_name= name + "_fwd", file_format='MINDIR') f_y=open(f'./{name}_output.txt','w') - # # out_name=get_output_encoder_layer(name + "_fwd.mindir") 
- # # print("name output:",out_name) saveCalib("output1", np.array(y), f_y)#2 dims - # # print("y.shpae",np.array(y).shape) - # # saveCalib('Default/Add-op267', y, f_y)#2 dims f_y.close() - # # saveCalib('Default/Reshape-op296', np.array(y), f_y)#2 dims - # # elif app=="trc": saveT(y, name + "_output1.fp" + suffix) - def transformer_decoder_layer_create(): name = "transformer_decoder_layer" if (post_layernorm): @@ -1078,6 +1085,7 @@ def main(): for i in range(len(sys.argv)): if sys.argv[i]=='-m': model_name=sys.argv[i+1] + print("%s_create()" % model_name) eval("%s_create()" % model_name) if __name__ == "__main__": -- Gitee From 2628e9c85d5ee1abea191abaa5665f1c9f3b5053 Mon Sep 17 00:00:00 2001 From: batya kroizer Date: Tue, 3 Jan 2023 18:11:55 +0200 Subject: [PATCH 12/39] for merge --- .../delegate/tensorrt/op/decoder_tensorrt.cc | 21 +- .../delegate/tensorrt/op/decoder_tensorrt.h | 2 +- .../delegate/tensorrt/op/encoder_tensorrt.cc | 1 - .../001-fast_transformer.patch | 1350 +++++++---------- trc/transformer/T5/transformer.py | 44 +- trc/transformer/cfg_bert.config | 2 +- trc/transformer/deploy.sh | 15 +- trc/transformer/models.txt | 4 +- trc/transformer/train_transformer_export.py | 63 +- 9 files changed, 618 insertions(+), 884 deletions(-) diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc index 3dc2c8aa38f..849871b73a6 100755 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc @@ -112,7 +112,6 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.ffn_hidden_size = decoder_op->get_ffn_hidden_size(); params.ffn_fp16 = is_ffn_fp16_; params.cublas_handle=GetCublasHandle(); - params.projection_bias = true; params.attn1.head_num = params.head_num; @@ -120,7 +119,7 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.attn1.hidden_size = params.hidden_size; params.attn1.position_bias = decoder_op->get_position_bias1(); params.attn1.qkv_bias = !params.attn1.position_bias; - params.attn1.projection_bias = false; + params.attn1.projection_bias = true; params.attn1.is_cross = false; params.attn1.cublas_handle=GetCublasHandle(); @@ -129,14 +128,14 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.attn2.hidden_size = params.hidden_size; params.attn2.position_bias = decoder_op->get_position_bias2(); params.attn2.qkv_bias = !params.attn2.position_bias; - params.attn2.projection_bias = false; + params.attn2.projection_bias = true; params.attn2.is_cross = true; params.attn2.cublas_handle=GetCublasHandle(); auto compute_type = runtime_->GetRuntimePrecisionMode(); if (is_ffn_fp16_) { - size_t start_fp16 = C15NUM; - size_t end_fp16 = C19NUM; + size_t start_fp16 = C18NUM; + size_t end_fp16 = C22NUM; for (size_t i = 0; i < in_tensors_.size(); i++) { auto in_tensor = input(ctx, i); if (in_tensors_[i].IsConst() || in_tensor.trt_tensor_ == nullptr) { @@ -197,14 +196,10 @@ int DecoderPlugin::RunCudaDecoder(const nvinfer1::PluginTensorDesc *inputDesc, params_.attn1.algo = algoId; params_.attn2.stream = stream; params_.attn2.algo = algoId; - void *inputs_forward[] = { - const_cast(inputs[0]), const_cast(inputs[1]), const_cast(inputs[2]), - const_cast(inputs[3]), const_cast(inputs[4]), const_cast(inputs[5]), - const_cast(inputs[6]), const_cast(inputs[7]), const_cast(inputs[8]), - const_cast(inputs[9]), const_cast(inputs[10]), const_cast(inputs[11]), - const_cast(inputs[12]), 
const_cast(inputs[13]), const_cast(inputs[14]), const_cast(inputs[15]), - const_cast(inputs[16]), const_cast(inputs[17]), const_cast(inputs[18]), - const_cast(inputs[19]),const_cast(inputs[20]), const_cast(inputs[21]), const_cast(inputs[22])}; + void *inputs_forward[num_of_inputs_]; + for (int i=0; i < num_of_inputs_; i++){ + inputs_forward[i]=const_cast(inputs[i]); + } void *outputs_forward[] = {outputs[0]}; fastertransformer::forwardDecoder(inputs_forward, num_of_inputs_, outputs_forward, num_of_outputs_, ¶ms_, workspace); diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h index a7006edad2d..5060992be89 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h @@ -41,7 +41,7 @@ class DecoderTensorRT : public TensorRTOp { private: nvinfer1::ITensor *castTensor(TensorRTContext *ctx, const TensorInfo &ms_tensor, const std::string &op_name); - bool is_ffn_fp16_ = false; + bool is_ffn_fp16_ = true; }; constexpr auto DECODER_PLUGIN_NAME{"DecoderPlugin"}; diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc index eebd08e7656..ccec39912b6 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc @@ -106,7 +106,6 @@ int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.ffn_hidden_size = encoder_op->get_ffn_hidden_size(); params.ffn_fp16 = is_ffn_fp16_; params.cublas_handle = GetCublasHandle(); - params.projection_bias = !encoder_op->get_position_bias(); params.hidden_size = params.head_num * params.head_size; params.attn.head_num = encoder_op->get_head_num(); diff --git a/third_party/patch/fast_transformer/001-fast_transformer.patch b/third_party/patch/fast_transformer/001-fast_transformer.patch index 05e23c55150..46273d0b49e 100644 --- a/third_party/patch/fast_transformer/001-fast_transformer.patch +++ b/third_party/patch/fast_transformer/001-fast_transformer.patch @@ -351,11 +351,7 @@ index a60983c..45b5374 100644 diff --git a/deploy.sh b/deploy.sh new file mode 100755 -<<<<<<< HEAD -index 0000000..0e60c1a -======= -index 0000000..ac54401 ->>>>>>> origin/bert +index 0000000..63c6473 --- /dev/null +++ b/deploy.sh @@ -0,0 +1,32 @@ @@ -459,10 +455,10 @@ index 0000000..33e562b +endif() diff --git a/examples/cpp/ms/initialize.h b/examples/cpp/ms/initialize.h new file mode 100644 -index 0000000..06ec2b2 +index 0000000..bde4c3f --- /dev/null +++ b/examples/cpp/ms/initialize.h -@@ -0,0 +1,788 @@ +@@ -0,0 +1,898 @@ +#pragma once + +#include "src/fastertransformer/layers/ms_layers/MSLayerWeight.h" @@ -530,6 +526,7 @@ index 0000000..06ec2b2 + MHA_T5_CROSS, // AttnIn + EncOut + AttnMAsk + position_bias + TEL, // transformer encoder layer + TDL, ++ TDL_T5, +} MODEL_TEST_ID_E; + +int ModelNum(std::string model_name) @@ -552,9 +549,12 @@ index 0000000..06ec2b2 + else if (model_name == "transformer_encoder_layer") { + return TEL; + } -+ else if (model_name == "transformer_decoder_layer" ||model_name == "transformer_decoder_layer_t5") { ++ else if (model_name == "transformer_decoder_layer") { + return TDL; + } ++ else if (model_name == "transformer_decoder_layer_t5") { ++ return TDL_T5; ++ } + else { + return -1; + } @@ -1099,6 +1099,97 @@ index 0000000..06ec2b2 + desc.w_tensors.push_back(Tensor{MEMORY_GPU, 
getTensorType(), std::vector{ + opt_a->hidden_size}, 0});//bp +} ++template ++void InitializeDecoderT5(opt_arg* opt_a, ++ DecriptorDecoderLayer& desc, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ cublasHandle_t* cublas_handle, ++ Allocator* allocator) ++{ ++ const size_t hidden_units = opt_a->head_num * opt_a->size_per_head; ++ std::cout<<"hidden_units: "< ++ desc.Decoder = new MSDLayer(opt_a->batch_size, ++ opt_a->seq_len, ++ opt_a->tgt_seq_len, ++ opt_a->head_num, ++ opt_a->size_per_head, ++ opt_a->ffn_hidden_size, ++ opt_a->eps1, ++ opt_a->eps2, ++ opt_a->eps3, ++ opt_a->post_layernorm_residual, ++ opt_a->position_bias1, ++ opt_a->position_bias2, ++ opt_a->is_ffn_fp16, ++ stream, ++ cublas_wrapper, ++ cublas_handle, ++ allocator, ++ false, // free buffer after fwd ++ true, // is_qk_buf_float_ ++ false); // sparse ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU,getTensorType(),std::vector{opt_a->batch_size, opt_a->tgt_seq_len, opt_a->hidden_size},0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->seq_len}, 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->tgt_seq_len, opt_a->seq_len}, 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, 0}); ++ ++ ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU,getTensorType(),std::vector{opt_a->batch_size, opt_a->tgt_seq_len, opt_a->hidden_size},0}); ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->seq_len}, 0}); ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->tgt_seq_len, opt_a->seq_len}, 0}); ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, 0}); ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, 0}); ++ ++ // desc.output_tensors.push_back(Tensor{ ++ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ ++ // desc.output_python_tensors.push_back(Tensor{ ++ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ ++ desc.output_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{640/4}, 0}); ++ ++ desc.output_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{640/4}, 0}); ++ ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size}, 0}); //G1 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units, 3 * hidden_units}, 0});//wt ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units, 
hidden_units}, 0});//wp ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size}, 0});//g1 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units, hidden_units}, 0}); ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units , hidden_units * 2}, 0}); ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units, hidden_units}, 0});//wp2 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size}, 0});//g3 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size, opt_a->ffn_hidden_size}, 0});//wm ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size, opt_a->ffn_hidden_size}, 0});;//wp ++} + +template +void Init(opt_arg* opt_a, @@ -1159,6 +1250,9 @@ index 0000000..06ec2b2 + case TDL: + InitializeDecoder(opt_a, desc, stream, cublas_wrapper, cublas_handle, allocator); + break; ++ case TDL_T5: ++ InitializeDecoderT5(opt_a, desc, stream, cublas_wrapper, cublas_handle, allocator); ++ break; + default: + break; + } @@ -1247,13 +1341,25 @@ index 0000000..06ec2b2 + decoder_weights.decoder_output_projection.kernel = (const T*)w_tensors[17].data; + decoder_weights.decoder_output_projection.bias = (const T*)w_tensors[18].data; + } ++ else if (modelId == TDL_T5) { ++ decoder_weights.layernorm1.gamma = (const T*)w_tensors[0].data; ++ decoder_weights.attention.query_weight.kernel = (const T*)w_tensors[1].data; ++ decoder_weights.attention.attention_output_weight.kernel = (const T*)w_tensors[2].data; ++ decoder_weights.layernorm2.gamma = (const T*)w_tensors[3].data; ++ decoder_weights.cross_attention.query_weight.kernel = (const T*)w_tensors[4].data; ++ decoder_weights.cross_attention.key_weight.kernel = (const T*)w_tensors[5].data; ++ decoder_weights.cross_attention.attention_output_weight.kernel = (const T*)w_tensors[6].data; ++ decoder_weights.layernorm3.gamma = (const T*)w_tensors[7].data; ++ decoder_weights.decoder_output_mapping.kernel = (const T*)w_tensors[8].data; ++ decoder_weights.decoder_output_projection.kernel = (const T*)w_tensors[9].data; ++ } + else { + // return ERROR illegal model ! 
+ } +} diff --git a/examples/cpp/ms/ms.cc b/examples/cpp/ms/ms.cc new file mode 100644 -index 0000000..3f35c7b +index 0000000..e1d2daa --- /dev/null +++ b/examples/cpp/ms/ms.cc @@ -0,0 +1,670 @@ @@ -1796,7 +1902,7 @@ index 0000000..3f35c7b + desc.Encoder->forward(&desc.output_tensors, &desc.input_tensors, &encoder_weights); + + CompareOutput(desc.output_python_tensors, desc.output_tensors); -+#define DO_TIME ++// #define DO_TIME +#ifdef DO_TIME + // warmup + for (int i = 0; i < 10; i++) { @@ -2034,11 +2140,7 @@ index 7ff8e0f..e1be64c 100644 template void invokeAddBias(float* out, const float* bias, const int m, const int n, cudaStream_t stream); diff --git a/src/fastertransformer/kernels/add_residual_kernels.cu b/src/fastertransformer/kernels/add_residual_kernels.cu -<<<<<<< HEAD -index 4cd9f0f..7e4c4b3 100644 -======= -index 4cd9f0f..1bf2be3 100644 ->>>>>>> origin/bert +index 4cd9f0f..42c9216 100644 --- a/src/fastertransformer/kernels/add_residual_kernels.cu +++ b/src/fastertransformer/kernels/add_residual_kernels.cu @@ -29,6 +29,30 @@ __global__ void addBiasResidual(T* output, const T* input, const T* bias, const @@ -2104,11 +2206,7 @@ index 4cd9f0f..1bf2be3 100644 template __global__ void addBiasAttentionFfnResidual(T* block_output, const T* ffn_output, -<<<<<<< HEAD -@@ -88,12 +137,8 @@ void invokeAddBiasAttentionFfnResidual(T* block_output, -======= -@@ -88,11 +114,9 @@ void invokeAddBiasAttentionFfnResidual(T* block_output, ->>>>>>> origin/bert +@@ -88,11 +137,9 @@ void invokeAddBiasAttentionFfnResidual(T* block_output, } } @@ -2123,13 +2221,7 @@ index 4cd9f0f..1bf2be3 100644 #ifdef ENABLE_BF16 template void invokeAddBiasResidual(__nv_bfloat16* output, diff --git a/src/fastertransformer/kernels/add_residual_kernels.h b/src/fastertransformer/kernels/add_residual_kernels.h -<<<<<<< HEAD -index edd8179..75f26f9 100644 ---- a/src/fastertransformer/kernels/add_residual_kernels.h -+++ b/src/fastertransformer/kernels/add_residual_kernels.h -@@ -65,4 +65,11 @@ void invokeAddBiasResidualCol32(T* output, -======= -index edd8179..7ab8eb4 100644 +index edd8179..afa5a77 100644 --- a/src/fastertransformer/kernels/add_residual_kernels.h +++ b/src/fastertransformer/kernels/add_residual_kernels.h @@ -27,6 +27,9 @@ namespace fastertransformer { @@ -2142,8 +2234,7 @@ index edd8179..7ab8eb4 100644 template void invokeT5AddResidual(T* output, const T* input, const int m, const int n, cudaStream_t stream); -@@ -65,4 +68,8 @@ void invokeAddBiasResidualCol32(T* output, ->>>>>>> origin/bert +@@ -65,4 +68,11 @@ void invokeAddBiasResidualCol32(T* output, const float* input1_amax_ptr, const int scale_is_vector = 0); @@ -6569,334 +6660,107 @@ index 0000000..8908141 +// } // namespace fastertransformer diff --git a/src/fastertransformer/layers/decoder_layers/decoder.cc b/src/fastertransformer/layers/decoder_layers/decoder.cc new file mode 100644 -<<<<<<< HEAD index 0000000..4d65ec8 --- /dev/null +++ b/src/fastertransformer/layers/decoder_layers/decoder.cc @@ -0,0 +1,506 @@ -======= -index 0000000..004718e ---- /dev/null -+++ b/src/fastertransformer/layers/encoder_layers/encoder.cc -@@ -0,0 +1,814 @@ ->>>>>>> origin/bert + +#include "src/fastertransformer/layers/decoder_layers/decoder.h" +#include "src/fastertransformer/kernels/activation_kernels.h" +#include "src/fastertransformer/kernels/add_residual_kernels.h" -+#include "src/fastertransformer/kernels/bert_preprocess_kernels.h" +#include "src/fastertransformer/kernels/layernorm_kernels.h" +#include "src/fastertransformer/kernels/unfused_attention_kernels.h" 
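// --- Editorial aside (illustration only, not part of the patch) ----------------------
// The workspace sizing in this new decoder.cc (GetAttnWorkspaceSize and
// GetDecoderLayerWorkspaceSize, defined further below) rounds every intermediate buffer
// up to ALIGN_SIZE (16) elements with UP_DIV before summing. A minimal standalone sketch
// of that idiom, with illustrative names rather than the FasterTransformer API:
#include <cstddef>

constexpr std::size_t kAlignElems = 16;

// Round an element count up to the next multiple of the alignment (UP_DIV(n, a) * a).
constexpr std::size_t AlignUp(std::size_t n, std::size_t align = kAlignElems) {
  return ((n + align - 1) / align) * align;
}

// Example: a subset of the self-attention workspace for batch b, source length s,
// target length t, hidden size h and head count heads; each buffer is aligned
// separately before the sizes are accumulated, as in the real function.
constexpr std::size_t AttnWorkspaceElems(std::size_t b, std::size_t s, std::size_t t,
                                         std::size_t h, std::size_t heads) {
  const std::size_t q = AlignUp(b * s * h);                // query buffer
  const std::size_t k = AlignUp(b * t * h);                // key buffer
  const std::size_t v = k;                                 // value buffer, same shape as key
  const std::size_t scores = AlignUp(b * heads * s * t);   // attention score buffer
  return q + k + v + scores;
}

static_assert(AlignUp(100) == 112, "100 elements round up to 7 * 16 = 112");
// --------------------------------------------------------------------------------------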
+#include "src/fastertransformer/layers/encoder_layers/encoder.h" + +#include -+ +namespace fastertransformer { + +#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) -+#define ALIGN(x, y) (UP_DIV(x, y) * (y)) ++// #define UP_DIV(x, y) (x) +#define ALIGN_SIZE 16 + +template -+void printTensor(const std::string& str, T* input, int size) -+{ -+ std::cout << str; ++void printTensor(char* str, T* input, int size) { ++ printf("%s ",str); + T* input_device = input; -+ auto input_host = std::make_unique(size); -+ cudaD2Hcpy(input_host.get(), input_device, size); -+ for (int k = 0, index = 0; k < size; k++) { -+ if (index != 0) -+ std::cout << ','; -+ std::cout << input_host[k]; -+ index++; -+ if (index == 10) { ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ ++ std::cout << input_host[k] << ","; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ if (k % 10 == 0) + std::cout << std::endl; -+ index = 0; -+ } + } ++ + std::cout << std::endl; ++ ++ free(input_host); +} + +template +void isNan(char* str, T* input, int size) +{ -+ std::cout << str << " " -+ << " size is " << size; ++ std::cout << str << " " << " size is " << size; + T* input_device = input; + T* input_host = (T*)malloc(size * sizeof(T)); -+ cudaD2Hcpy(input_host, input_device, size); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ + for (int k = 0; k < (int)size; k++) { + if (std::isnan((float)input_host[k]) || std ::isinf((float)input_host[k])) { + std::cout << "found NAN or INF"; + break; + } + } ++ + std::cout << std::endl; + free(input_host); +} -+template -+T checksum(const T* tensor, int size) -+{ -+ if constexpr (std::is_floating_point()) { -+ auto tensor_host = std::make_unique(size); -+ double sum = 0.; -+ T* ptr = tensor_host.get(); -+ cudaD2Hcpy(ptr, tensor, size); -+ for (int i = 0; i < size; i++) { -+ // sum += (double)ptr[i]*i; -+ sum += ptr[i]; -+ } -+ return static_cast(sum); -+ } -+ else -+ return static_cast(0.f); -+} + +template -+T checksumGrid(const T* tensor, const encoderParamT* param, bool zp = false, bool cross = false, bool ffn = false) ++size_t GetAttnWorkspaceSize(decoderParamT* param) +{ -+ if constexpr (std::is_floating_point()) { -+ int hidden_size; -+ if (ffn) { -+ hidden_size = param->ffn_hidden_size; -+ } -+ else { -+ hidden_size = param->hidden_size; -+ } -+ const int size = param->batch_size * param->src_seq_len * hidden_size; -+ int head_size = hidden_size / param->head_num; -+ auto tensor_host = std::make_unique(size); -+ double sum = 0.; -+ T* ptr = tensor_host.get(); -+ try { -+ cudaD2Hcpy(ptr, tensor, size); -+ } -+ catch (...) 
{ -+ std::cout << "copy tensor failed" << std::endl; -+ return static_cast(0.f); -+ } -+ bool compressed = param->eft && zp; -+ if (!compressed) { -+ if (cross) { -+ std::cout << "cross sum:" << std::endl; -+ for (int i = 0; i < param->batch_size; i++) { -+ for (int j = 0; j < param->head_num; j++) { -+ for (int k = 0; k < param->src_seq_len / 2; k++) { -+ for (int l = 0; l < head_size; l++) { -+ sum += ptr[(((i * param->head_num) + j) * param->src_seq_len + k) * head_size + l]; -+ } -+ } -+ } -+ } -+ } -+ else { -+ std::cout << "grid sum:" << std::endl; -+ for (int i = 0; i < param->batch_size; i++) { -+ for (int j = 0; j < param->src_seq_len / 2; j++) { -+ for (int k = 0; k < hidden_size; k++) { -+ sum += ptr[((i * param->src_seq_len) + j) * hidden_size + k]; -+ } -+ } -+ } -+ } -+ } -+ else { -+ std::cout << "compress sum:" << std::endl; -+ for (int i = 0; i < param->h_token_num * hidden_size; i++) { -+ sum += ptr[i]; -+ } -+ } -+ return static_cast(sum); -+ } -+ else { -+ return static_cast(0.f); -+ } ++ size_t size_q = UP_DIV((param->batch_size * param->src_seq_len * param->hidden_size), ALIGN_SIZE) * ALIGN_SIZE; ++ size_t size_k = UP_DIV((param->batch_size * param->tgt_seq_len * param->hidden_size), ALIGN_SIZE) * ALIGN_SIZE; ++ size_t size_v = size_k; ++ size_t qkv_len = size_q + size_k + size_v; ++ size_t q_buf_2_len = size_q; ++ size_t qk_buf_len = ++ UP_DIV(param->batch_size * param->head_num * param->src_seq_len * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t qkv_buf_2_len = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t qkv_buf_3_len = qkv_buf_2_len; ++ size_t attn_out_size = ++ UP_DIV(param->batch_size * param->head_num * param->head_size * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; ++ return (qkv_len + q_buf_2_len + qk_buf_len + qkv_buf_2_len + qkv_buf_3_len + 2 * attn_out_size) * sizeof(T); ++ +} + ++template size_t GetAttnWorkspaceSize(decoderParamT* param); ++template size_t GetAttnWorkspaceSize(decoderParamT* param); +template -+void saveTensor(const std::string& name, T* tensor, int size) ++size_t GetDecoderLayerWorkspaceSize(decoderParamT* param) +{ -+ auto tensor_host = std::make_unique(size); -+ T* ptr = tensor_host.get(); -+ cudaD2Hcpy(ptr, tensor, size); -+ std::ofstream wf(name + ".bin", std::ofstream::out | std::ofstream::binary); -+ wf.write(reinterpret_cast(ptr), size * sizeof(T)); -+ wf.close(); ++ size_t attn_out = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE;; ++ size_t attn2_out = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE;; ++ ++ size_t ffn = UP_DIV(param->batch_size * param->src_seq_len * param->ffn_hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t ffn_size = (param->layernorm_post) ? ffn : (attn_out + ffn); ++ size_t out_size = (param->layernorm_post) ? 
attn_out + attn2_out : attn_out * 2+ attn2_out * 2; ++ return (std::max(GetAttnWorkspaceSize(param) * 2, ffn_size * sizeof(T)) + out_size * sizeof(T)); +} + -<<<<<<< HEAD -+template -+size_t GetAttnWorkspaceSize(decoderParamT* param) -======= -+void CublasGemmWrapper(const void* a_addr, -+ const void* b_addr, -+ void* c_addr, -+ const int* params, -+ const int* lds, -+ const cublasOperation_t* operations, -+ const cudaDataType* data_types, -+ void* alpha, -+ void* beta, -+ cublasHandle_t cublas_handle, -+ cublasGemmAlgo_t algo) -+{ -+ const int m = params[0]; -+ const int n = params[1]; -+ const int k = params[2]; -+ cublasOperation_t trans_a = operations[0]; -+ cublasOperation_t trans_b = operations[1]; -+ const int lda = lds[0]; -+ const int ldb = lds[1]; -+ const int ldc = lds[2]; -+ cudaDataType type_a = data_types[0]; -+ cudaDataType type_b = data_types[1]; -+ cudaDataType type_c = data_types[2]; -+ cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F_FAST_TF32; -+ if ((type_a == CUDA_R_16F) && (type_b == CUDA_R_16F) && (type_c == CUDA_R_16F)) { -+ compute_type = CUBLAS_COMPUTE_16F; -+ } -+ cublasGemmEx(cublas_handle, -+ trans_a, -+ trans_b, -+ m, -+ n, -+ k, -+ alpha, -+ a_addr, -+ type_a, -+ lda, -+ b_addr, -+ type_b, -+ ldb, -+ beta, -+ c_addr, -+ type_c, -+ ldc, -+ compute_type, -+ algo); -+} -+ -+void CublasGemmStridedBatchedWrapper(const void* a_addr, -+ const void* b_addr, -+ void* c_addr, -+ const int* params, -+ const int* lds, -+ const cublasOperation_t* operations, -+ const int* strides, -+ const cudaDataType* data_types, -+ void* alpha, -+ void* beta, -+ int batch, -+ cublasHandle_t cublas_handle, -+ cublasGemmAlgo_t algo) -+{ -+ const int m = params[0]; -+ const int n = params[1]; -+ const int k = params[2]; -+ cublasOperation_t trans_a = operations[0]; -+ cublasOperation_t trans_b = operations[1]; -+ const int lda = lds[0]; -+ const int ldb = lds[1]; -+ const int ldc = lds[2]; -+ cudaDataType type_a = data_types[0]; -+ cudaDataType type_b = data_types[1]; -+ cudaDataType type_c = data_types[2]; -+ cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F_FAST_TF32; -+ // cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F_FAST_16F; -+ -+ if ((type_a == CUDA_R_16F) && (type_b == CUDA_R_16F) && (type_c == CUDA_R_16F)) { -+ compute_type = CUBLAS_COMPUTE_16F; -+ } -+ const int stride_a = strides[0]; -+ const int stride_b = strides[1]; -+ const int stride_c = strides[2]; -+ cublasGemmStridedBatchedEx(cublas_handle, -+ trans_a, -+ trans_b, -+ m, -+ n, -+ k, -+ alpha, -+ a_addr, -+ type_a, -+ lda, -+ stride_a, -+ b_addr, -+ type_b, -+ ldb, -+ stride_b, -+ beta, -+ c_addr, -+ type_c, -+ ldc, -+ stride_c, -+ batch, -+ compute_type, -+ algo); -+} -+ -+template -+size_t GetAttnWorkspaceSize(encoderParamT* param) ->>>>>>> origin/bert -+{ -+ size_t size_q = ALIGN((param->batch_size * param->src_seq_len * param->hidden_size), ALIGN_SIZE); -+ size_t size_k = ALIGN((param->batch_size * param->tgt_seq_len * param->hidden_size), ALIGN_SIZE); -+ size_t size_v = size_k; -+ size_t qkv_len = size_q + size_k + size_v; -<<<<<<< HEAD -+ size_t q_buf_2_len = size_q; -+ size_t qk_buf_len = -+ UP_DIV(param->batch_size * param->head_num * param->src_seq_len * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; -+ size_t qkv_buf_2_len = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; -+ size_t qkv_buf_3_len = qkv_buf_2_len; -+ size_t attn_out_size = -+ UP_DIV(param->batch_size * param->head_num * param->head_size * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; 
-+ return (qkv_len + q_buf_2_len + qk_buf_len + qkv_buf_2_len + qkv_buf_3_len + 2 * attn_out_size) * sizeof(T); -+ -======= -+ size_t qk_buf_len = -+ ALIGN(param->batch_size * param->head_num * param->src_seq_len * param->tgt_seq_len, ALIGN_SIZE); -+ size_t qkv_buf_2_len = ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); -+ size_t attn_out_size = -+ ALIGN(param->batch_size * param->head_num * param->head_size * param->tgt_seq_len, ALIGN_SIZE); -+ return (qkv_buf_2_len + 2 * attn_out_size + std::max(qkv_len, qk_buf_len)) * sizeof(T); ->>>>>>> origin/bert -+} -+ -+template size_t GetAttnWorkspaceSize(decoderParamT* param); -+template size_t GetAttnWorkspaceSize(decoderParamT* param); -+template -+size_t GetDecoderLayerWorkspaceSize(decoderParamT* param) -+{ -<<<<<<< HEAD -+ size_t attn_out = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE;; -+ size_t attn2_out = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE;; -+ -+ size_t ffn = UP_DIV(param->batch_size * param->src_seq_len * param->ffn_hidden_size, ALIGN_SIZE) * ALIGN_SIZE; -+ size_t ffn_size = (param->layernorm_post) ? ffn : (attn_out + ffn); -+ size_t out_size = (param->layernorm_post) ? attn_out + attn2_out : attn_out * 2+ attn2_out * 2; -+ return (std::max(GetAttnWorkspaceSize(param) * 2, ffn_size * sizeof(T)) + out_size * sizeof(T)); -======= -+ size_t max_hidden = ALIGN(std::max(param->hidden_size, param->ffn_hidden_size),ALIGN_SIZE); -+ size_t compress_buffer_len = ALIGN(param->batch_size * param->src_seq_len * max_hidden,ALIGN_SIZE); -+ size_t padding_len = ALIGN(param->batch_size * param->src_seq_len,ALIGN_SIZE); -+ size_t offset_len = ALIGN(param->batch_size,ALIGN_SIZE); -+ size_t d_token_len = ALIGN(1,ALIGN_SIZE); -+ size_t eft_size = compress_buffer_len * sizeof(T) + (padding_len + offset_len) * sizeof(int) + d_token_len * sizeof(size_t); -+ size_t attn_out = ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); -+ size_t ffn = ALIGN(param->batch_size * param->src_seq_len * param->ffn_hidden_size, ALIGN_SIZE); -+ return (std::max(GetAttnWorkspaceSize(param), ffn * sizeof(T)) + (attn_out * 3) * sizeof(T)) + eft_size; ->>>>>>> origin/bert -+} -+ -+template size_t GetDecoderLayerWorkspaceSize(decoderParamT* param); -+template size_t GetDecoderLayerWorkspaceSize(decoderParamT* param); -+ -+template -+void forward_ffn(T* inputs[], int in_len, T* output[], int out_len, ParamT* param, void* ws) ++template size_t GetDecoderLayerWorkspaceSize(decoderParamT* param); ++template size_t GetDecoderLayerWorkspaceSize(decoderParamT* param); ++ ++template ++void forward_ffn(T* inputs[], int in_len, T* output[], int out_len, ParamT* param, void* ws) +{ + size_t inter_size = param->ffn_hidden_size; -+ size_t h_token_num = param->h_token_num; ++ size_t h_token_num = param->batch_size * param->src_seq_len; + cublasOperation_t gemm_ops[] = {CUBLAS_OP_N, CUBLAS_OP_N}; + cudaDataType gemm_data_types[] = {CUDA_R_32F, CUDA_R_32F, CUDA_R_32F}; + if ((std::is_same::value) || (std::is_same::value)) { @@ -6950,47 +6814,8 @@ index 0000000..004718e +{ + param->in_idx = 0; + size_t h_token_num = param->batch_size * param->src_seq_len; -+ param->h_token_num = h_token_num; -+ param->padding_offset = nullptr; -+ int* d_sequence_lengths = nullptr; -+ T* input_tensor = reinterpret_cast(inputs[param->in_idx++]); -+ T* from_tensor = input_tensor; -+ T* compress_buffer; -+ compress_buffer = reinterpret_cast(ws); -+ ws = 
reinterpret_cast(reinterpret_cast(ws) + ALIGN(h_token_num * param->hidden_size,ALIGN_SIZE)); -+ int* padding_offset = reinterpret_cast(ws); -+ ws = reinterpret_cast(reinterpret_cast(ws) + ALIGN(param->batch_size * param->src_seq_len,ALIGN_SIZE)); -+ d_sequence_lengths = reinterpret_cast(ws); -+ param->d_sequence_length = d_sequence_lengths; -+ ws = reinterpret_cast(reinterpret_cast(ws) + ALIGN(param->batch_size,ALIGN_SIZE)); -+ size_t* d_token_num = reinterpret_cast(ws); -+ ws = reinterpret_cast(reinterpret_cast(ws) + ALIGN(1,ALIGN_SIZE)); -+ invokeBuildSequnceLength( -+ from_tensor, param->batch_size, d_sequence_lengths, param->src_seq_len, param->hidden_size, param->stream); -+ // printTensor("seq_len=",d_sequence_lengths,param->batch_size); -+ invokeGetPaddingOffset(&h_token_num, -+ d_token_num, -+ padding_offset, -+ d_sequence_lengths, -+ param->batch_size, -+ param->src_seq_len, -+ param->stream); -+ // std::cout << "token=" << h_token_num << "m=" << param->batch_size * param->src_seq_len << std::endl; -+ if (h_token_num * 2 <= param->batch_size * param->src_seq_len) { -+ param->eft = true; -+ invokeRemovePadding(compress_buffer, -+ (const T*)from_tensor, -+ padding_offset, -+ h_token_num, -+ param->head_num * param->head_size, -+ param->stream); -+ param->h_token_num = h_token_num; -+ param->padding_offset = padding_offset; -+ from_tensor = compress_buffer; -+ } -+ h_token_num = param->h_token_num; ++ T* from_tensor = reinterpret_cast(inputs[param->in_idx++]); + T* attn_out = reinterpret_cast(ws); -<<<<<<< HEAD + T* normed_from_tensor = reinterpret_cast(ws) + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; + T* attn_ws = reinterpret_cast(normed_from_tensor) + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; + T* normed_attn_out = normed_from_tensor; @@ -6999,23 +6824,9 @@ index 0000000..004718e + T* attn2_ws = reinterpret_cast(normed_from_tensor2) + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; + T* normed_attn2_out = normed_from_tensor2; + T* ffn_ws = normed_attn2_out + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; -======= -+ T* normed_from_tensor = -+ reinterpret_cast(ws) + ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); -+ T* attn_ws_offset = (param->layernorm_post) ? 
reinterpret_cast(ws) : reinterpret_cast(normed_from_tensor); -+ T* attn_ws = attn_ws_offset + ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); -+ T* normed_attn_out = normed_from_tensor; -+ T* ffn_ws = normed_attn_out + ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); -+ ->>>>>>> origin/bert + T* tmp_out = reinterpret_cast(output[0]); -+ if (param->padding_offset != nullptr || (std::is_same::value && param->ffn_fp16 == true)) { -+ tmp_out = ffn_ws + ALIGN(param->batch_size * param->src_seq_len * param->ffn_hidden_size, ALIGN_SIZE); -+ } -+ T* tmp_out1 = reinterpret_cast(output[0]); -+ T* out_buf = tmp_out; -+ if (param->padding_offset != nullptr) { -+ tmp_out1 = compress_buffer; ++ if (std::is_same::value && param->ffn_fp16==true) { ++ tmp_out = ffn_ws + UP_DIV(param->batch_size * param->src_seq_len * param->ffn_hidden_size, ALIGN_SIZE) * ALIGN_SIZE; + } + T* gamma1 = reinterpret_cast(inputs[param->in_idx++]); + T* beta1 = reinterpret_cast(inputs[param->in_idx++]); @@ -7036,7 +6847,6 @@ index 0000000..004718e + T* projection_bias = reinterpret_cast(inputs[param->in_idx++]); + T* gamma2 = reinterpret_cast(inputs[param->in_idx++]); + T* beta2 = reinterpret_cast(inputs[param->in_idx++]); -<<<<<<< HEAD + from_tensor = param->layernorm_post ? normed_from_tensor : from_tensor; + invokeGeneralAddBiasResidualPreLayerNorm(attn_out, + normed_attn_out, @@ -7082,79 +6892,16 @@ index 0000000..004718e + param->hidden_size, + param->stream, + param->eps3); -======= -+ if (param->layernorm_post == false) { -+ if (std::is_same::value || param->ffn_fp16 == false) { -+ invokeGeneralAddBiasResidualPreLayerNorm(attn_out, -+ normed_attn_out, -+ from_tensor, -+ gamma2, // gamma -+ beta2, // beta -+ projection_bias, -+ h_token_num, -+ param->hidden_size, -+ param->stream, -+ param->eps2); -+ } -+ else { -+ invokeGeneralAddBiasResidualPreLayerNormCast(attn_out, -+ reinterpret_cast(normed_attn_out), -+ from_tensor, -+ gamma2, // gamma -+ beta2, // beta -+ projection_bias, -+ h_token_num, -+ param->hidden_size, -+ param->stream, -+ param->eps2); -+ } -+ } -+ else { -+ if (std::is_same::value || param->ffn_fp16 == false) { -+ invokeAddBiasResidualLayerNorm(attn_out, -+ from_tensor, -+ projection_bias, -+ gamma2, // gamma -+ beta2, // beta -+ h_token_num, -+ param->hidden_size, -+ param->stream, -+ param->eps1); -+ normed_attn_out = attn_out; -+ } -+ else { -+ invokeAddBiasResidualLayerNormCast(reinterpret_cast(attn_out), -+ reinterpret_cast(normed_attn_out), -+ reinterpret_cast(from_tensor), -+ projection_bias, -+ gamma2, // gamma -+ beta2, // beta -+ h_token_num, -+ param->hidden_size, -+ param->stream, -+ param->eps1); -+ // isNan((char*)"LN 1 model", (half*)attn_out, h_token_num * param->hidden_size); -+ } ->>>>>>> origin/bert + } + } else { + // without projection bias + } -<<<<<<< HEAD + inputs[--param->in_idx] = normed_attn2_out; + if (param->ffn_fp16==false) { -======= -+ // forward ffn -+ // simulate attention inputs -+ inputs[--param->in_idx] = normed_attn_out; -+ if (param->ffn_fp16 == false) { ->>>>>>> origin/bert + forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, param, ffn_ws); -+ } -+ else { ++ } else { + forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, param, ffn_ws); + } -<<<<<<< HEAD + attn2_out = param->layernorm_post ? 
normed_attn2_out : attn2_out; + if (std::is_same::value || param->ffn_fp16==false) { + invokeAddBiasResidual(reinterpret_cast(tmp_out), @@ -7182,62 +6929,6 @@ index 0000000..004718e + param->stream); + } + } -======= -+ if (param->layernorm_post == true) { -+ if (std::is_same::value || param->ffn_fp16 == false) { -+ invokeAddBiasResidualLayerNorm(reinterpret_cast(tmp_out), -+ attn_out, -+ reinterpret_cast(inputs[param->in_idx++]), // FFN bias, -+ reinterpret_cast(inputs[param->in_idx++]), // Gamma -+ reinterpret_cast(inputs[param->in_idx++]), // Beta -+ h_token_num, -+ param->hidden_size, -+ param->stream, -+ param->eps2); -+ } -+ else { -+ invokeAddBiasResidualLayerNormCast( -+ reinterpret_cast(tmp_out), -+ reinterpret_cast(tmp_out1), -+ reinterpret_cast(normed_attn_out), -+ reinterpret_cast(inputs[param->in_idx++]), // FFN bias, -+ reinterpret_cast(inputs[param->in_idx++]), // Gamma -+ reinterpret_cast(inputs[param->in_idx++]), // Beta -+ h_token_num, -+ param->hidden_size, -+ param->stream, -+ param->eps2); -+ out_buf = tmp_out1; -+ } -+ } -+ else { -+ if (std::is_same::value || param->ffn_fp16 == false) { -+ invokeAddBiasResidual(reinterpret_cast(tmp_out), -+ attn_out, -+ reinterpret_cast(inputs[param->in_idx++]), // FFN bias -+ h_token_num, -+ param->hidden_size, -+ param->stream); -+ } -+ else { -+ invokeAddBiasResidualCast(reinterpret_cast(tmp_out), -+ reinterpret_cast(attn_out), -+ reinterpret_cast(tmp_out1), -+ reinterpret_cast(inputs[param->in_idx++]), // FFN bias -+ h_token_num, -+ param->hidden_size, -+ param->stream); -+ } -+ } -+ if (param->padding_offset != nullptr) { -+ cudaMemsetAsync(output[0], -+ 0, -+ param->batch_size * param->src_seq_len * param->head_size * param->head_num * sizeof(T), -+ param->stream); -+ invokeRebuildPadding( -+ (T*)output[0], out_buf, param->padding_offset, h_token_num, param->hidden_size, param->stream); -+ } ->>>>>>> origin/bert + return; +} + @@ -7251,7 +6942,6 @@ index 0000000..004718e +{ + param->in_idx = 0; + auto extra_tmp_size = -<<<<<<< HEAD + UP_DIV(param->batch_size * param->head_num * param->head_size * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; + size_t size_q = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; + size_t size_k = UP_DIV(param->batch_size * param->tgt_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; @@ -7272,22 +6962,6 @@ index 0000000..004718e + T* output2 = static_cast(output1) + extra_tmp_size; + int gemm_dims[] = { + 3 * (int)param->hidden_size, (int)param->batch_size * (int)param->src_seq_len, (int)param->hidden_size}; -======= -+ ALIGN(param->batch_size * param->head_num * param->head_size * param->tgt_seq_len, ALIGN_SIZE); -+ size_t size_q = ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); -+ size_t q_buf_2_len = size_q; -+ size_t qk_buf_len = -+ ALIGN(param->batch_size * param->head_num * param->src_seq_len * param->tgt_seq_len, ALIGN_SIZE); -+ size_t qkv_buf_2_len = ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); -+ T* q_buf_2 = (T*)ws; -+ T* output1 = static_cast(ws) + q_buf_2_len; -+ T* output2 = static_cast(output1) + extra_tmp_size; -+ T* qkv_buf = static_cast(output2) + extra_tmp_size; -+ T* qk_buf = qkv_buf; -+ T* qkv_buf_2 = q_buf_2; -+ T* qkv_buf_3 = qk_buf; -+ int gemm_dims[] = {3 * (int)param->hidden_size, (int)param->h_token_num, (int)param->hidden_size}; ->>>>>>> origin/bert + int gemm_lds[] = {3 * (int)param->hidden_size, (int)param->hidden_size, 3 * (int)param->hidden_size}; + T* 
from_tensor = reinterpret_cast(inputs[param->in_idx++]); + cublasOperation_t gemm_ops[] = {CUBLAS_OP_N, CUBLAS_OP_N}; @@ -7361,7 +7035,6 @@ index 0000000..004718e + &beta, + param->cublas_handle, + param->algo); -<<<<<<< HEAD + + T* bias_qkv = (param->qkv_bias) ? reinterpret_cast(inputs[param->in_idx++]) : nullptr; + fastertransformer::invokeAddFusedQKVBiasTranspose(static_cast(q_buf_2), @@ -7376,37 +7049,6 @@ index 0000000..004718e + 0, + param->stream); + -======= -+ -+ T* bias_qkv = (param->qkv_bias) ? reinterpret_cast(inputs[param->in_idx++]) : nullptr; -+ if (param->padding_offset == nullptr) { -+ invokeAddFusedQKVBiasTranspose(static_cast(q_buf_2), -+ static_cast(output1), -+ static_cast(output2), -+ static_cast(qkv_buf), -+ bias_qkv, -+ param->batch_size, -+ param->src_seq_len, -+ param->head_num, -+ param->head_size, -+ 0, -+ param->stream); -+ } -+ else { -+ invokeAddFusedZP_QKVBiasTranspose(static_cast(q_buf_2), -+ static_cast(output1), -+ static_cast(output2), -+ static_cast(qkv_buf), -+ bias_qkv, -+ param->batch_size, -+ param->src_seq_len, -+ param->head_num, -+ param->head_size, -+ param->h_token_num, -+ param->padding_offset, -+ param->stream); -+ } ->>>>>>> origin/bert + } + gemm_ops[0] = CUBLAS_OP_T; + @@ -7437,15 +7079,11 @@ index 0000000..004718e + param->algo); + + T* attention_mask = reinterpret_cast(inputs[param->in_idx++]); -+ if (param->padding_offset != nullptr) -+ invokeBuildEncoderAttentionMask( -+ attention_mask, param->d_sequence_length, param->batch_size, param->src_seq_len, param->stream); + T* position_bias = nullptr; + if (param->position_bias) { + position_bias = reinterpret_cast(inputs[param->in_idx++]); + } + T scalar = static_cast(1.0f / sqrtf(param->head_size * 1.0f)); -<<<<<<< HEAD + fastertransformer::invokeMixMaskedSoftMax(static_cast(qk_buf), + attention_mask, + position_bias, @@ -7455,17 +7093,6 @@ index 0000000..004718e + param->head_num, + scalar, + param->stream); -======= -+ invokeMixMaskedSoftMax(static_cast(qk_buf), -+ attention_mask, -+ position_bias, -+ param->batch_size, -+ param->src_seq_len, -+ param->tgt_seq_len, -+ param->head_num, -+ scalar, -+ param->stream); ->>>>>>> origin/bert + + gemm_ops[0] = CUBLAS_OP_N; + gemm_ops[1] = CUBLAS_OP_N; @@ -7480,12 +7107,7 @@ index 0000000..004718e + gemm_strides[0] = param->tgt_seq_len * param->head_size; + gemm_strides[1] = param->src_seq_len * param->tgt_seq_len; + gemm_strides[2] = param->src_seq_len * param->head_size; -<<<<<<< HEAD + fastertransformer::CublasGemmStridedBatchedWrapper(output2, -======= -+ -+ CublasGemmStridedBatchedWrapper(output2, ->>>>>>> origin/bert + qk_buf, + qkv_buf_2, + gemm_dims, @@ -7498,7 +7120,6 @@ index 0000000..004718e + param->batch_size * param->head_num, + param->cublas_handle, + param->algo); -<<<<<<< HEAD + invokeTransposeQKV(static_cast(qkv_buf_3), + static_cast(qkv_buf_2), + param->batch_size, @@ -7507,33 +7128,10 @@ index 0000000..004718e + param->head_size, + param->stream); + -======= -+ -+ if (param->padding_offset == nullptr) { -+ invokeTransposeQKV(static_cast(qkv_buf_3), -+ static_cast(qkv_buf_2), -+ param->batch_size, -+ param->src_seq_len, -+ param->head_num, -+ param->head_size, -+ param->stream); -+ } -+ else { -+ invokeTransposeAttentionOutRemovePadding(qkv_buf_2, -+ qkv_buf_3, -+ param->h_token_num, -+ param->batch_size, -+ param->src_seq_len, -+ param->head_num, -+ param->head_size, -+ param->padding_offset, -+ param->stream); -+ } ->>>>>>> origin/bert + gemm_ops[0] = CUBLAS_OP_N; + gemm_ops[1] = CUBLAS_OP_N; + gemm_dims[0] = 
param->hidden_size; -+ gemm_dims[1] = param->h_token_num; ++ gemm_dims[1] = param->batch_size * param->src_seq_len; + gemm_dims[2] = param->hidden_size; + + gemm_lds[0] = param->hidden_size; @@ -7553,7 +7151,7 @@ index 0000000..004718e + param->algo); + + if (param->projection_bias) { -+ int len = param->h_token_num; ++ int len = param->batch_size * param->src_seq_len; + invokeAddBias( + static_cast(output[0]), (const T*)(inputs[param->in_idx++]), len, param->hidden_size, param->stream); + } @@ -7574,17 +7172,10 @@ index 0000000..004718e +} // namespace fastertransformer diff --git a/src/fastertransformer/layers/decoder_layers/decoder.h b/src/fastertransformer/layers/decoder_layers/decoder.h new file mode 100644 -<<<<<<< HEAD index 0000000..c302ea8 --- /dev/null +++ b/src/fastertransformer/layers/decoder_layers/decoder.h @@ -0,0 +1,112 @@ -======= -index 0000000..ffba081 ---- /dev/null -+++ b/src/fastertransformer/layers/encoder_layers/encoder.h -@@ -0,0 +1,49 @@ ->>>>>>> origin/bert +#pragma once + +#include "src/fastertransformer/kernels/activation_kernels.h" @@ -7594,7 +7185,6 @@ index 0000000..ffba081 + +namespace fastertransformer { + -<<<<<<< HEAD +// typedef struct { +// size_t batch_size; +// size_t src_seq_len; @@ -7719,35 +7309,6 @@ index 0000000..3b43391 + * See the License for the specific language governing permissions and + * limitations under the License. + */ -======= -+typedef struct { -+ size_t batch_size; -+ size_t src_seq_len; -+ size_t tgt_seq_len; -+ size_t head_num; -+ size_t head_size; -+ size_t hidden_size; -+ size_t h_token_num; -+ size_t ffn_hidden_size; // 4 * param->hidden_size; -+ bool ffn_fp16; -+ float eps1; -+ float eps2; -+ // handle -+ cublasHandle_t cublas_handle; -+ cudaStream_t stream; -+ cublasGemmAlgo_t algo; -+ // ctrls -+ int in_idx; -+ bool qkv_bias; // true -+ bool projection_bias; // true -+ bool is_cross; // false -+ bool position_bias; // false -+ bool layernorm_post; // dont care -+ bool eft; // false - effective fast trn -+ int *padding_offset; -+ int *d_sequence_length; -+} encoderParamT; ->>>>>>> origin/bert + +#pragma once + @@ -8124,64 +7685,151 @@ index 0000000..af2a82c +} // namespace fastertransformer diff --git a/src/fastertransformer/layers/encoder_layers/encoder.cc b/src/fastertransformer/layers/encoder_layers/encoder.cc new file mode 100644 -index 0000000..45d4dde +index 0000000..c0b4f37 --- /dev/null +++ b/src/fastertransformer/layers/encoder_layers/encoder.cc -@@ -0,0 +1,646 @@ +@@ -0,0 +1,815 @@ + +#include "src/fastertransformer/layers/encoder_layers/encoder.h" +#include "src/fastertransformer/kernels/activation_kernels.h" +#include "src/fastertransformer/kernels/add_residual_kernels.h" ++#include "src/fastertransformer/kernels/bert_preprocess_kernels.h" +#include "src/fastertransformer/kernels/layernorm_kernels.h" +#include "src/fastertransformer/kernels/unfused_attention_kernels.h" +#include ++ +namespace fastertransformer { + +#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) -+// #define UP_DIV(x, y) (x) ++#define ALIGN(x, y) (UP_DIV(x, y) * (y)) +#define ALIGN_SIZE 16 + +template -+void printTensor(char* str, T* input, int size) { -+ printf("%s ",str); ++void printTensor(const std::string& str, T* input, int size) ++{ ++ std::cout << str; + T* input_device = input; -+ T* input_host = (T*)malloc(size * sizeof(T)); -+ -+ fastertransformer::cudaD2Hcpy(input_host, input_device, size); -+ -+ for (int k = 0; k < (int)size; k++) { -+ -+ std::cout << input_host[k] << ","; -+ if (k % 10 == 0) -+ std::cout << std::endl; -+ if (k % 10 
== 0) ++ auto input_host = std::make_unique(size); ++ cudaD2Hcpy(input_host.get(), input_device, size); ++ for (int k = 0, index = 0; k < size; k++) { ++ if (index != 0) ++ std::cout << ','; ++ std::cout << input_host[k]; ++ index++; ++ if (index == 10) { + std::cout << std::endl; ++ index = 0; ++ } + } -+ + std::cout << std::endl; -+ -+ free(input_host); +} + +template +void isNan(char* str, T* input, int size) +{ -+ std::cout << str << " " << " size is " << size; ++ std::cout << str << " " ++ << " size is " << size; + T* input_device = input; + T* input_host = (T*)malloc(size * sizeof(T)); -+ -+ fastertransformer::cudaD2Hcpy(input_host, input_device, size); -+ ++ cudaD2Hcpy(input_host, input_device, size); + for (int k = 0; k < (int)size; k++) { + if (std::isnan((float)input_host[k]) || std ::isinf((float)input_host[k])) { + std::cout << "found NAN or INF"; + break; + } + } -+ + std::cout << std::endl; + free(input_host); +} ++template ++T checksum(const T* tensor, int size) ++{ ++ if constexpr (std::is_floating_point()) { ++ auto tensor_host = std::make_unique(size); ++ double sum = 0.; ++ T* ptr = tensor_host.get(); ++ cudaD2Hcpy(ptr, tensor, size); ++ for (int i = 0; i < size; i++) { ++ // sum += (double)ptr[i]*i; ++ sum += ptr[i]; ++ } ++ return static_cast(sum); ++ } ++ else ++ return static_cast(0.f); ++} ++ ++template ++T checksumGrid(const T* tensor, const encoderParamT* param, bool zp = false, bool cross = false, bool ffn = false) ++{ ++ if constexpr (std::is_floating_point()) { ++ int hidden_size; ++ if (ffn) { ++ hidden_size = param->ffn_hidden_size; ++ } ++ else { ++ hidden_size = param->hidden_size; ++ } ++ const int size = param->batch_size * param->src_seq_len * hidden_size; ++ int head_size = hidden_size / param->head_num; ++ auto tensor_host = std::make_unique(size); ++ double sum = 0.; ++ T* ptr = tensor_host.get(); ++ try { ++ cudaD2Hcpy(ptr, tensor, size); ++ } ++ catch (...) 
{ ++ std::cout << "copy tensor failed" << std::endl; ++ return static_cast(0.f); ++ } ++ bool compressed = param->eft && zp; ++ if (!compressed) { ++ if (cross) { ++ std::cout << "cross sum:" << std::endl; ++ for (int i = 0; i < param->batch_size; i++) { ++ for (int j = 0; j < param->head_num; j++) { ++ for (int k = 0; k < param->src_seq_len / 2; k++) { ++ for (int l = 0; l < head_size; l++) { ++ sum += ptr[(((i * param->head_num) + j) * param->src_seq_len + k) * head_size + l]; ++ } ++ } ++ } ++ } ++ } ++ else { ++ std::cout << "grid sum:" << std::endl; ++ for (int i = 0; i < param->batch_size; i++) { ++ for (int j = 0; j < param->src_seq_len / 2; j++) { ++ for (int k = 0; k < hidden_size; k++) { ++ sum += ptr[((i * param->src_seq_len) + j) * hidden_size + k]; ++ } ++ } ++ } ++ } ++ } ++ else { ++ std::cout << "compress sum:" << std::endl; ++ for (int i = 0; i < param->h_token_num * hidden_size; i++) { ++ sum += ptr[i]; ++ } ++ } ++ return static_cast(sum); ++ } ++ else { ++ return static_cast(0.f); ++ } ++} ++ ++template ++void saveTensor(const std::string& name, T* tensor, int size) ++{ ++ auto tensor_host = std::make_unique(size); ++ T* ptr = tensor_host.get(); ++ cudaD2Hcpy(ptr, tensor, size); ++ std::ofstream wf(name + ".bin", std::ofstream::out | std::ofstream::binary); ++ wf.write(reinterpret_cast(ptr), size * sizeof(T)); ++ wf.close(); ++} + +void CublasGemmWrapper(const void* a_addr, + const void* b_addr, @@ -8207,7 +7855,6 @@ index 0000000..45d4dde + cudaDataType type_b = data_types[1]; + cudaDataType type_c = data_types[2]; + cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F_FAST_TF32; -+ // cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + if ((type_a == CUDA_R_16F) && (type_b == CUDA_R_16F) && (type_c == CUDA_R_16F)) { + compute_type = CUBLAS_COMPUTE_16F; + } @@ -8294,19 +7941,16 @@ index 0000000..45d4dde +template +size_t GetAttnWorkspaceSize(encoderParamT* param) +{ -+ size_t size_q = UP_DIV((param->batch_size * param->src_seq_len * param->hidden_size), ALIGN_SIZE) * ALIGN_SIZE; -+ size_t size_k = UP_DIV((param->batch_size * param->tgt_seq_len * param->hidden_size), ALIGN_SIZE) * ALIGN_SIZE; ++ size_t size_q = ALIGN((param->batch_size * param->src_seq_len * param->hidden_size), ALIGN_SIZE); ++ size_t size_k = ALIGN((param->batch_size * param->tgt_seq_len * param->hidden_size), ALIGN_SIZE); + size_t size_v = size_k; + size_t qkv_len = size_q + size_k + size_v; -+ size_t q_buf_2_len = size_q; + size_t qk_buf_len = -+ UP_DIV(param->batch_size * param->head_num * param->src_seq_len * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; -+ size_t qkv_buf_2_len = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; -+ size_t qkv_buf_3_len = qkv_buf_2_len; ++ ALIGN(param->batch_size * param->head_num * param->src_seq_len * param->tgt_seq_len, ALIGN_SIZE); ++ size_t qkv_buf_2_len = ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); + size_t attn_out_size = -+ UP_DIV(param->batch_size * param->head_num * param->head_size * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; -+ return (qkv_len + q_buf_2_len + qk_buf_len + qkv_buf_2_len + qkv_buf_3_len + 2 * attn_out_size) * sizeof(T); -+ ++ ALIGN(param->batch_size * param->head_num * param->head_size * param->tgt_seq_len, ALIGN_SIZE); ++ return (qkv_buf_2_len + 2 * attn_out_size + std::max(qkv_len, qk_buf_len)) * sizeof(T); +} + +template size_t GetAttnWorkspaceSize(encoderParamT* param); @@ -8314,11 +7958,15 @@ index 0000000..45d4dde +template +size_t 
GetEncoderLayerWorkspaceSize(encoderParamT* param) +{ -+ size_t attn_out = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE;; -+ size_t ffn = UP_DIV(param->batch_size * param->src_seq_len * param->ffn_hidden_size, ALIGN_SIZE) * ALIGN_SIZE; -+ size_t ffn_size = (param->layernorm_post) ? ffn : (attn_out + ffn); -+ size_t out_size = (param->layernorm_post) ? attn_out : attn_out * 2; -+ return (std::max(GetAttnWorkspaceSize(param), ffn_size * sizeof(T)) + out_size * sizeof(T)); ++ size_t max_hidden = ALIGN(std::max(param->hidden_size, param->ffn_hidden_size),ALIGN_SIZE); ++ size_t compress_buffer_len = ALIGN(param->batch_size * param->src_seq_len * max_hidden,ALIGN_SIZE); ++ size_t padding_len = ALIGN(param->batch_size * param->src_seq_len,ALIGN_SIZE); ++ size_t offset_len = ALIGN(param->batch_size,ALIGN_SIZE); ++ size_t d_token_len = ALIGN(1,ALIGN_SIZE); ++ size_t eft_size = compress_buffer_len * sizeof(T) + (padding_len + offset_len) * sizeof(int) + d_token_len * sizeof(size_t); ++ size_t attn_out = ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); ++ size_t ffn = ALIGN(param->batch_size * param->src_seq_len * param->ffn_hidden_size, ALIGN_SIZE); ++ return (std::max(GetAttnWorkspaceSize(param), ffn * sizeof(T)) + (attn_out * 3) * sizeof(T)) + eft_size; +} + +template size_t GetEncoderLayerWorkspaceSize(encoderParamT* param); @@ -8328,7 +7976,7 @@ index 0000000..45d4dde +void forward_ffn(T* inputs[], int in_len, T* output[], int out_len, encoderParamT* param, void* ws) +{ + size_t inter_size = param->ffn_hidden_size; -+ size_t h_token_num = param->batch_size * param->src_seq_len; ++ size_t h_token_num = param->h_token_num; + cublasOperation_t gemm_ops[] = {CUBLAS_OP_N, CUBLAS_OP_N}; + cudaDataType gemm_data_types[] = {CUDA_R_32F, CUDA_R_32F, CUDA_R_32F}; + if ((std::is_same::value) || (std::is_same::value)) { @@ -8382,17 +8030,61 @@ index 0000000..45d4dde +{ + param->in_idx = 0; + size_t h_token_num = param->batch_size * param->src_seq_len; -+ T* from_tensor = reinterpret_cast(inputs[param->in_idx++]); ++ param->h_token_num = h_token_num; ++ param->padding_offset = nullptr; ++ int* d_sequence_lengths = nullptr; ++ T* input_tensor = reinterpret_cast(inputs[param->in_idx++]); ++ T* from_tensor = input_tensor; ++ T* compress_buffer; ++ compress_buffer = reinterpret_cast(ws); ++ ws = reinterpret_cast(reinterpret_cast(ws) + ALIGN(h_token_num * param->hidden_size,ALIGN_SIZE)); ++ int* padding_offset = reinterpret_cast(ws); ++ ws = reinterpret_cast(reinterpret_cast(ws) + ALIGN(param->batch_size * param->src_seq_len,ALIGN_SIZE)); ++ d_sequence_lengths = reinterpret_cast(ws); ++ param->d_sequence_length = d_sequence_lengths; ++ ws = reinterpret_cast(reinterpret_cast(ws) + ALIGN(param->batch_size,ALIGN_SIZE)); ++ size_t* d_token_num = reinterpret_cast(ws); ++ ws = reinterpret_cast(reinterpret_cast(ws) + ALIGN(1,ALIGN_SIZE)); ++ invokeBuildSequnceLength( ++ from_tensor, param->batch_size, d_sequence_lengths, param->src_seq_len, param->hidden_size, param->stream); ++ // printTensor("seq_len=",d_sequence_lengths,param->batch_size); ++ invokeGetPaddingOffset(&h_token_num, ++ d_token_num, ++ padding_offset, ++ d_sequence_lengths, ++ param->batch_size, ++ param->src_seq_len, ++ param->stream); ++ // std::cout << "token=" << h_token_num << "m=" << param->batch_size * param->src_seq_len << std::endl; ++ if (h_token_num * 2 <= param->batch_size * param->src_seq_len) { ++ param->eft = true; ++ invokeRemovePadding(compress_buffer, ++ (const 
T*)from_tensor, ++ padding_offset, ++ h_token_num, ++ param->head_num * param->head_size, ++ param->stream); ++ param->h_token_num = h_token_num; ++ param->padding_offset = padding_offset; ++ from_tensor = compress_buffer; ++ } ++ h_token_num = param->h_token_num; + T* attn_out = reinterpret_cast(ws); -+ T* normed_from_tensor = reinterpret_cast(ws) + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ T* normed_from_tensor = ++ reinterpret_cast(ws) + ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); + T* attn_ws_offset = (param->layernorm_post) ? reinterpret_cast(ws) : reinterpret_cast(normed_from_tensor); -+ T* attn_ws = attn_ws_offset + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ T* attn_ws = attn_ws_offset + ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); + T* normed_attn_out = normed_from_tensor; -+ T* ffn_ws = normed_attn_out + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; -+ ++ T* ffn_ws = normed_attn_out + ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); ++ + T* tmp_out = reinterpret_cast(output[0]); -+ if (std::is_same::value && param->ffn_fp16==true) { -+ tmp_out = ffn_ws + UP_DIV(param->batch_size * param->src_seq_len * param->ffn_hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ if (param->padding_offset != nullptr || (std::is_same::value && param->ffn_fp16 == true)) { ++ tmp_out = ffn_ws + ALIGN(param->batch_size * param->src_seq_len * param->ffn_hidden_size, ALIGN_SIZE); ++ } ++ T* tmp_out1 = reinterpret_cast(output[0]); ++ T* out_buf = tmp_out; ++ if (param->padding_offset != nullptr) { ++ tmp_out1 = compress_buffer; + } + if (param->layernorm_post == false) { + T* gamma1 = reinterpret_cast(inputs[param->in_idx++]); @@ -8423,96 +8115,100 @@ index 0000000..45d4dde + T* gamma2 = reinterpret_cast(inputs[param->in_idx++]); + T* beta2 = reinterpret_cast(inputs[param->in_idx++]); + if (param->layernorm_post == false) { -+ if (std::is_same::value || param->ffn_fp16==false) { ++ if (std::is_same::value || param->ffn_fp16 == false) { + invokeGeneralAddBiasResidualPreLayerNorm(attn_out, -+ normed_attn_out, -+ from_tensor, -+ gamma2, // gamma -+ beta2, // beta -+ projection_bias, -+ h_token_num, -+ param->hidden_size, -+ param->stream, -+ param->eps2); -+ } else { -+ invokeGeneralAddBiasResidualPreLayerNormCast(attn_out, -+ reinterpret_cast(normed_attn_out), -+ from_tensor, -+ gamma2, // gamma -+ beta2, // beta -+ projection_bias, -+ h_token_num, -+ param->hidden_size, -+ param->stream, -+ param->eps2); ++ normed_attn_out, ++ from_tensor, ++ gamma2, // gamma ++ beta2, // beta ++ projection_bias, ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps2); + } -+ } else { -+ if (std::is_same::value || param->ffn_fp16==false) { -+ invokeAddBiasResidualLayerNorm( -+ attn_out, -+ from_tensor, -+ projection_bias, -+ gamma2, // gamma -+ beta2, // beta -+ h_token_num, -+ param->hidden_size, -+ param->stream, -+ param->eps1); -+ normed_attn_out = attn_out; -+ } else { -+ invokeAddBiasResidualLayerNormCast( -+ reinterpret_cast(attn_out), -+ reinterpret_cast(normed_attn_out), -+ reinterpret_cast(from_tensor), -+ projection_bias, -+ gamma2, // gamma -+ beta2, // beta -+ h_token_num, -+ param->hidden_size, -+ param->stream, -+ param->eps1); -+ // isNan((char*)"LN 1 model", (half*)attn_out, h_token_num * param->hidden_size); ++ else { ++ 
invokeGeneralAddBiasResidualPreLayerNormCast(attn_out, ++ reinterpret_cast(normed_attn_out), ++ from_tensor, ++ gamma2, // gamma ++ beta2, // beta ++ projection_bias, ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps2); ++ } ++ } ++ else { ++ if (std::is_same::value || param->ffn_fp16 == false) { ++ invokeAddBiasResidualLayerNorm(attn_out, ++ from_tensor, ++ projection_bias, ++ gamma2, // gamma ++ beta2, // beta ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps1); ++ normed_attn_out = attn_out; ++ } ++ else { ++ invokeAddBiasResidualLayerNormCast(reinterpret_cast(attn_out), ++ reinterpret_cast(normed_attn_out), ++ reinterpret_cast(from_tensor), ++ projection_bias, ++ gamma2, // gamma ++ beta2, // beta ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps1); ++ // isNan((char*)"LN 1 model", (half*)attn_out, h_token_num * param->hidden_size); + } + } + } + else { + // without projection bias + } -+ // forward ffn ++ // forward ffn + // simulate attention inputs + inputs[--param->in_idx] = normed_attn_out; -+ if (param->ffn_fp16==false) { ++ if (param->ffn_fp16 == false) { + forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, param, ffn_ws); -+ } else { ++ } ++ else { + forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, param, ffn_ws); + } + if (param->layernorm_post == true) { -+ if (std::is_same::value || param->ffn_fp16==false) { ++ if (std::is_same::value || param->ffn_fp16 == false) { + invokeAddBiasResidualLayerNorm(reinterpret_cast(tmp_out), -+ attn_out, -+ reinterpret_cast(inputs[param->in_idx++]), // FFN bias, -+ reinterpret_cast(inputs[param->in_idx++]), // Gamma -+ reinterpret_cast(inputs[param->in_idx++]), // Beta -+ h_token_num, -+ param->hidden_size, -+ param->stream, -+ param->eps2); -+ -+ } else { ++ attn_out, ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias, ++ reinterpret_cast(inputs[param->in_idx++]), // Gamma ++ reinterpret_cast(inputs[param->in_idx++]), // Beta ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps2); ++ } ++ else { + invokeAddBiasResidualLayerNormCast( -+ reinterpret_cast(tmp_out), -+ reinterpret_cast(output[0]), -+ reinterpret_cast(normed_attn_out), -+ reinterpret_cast(inputs[param->in_idx++]), // FFN bias, -+ reinterpret_cast(inputs[param->in_idx++]), // Gamma -+ reinterpret_cast(inputs[param->in_idx++]), // Beta -+ h_token_num, -+ param->hidden_size, -+ param->stream, -+ param->eps2); ++ reinterpret_cast(tmp_out), ++ reinterpret_cast(tmp_out1), ++ reinterpret_cast(normed_attn_out), ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias, ++ reinterpret_cast(inputs[param->in_idx++]), // Gamma ++ reinterpret_cast(inputs[param->in_idx++]), // Beta ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps2); ++ out_buf = tmp_out1; + } -+ } else { -+ if (std::is_same::value || param->ffn_fp16==false) { ++ } ++ else { ++ if (std::is_same::value || param->ffn_fp16 == false) { + invokeAddBiasResidual(reinterpret_cast(tmp_out), + attn_out, + reinterpret_cast(inputs[param->in_idx++]), // FFN bias @@ -8523,14 +8219,21 @@ index 0000000..45d4dde + else { + invokeAddBiasResidualCast(reinterpret_cast(tmp_out), + reinterpret_cast(attn_out), -+ reinterpret_cast(output[0]), ++ reinterpret_cast(tmp_out1), + reinterpret_cast(inputs[param->in_idx++]), // FFN bias + h_token_num, + param->hidden_size, + param->stream); + } + } -+ ++ if (param->padding_offset != nullptr) { ++ cudaMemsetAsync(output[0], ++ 0, ++ param->batch_size * param->src_seq_len * 
param->head_size * param->head_num * sizeof(T), ++ param->stream); ++ invokeRebuildPadding( ++ (T*)output[0], out_buf, param->padding_offset, h_token_num, param->hidden_size, param->stream); ++ } + return; +} + @@ -8544,27 +8247,20 @@ index 0000000..45d4dde +{ + param->in_idx = 0; + auto extra_tmp_size = -+ UP_DIV(param->batch_size * param->head_num * param->head_size * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; -+ size_t size_q = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; -+ size_t size_k = UP_DIV(param->batch_size * param->tgt_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; -+ size_t size_v = size_k; -+ -+ size_t qkv_len = size_q + size_k + size_v; ++ ALIGN(param->batch_size * param->head_num * param->head_size * param->tgt_seq_len, ALIGN_SIZE); ++ size_t size_q = ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); + size_t q_buf_2_len = size_q; + size_t qk_buf_len = -+ UP_DIV(param->batch_size * param->head_num * param->src_seq_len * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; -+ size_t qkv_buf_2_len = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; -+ size_t qkv_buf_3_len = qkv_buf_2_len; -+ auto buff_size = qkv_len + q_buf_2_len + qk_buf_len + qkv_buf_2_len + qkv_buf_3_len; -+ T* qkv_buf = (T*)ws; -+ T* q_buf_2 = static_cast(qkv_buf) + qkv_len; -+ T* qk_buf = static_cast(q_buf_2) + q_buf_2_len; -+ T* qkv_buf_2 = static_cast(qk_buf) + qk_buf_len; -+ T* qkv_buf_3 = static_cast(qkv_buf_2) + qkv_buf_2_len; -+ T* output1 = static_cast(ws) + buff_size; ++ ALIGN(param->batch_size * param->head_num * param->src_seq_len * param->tgt_seq_len, ALIGN_SIZE); ++ size_t qkv_buf_2_len = ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); ++ T* q_buf_2 = (T*)ws; ++ T* output1 = static_cast(ws) + q_buf_2_len; + T* output2 = static_cast(output1) + extra_tmp_size; -+ int gemm_dims[] = { -+ 3 * (int)param->hidden_size, (int)param->batch_size * (int)param->src_seq_len, (int)param->hidden_size}; ++ T* qkv_buf = static_cast(output2) + extra_tmp_size; ++ T* qk_buf = qkv_buf; ++ T* qkv_buf_2 = q_buf_2; ++ T* qkv_buf_3 = qk_buf; ++ int gemm_dims[] = {3 * (int)param->hidden_size, (int)param->h_token_num, (int)param->hidden_size}; + int gemm_lds[] = {3 * (int)param->hidden_size, (int)param->hidden_size, 3 * (int)param->hidden_size}; + T* from_tensor = reinterpret_cast(inputs[param->in_idx++]); + cublasOperation_t gemm_ops[] = {CUBLAS_OP_N, CUBLAS_OP_N}; @@ -8642,20 +8338,41 @@ index 0000000..45d4dde + &beta, + param->cublas_handle, + param->algo); ++ + T* bias_qkv = (param->qkv_bias) ? 
reinterpret_cast(inputs[param->in_idx++]) : nullptr; -+ fastertransformer::invokeAddFusedQKVBiasTranspose(static_cast(q_buf_2), -+ static_cast(output1), -+ static_cast(output2), -+ static_cast(qkv_buf), -+ bias_qkv, -+ param->batch_size, -+ param->src_seq_len, -+ param->head_num, -+ param->head_size, -+ 0, -+ param->stream); ++ if (param->padding_offset == nullptr) { ++ invokeAddFusedQKVBiasTranspose(static_cast(q_buf_2), ++ static_cast(output1), ++ static_cast(output2), ++ static_cast(qkv_buf), ++ bias_qkv, ++ param->batch_size, ++ param->src_seq_len, ++ param->head_num, ++ param->head_size, ++ 0, ++ param->stream); ++ } ++ else { ++ invokeAddFusedZP_QKVBiasTranspose(static_cast(q_buf_2), ++ static_cast(output1), ++ static_cast(output2), ++ static_cast(qkv_buf), ++ bias_qkv, ++ param->batch_size, ++ param->src_seq_len, ++ param->head_num, ++ param->head_size, ++ param->h_token_num, ++ param->padding_offset, ++ param->stream); ++ } + } + gemm_ops[0] = CUBLAS_OP_T; ++ gemm_ops[1] = CUBLAS_OP_N; ++ gemm_dims[0] = param->tgt_seq_len; ++ gemm_dims[1] = param->src_seq_len; ++ gemm_dims[2] = param->head_size; + + gemm_lds[0] = param->head_size; + gemm_lds[1] = param->head_size; @@ -8665,10 +8382,6 @@ index 0000000..45d4dde + (int)(param->src_seq_len * param->head_size), + (int)(param->src_seq_len * param->tgt_seq_len)}; + -+ gemm_dims[0] = param->tgt_seq_len; -+ gemm_dims[1] = param->src_seq_len; -+ gemm_dims[2] = param->head_size; -+ + CublasGemmStridedBatchedWrapper(output1, + q_buf_2, + qk_buf, @@ -8684,20 +8397,23 @@ index 0000000..45d4dde + param->algo); + + T* attention_mask = reinterpret_cast(inputs[param->in_idx++]); ++ if (param->padding_offset != nullptr) ++ invokeBuildEncoderAttentionMask( ++ attention_mask, param->d_sequence_length, param->batch_size, param->src_seq_len, param->stream); + T* position_bias = nullptr; + if (param->position_bias) { + position_bias = reinterpret_cast(inputs[param->in_idx++]); + } + T scalar = static_cast(1.0f / sqrtf(param->head_size * 1.0f)); -+ fastertransformer::invokeMixMaskedSoftMax(static_cast(qk_buf), -+ attention_mask, -+ position_bias, -+ param->batch_size, -+ param->src_seq_len, -+ param->tgt_seq_len, -+ param->head_num, -+ scalar, -+ param->stream); ++ invokeMixMaskedSoftMax(static_cast(qk_buf), ++ attention_mask, ++ position_bias, ++ param->batch_size, ++ param->src_seq_len, ++ param->tgt_seq_len, ++ param->head_num, ++ scalar, ++ param->stream); + + gemm_ops[0] = CUBLAS_OP_N; + gemm_ops[1] = CUBLAS_OP_N; @@ -8712,6 +8428,7 @@ index 0000000..45d4dde + gemm_strides[0] = param->tgt_seq_len * param->head_size; + gemm_strides[1] = param->src_seq_len * param->tgt_seq_len; + gemm_strides[2] = param->src_seq_len * param->head_size; ++ + CublasGemmStridedBatchedWrapper(output2, + qk_buf, + qkv_buf_2, @@ -8726,17 +8443,30 @@ index 0000000..45d4dde + param->cublas_handle, + param->algo); + -+ invokeTransposeQKV(static_cast(qkv_buf_3), -+ static_cast(qkv_buf_2), -+ param->batch_size, -+ param->src_seq_len, -+ param->head_num, -+ param->head_size, -+ param->stream); ++ if (param->padding_offset == nullptr) { ++ invokeTransposeQKV(static_cast(qkv_buf_3), ++ static_cast(qkv_buf_2), ++ param->batch_size, ++ param->src_seq_len, ++ param->head_num, ++ param->head_size, ++ param->stream); ++ } ++ else { ++ invokeTransposeAttentionOutRemovePadding(qkv_buf_2, ++ qkv_buf_3, ++ param->h_token_num, ++ param->batch_size, ++ param->src_seq_len, ++ param->head_num, ++ param->head_size, ++ param->padding_offset, ++ param->stream); ++ } + gemm_ops[0] = CUBLAS_OP_N; + 
gemm_ops[1] = CUBLAS_OP_N; + gemm_dims[0] = param->hidden_size; -+ gemm_dims[1] = param->batch_size * param->src_seq_len; ++ gemm_dims[1] = param->h_token_num; + gemm_dims[2] = param->hidden_size; + + gemm_lds[0] = param->hidden_size; @@ -8755,7 +8485,7 @@ index 0000000..45d4dde + param->algo); + + if (param->projection_bias) { -+ int len = param->batch_size * param->src_seq_len; ++ int len = param->h_token_num; + invokeAddBias( + static_cast(output[0]), (const T*)(inputs[param->in_idx++]), len, param->hidden_size, param->stream); + } @@ -8776,10 +8506,10 @@ index 0000000..45d4dde +} // namespace fastertransformer diff --git a/src/fastertransformer/layers/encoder_layers/encoder.h b/src/fastertransformer/layers/encoder_layers/encoder.h new file mode 100644 -index 0000000..0caaed1 +index 0000000..2ae0ad3 --- /dev/null +++ b/src/fastertransformer/layers/encoder_layers/encoder.h -@@ -0,0 +1,48 @@ +@@ -0,0 +1,50 @@ +#pragma once + +#include "src/fastertransformer/kernels/activation_kernels.h" @@ -8807,12 +8537,14 @@ index 0000000..0caaed1 + cublasGemmAlgo_t algo; + // ctrls + int in_idx; -+ bool qkv_bias; // ture -+ bool projection_bias; // ture ++ bool qkv_bias; // true ++ bool projection_bias; // true + bool is_cross; // false + bool position_bias; // false + bool layernorm_post; // dont care ++ bool eft; // false - effective fast trn + int *padding_offset; ++ int *d_sequence_length; +} encoderParamT; +void CublasGemmWrapper(const void* a_addr, const void* b_addr, void* c_addr, const int* params, const int* lds, const cublasOperation_t* operations, const cudaDataType* data_types, void* alpha, void* beta, cublasHandle_t cublas_handle, cublasGemmAlgo_t algo); +void CublasGemmStridedBatchedWrapper(const void* a_addr, const void* b_addr, void* c_addr, const int* params, const int* lds, const cublasOperation_t* operations, const int* strides, const cudaDataType* data_types, void* alpha, void* beta, int batch, cublasHandle_t cublas_handle, cublasGemmAlgo_t algo); @@ -9185,10 +8917,10 @@ index 0000000..299ffb6 +} // namespace fastertransformer diff --git a/src/fastertransformer/layers/ms_layers/MSDecoderLayer.cc b/src/fastertransformer/layers/ms_layers/MSDecoderLayer.cc new file mode 100644 -index 0000000..f5f6815 +index 0000000..c405ba5 --- /dev/null +++ b/src/fastertransformer/layers/ms_layers/MSDecoderLayer.cc -@@ -0,0 +1,192 @@ +@@ -0,0 +1,211 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. 
@@ -9367,6 +9099,25 @@ index 0000000..f5f6815 + (void*)decoder_weights->decoder_output_projection.bias}; + fastertransformer::forwardDecoder(inputs, 23, outputs, 1, ¶ms_, buf_); + } ++ if (params_.attn1.position_bias && params_.attn2.position_bias) { ++ void* inputs[] = {(void*)input_tensors->at(0).data, ++ (void*)decoder_weights->layernorm1.gamma, ++ (void*)decoder_weights->attention.query_weight.kernel, ++ (void*)input_tensors->at(4).data, ++ (void*)input_tensors->at(1).data, ++ (void*)decoder_weights->attention.attention_output_weight.kernel, ++ (void*)decoder_weights->layernorm2.gamma, ++ (void*)input_tensors->at(2).data, ++ (void*)decoder_weights->cross_attention.query_weight.kernel, ++ (void*)decoder_weights->cross_attention.key_weight.kernel, ++ (void*)input_tensors->at(5).data, ++ (void*)input_tensors->at(3).data, ++ (void*)decoder_weights->cross_attention.attention_output_weight.kernel, ++ (void*)decoder_weights->layernorm3.gamma, ++ (void*)decoder_weights->decoder_output_mapping.kernel, ++ (void*)decoder_weights->decoder_output_projection.kernel}; ++ fastertransformer::forwardDecoder(inputs, 23, outputs, 1, ¶ms_, buf_); ++ } + else{} + return; +} @@ -10898,11 +10649,12 @@ index 0000000..a0f6698 +} // namespace fastertransformer diff --git a/src/fastertransformer/layers/ms_layers/param.h b/src/fastertransformer/layers/ms_layers/param.h new file mode 100644 -index 0000000..a76004c +index 0000000..4b5234d --- /dev/null +++ b/src/fastertransformer/layers/ms_layers/param.h -@@ -0,0 +1,52 @@ +@@ -0,0 +1,56 @@ +#pragma once ++namespace fastertransformer { +typedef struct{ + public: + size_t batch_size; @@ -10941,6 +10693,7 @@ index 0000000..a76004c + mutable attentionParamT attn1; + mutable attentionParamT attn2; + bool layernorm_post; ++ bool has_bias; + int *padding_offset; +} decoderParamT; + @@ -10951,9 +10704,12 @@ index 0000000..a76004c + float eps2; + bool projection_bias; // ture + mutable attentionParamT attn; -+ bool layernorm_post; ++ bool layernorm_post; ++ bool has_bias; + int *padding_offset; +} encoderParamT; ++} +\ No newline at end of file diff --git a/src/fastertransformer/models/CMakeLists.txt b/src/fastertransformer/models/CMakeLists.txt index af33e76..97fc471 100644 --- a/src/fastertransformer/models/CMakeLists.txt diff --git a/trc/transformer/T5/transformer.py b/trc/transformer/T5/transformer.py index a369b9d0aed..9c8d5337bf2 100644 --- a/trc/transformer/T5/transformer.py +++ b/trc/transformer/T5/transformer.py @@ -422,6 +422,7 @@ class FeedForward(Cell): parallel_config=default_dpmp_config): super(FeedForward, self).__init__() _check_config(parallel_config) + self.dtype = param_init_type dp = parallel_config.data_parallel mp = parallel_config.model_parallel if ffn_hidden_size % mp != 0: @@ -480,7 +481,7 @@ class FeedForward(Cell): def construct(self, x): _check_input_shape(F.shape(x), "x", self.cls_name, [2, 3]) _check_input_dtype(F.dtype(x), "x", [mstype.float32, mstype.float16], self.cls_name) - x = self.cast(x, mstype.float16) + x = self.cast(x, self.dtype) # returned shape is [bs, seq_length, ffn_hidden_size] or [bs * seq_length, ffn_hidden_size] hidden = self.mapping(x) output = self.projection(hidden) @@ -794,7 +795,6 @@ class MultiHeadAttention(Cell): tgt_seq_length, hidden_size, num_heads, - # app, hidden_dropout_rate=0.1, attention_dropout_rate=0.1, compute_dtype=mstype.float32, @@ -812,7 +812,6 @@ class MultiHeadAttention(Cell): self.tgt_seq_length = tgt_seq_length self.hidden_size = hidden_size self.batch_size = batch_size - # self.app=app if 
hidden_dropout_rate < 0 or hidden_dropout_rate >= 1: raise ValueError("For 'MultiHeadAttention', the class variable 'hidden_dropout_rate' must be " "in range [0, 1.0), but got the value : {}.".format(hidden_dropout_rate)) @@ -1018,10 +1017,10 @@ class MultiHeadAttention(Cell): output = self.dropout(output) output = F.reshape(output, ori_shape) # if self.app=="trc": - # return output, layer_present, position_bias + return output, layer_present, position_bias # else: # return output - return output + # return output def _check_inputs(self, query_tensor, key_tensor, value_tensor, attention_mask, key_past=None, value_past=None, batch_valid_length=None): @@ -1430,7 +1429,7 @@ class TransformerEncoderLayer(Cell): self.mul = P.Mul().shard(((1, 1, 1, 1), (1,))) self.assign = P.Assign().shard(((1, 1, 1, 1), (1, 1, 1, 1))) - def construct(self, x, input_mask, init_reset=True, batch_valid_length=None, position_bias=None): + def construct(self, x, input_mask, position_bias=None, init_reset=True, batch_valid_length=None): self._check_input(x, input_mask, init_reset, batch_valid_length) x_shape = F.shape(x) x = F.reshape(x, (-1, x_shape[-1])) @@ -1449,8 +1448,8 @@ class TransformerEncoderLayer(Cell): input_x = F.depend(input_x, key_reset) input_x = F.depend(input_x, value_reset) - attention, layer_present, position_bias = self.attention(input_x, input_x, input_x, input_mask, - self.key_past, self.value_past, batch_valid_length, position_bias) + attention, layer_present, position_bias = self.attention(input_x, input_x, input_x, input_mask, position_bias, + self.key_past, self.value_past, batch_valid_length) # For post-layernorm the inputs for residual path are output of self-attention and output of layernorm if self.post_layernorm_residual: x = self.add(input_x, attention) @@ -1500,8 +1499,8 @@ class TransformerEncoderLayer(Cell): output = F.reshape(output, x_shape) if self.use_moe is True: - return output, layer_present, aux_loss - return output, layer_present, position_bias + return output#, layer_present, aux_loss + return output#, layer_present, position_bias def _check_input(self, x, input_mask, init_reset, batch_valid_length): r"""Check inputs""" @@ -1685,6 +1684,7 @@ class TransformerDecoderLayer(Cell): self.hidden_size = hidden_size self.layernorm1 = T5LayerNorm((hidden_size,)).to_float(layernorm_compute_type) + self.layernorm1.shard(((parallel_config.data_parallel, 1),)) self.layernorm2 = T5LayerNorm((hidden_size,)).to_float(layernorm_compute_type) self.layernorm2.shard(((parallel_config.data_parallel, 1),)) @@ -1719,6 +1719,7 @@ class TransformerDecoderLayer(Cell): has_relative_attention_bias=has_relative_attention_bias, parallel_config=parallel_config) self.cross_attention_layernorm = T5LayerNorm((hidden_size,)).to_float(layernorm_compute_type) + self.cross_attention_layernorm.shard(((parallel_config.data_parallel, 1),)) _check_moe_config(moe_config, parallel_config) self.use_moe = (moe_config.expert_num > 1) @@ -1742,7 +1743,7 @@ class TransformerDecoderLayer(Cell): self.post_layernorm_residual = post_layernorm_residual self.add = P.Add().shard(((parallel_config.data_parallel, 1), (parallel_config.data_parallel, 1))) self.add_3d = P.Add().shard(((parallel_config.data_parallel, 1, 1), (parallel_config.data_parallel, 1, 1))) - self.dtype = mstype.float16 + self.dtype = mstype.float32 self.key_past = None self.value_past = None if self.use_past: @@ -1764,15 +1765,16 @@ class TransformerDecoderLayer(Cell): decoder_mask, encoder_output=None, memory_mask=None, - init_reset=True, 
batch_valid_length=None, - position_bias=None, encoder_decoder_position_bias=None): + position_bias=None, encoder_decoder_position_bias=None, + init_reset=True, batch_valid_length=None): #self._check_input(hidden_stats, decoder_mask, encoder_output, memory_mask, init_reset, batch_valid_length) # the returned shape is [bs, seq_length, embedding_size] or [bs * seq_length, embedding_size] hidden_shape = F.shape(hidden_stats) hidden_stats = F.reshape(hidden_stats, (-1, hidden_shape[-1])) input_x = self.layernorm1(hidden_stats) input_x = F.cast(input_x, self.dtype) - + init_reset = True + batch_valid_length=None # indicate whether reset saved states key_reset = None value_reset = None @@ -1784,8 +1786,8 @@ class TransformerDecoderLayer(Cell): input_x = F.depend(input_x, key_reset) input_x = F.depend(input_x, value_reset) - attention, layer_present, position_bias = self.attention(input_x, input_x, input_x, decoder_mask, self.key_past, - self.value_past, batch_valid_length, position_bias) + attention, layer_present, position_bias = self.attention(input_x, input_x, input_x, decoder_mask, position_bias, self.key_past, + self.value_past, batch_valid_length) # For post-layernorm the inputs for residual path are output of self-attention and output of layernorm if self.post_layernorm_residual: x = self.add(input_x, attention) @@ -1794,13 +1796,15 @@ class TransformerDecoderLayer(Cell): x = self.add(hidden_stats, attention) middle_output = None + cross_attn_output = None if encoder_output is not None: middle_output = self.cross_attention_layernorm(x) middle_output = F.cast(middle_output, self.dtype) cross_attn_output, cross_layer_present, encoder_decoder_position_bias = self.cross_attention(middle_output, encoder_output, encoder_output, - memory_mask, self.key_past, - self.value_past, batch_valid_length, encoder_decoder_position_bias) + memory_mask, encoder_decoder_position_bias, self.key_past, + self.value_past, batch_valid_length) + # return cross_attn_output layer_present += cross_layer_present if self.post_layernorm_residual: x = self.add(middle_output, cross_attn_output) @@ -1849,8 +1853,8 @@ class TransformerDecoderLayer(Cell): output = F.reshape(output, hidden_shape) if self.use_moe is True: - return output, layer_present, aux_loss - return output, layer_present, position_bias, encoder_decoder_position_bias + return output#, layer_present, aux_loss + return output#, layer_present, position_bias, encoder_decoder_position_bias def _check_input(self, hidden_states, attention_mask, encoder_output, memory_mask, init_reset, batch_valid_length): r"""Check inputs""" diff --git a/trc/transformer/cfg_bert.config b/trc/transformer/cfg_bert.config index 99e4f5bd9ab..cc543ad3d77 100755 --- a/trc/transformer/cfg_bert.config +++ b/trc/transformer/cfg_bert.config @@ -1,2 +1,2 @@ [gpu_context] -input_shape=input_ids:[transformer_encoder_layer,128];token_type_ids:[transformer_encoder_layer,128];input_mask:[transformer_encoder_layer,128] +input_shape=input_ids:[transformer_decoder_layer,128];token_type_ids:[transformer_decoder_layer,128];input_mask:[transformer_decoder_layer,128] diff --git a/trc/transformer/deploy.sh b/trc/transformer/deploy.sh index 742bd01d4f7..6a3cda3ef60 100755 --- a/trc/transformer/deploy.sh +++ b/trc/transformer/deploy.sh @@ -4,13 +4,13 @@ base=`git rev-parse --show-toplevel` version=$(cat ${base}/version.txt) system=${base}/trc/system_test/release/ubuntu_x86/mindspore-lite-${version}-linux-x64 benchmark=${system}/tools/benchmark/benchmark -readir=${base}/trc/readers/mindir/readir 
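A minimal sketch of the inline config generation this hunk switches to, condensed from the sed/echo lines later in the same file and assuming a model file such as convv_bert1_fwd.mindir (the standalone form shown here is illustrative, not a script added by the patch):

    model=${1%.mindir}; model=${model#convv_}; model=$(echo ${model} | sed 's/_fwd//')
    batch_size=$(echo ${model} | sed 's/bert//')   # "bert1" -> "1"
    echo -e "[gpu_context]\ninput_shape=input_ids:[${batch_size},128];token_type_ids:[${batch_size},128];input_mask:[${batch_size},128]" > ./cfg_bert.config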
server=caspi gpu_id=2 # move files to caspi model=${1%.mindir} model=${model#convv_} model=$(echo ${model}| sed 's/_fwd//') +batch_size=$(echo ${model}| sed 's/bert//') echo "model=${model}" model_name=$(echo ${model}) if [[ "$batch_size" != "${model}" ]];then @@ -34,12 +34,8 @@ rsync -v $1 ${server}:$(realpath $1) rsync -v ${benchmark} ${server}:${benchmark} rsync -vl ${system}/runtime/lib/* ${server}:${system}/runtime/lib/ rsync -vl ${system}/tools/converter/lib/* ${server}:${system}/tools/converter/lib/ -#build configuration file -cfg=$(${readir} -i $1) -newline=$'\n' -echo "[gpu_context]${newline}input_shape=${cfg}${newline}" > cfg_${model}.config -rsync -v cfg_${model}.config ${server}:$(realpath "cfg_${model}.config") - +echo -e "[gpu_context]\ninput_shape=input_ids:[${batch_size},128];token_type_ids:[${batch_size},128];input_mask:[${batch_size},128]" > ./cfg_bert.config +rsync -v cfg_${model_name}.config ${server}:$(realpath "cfg_${model_name}.config") # this should be more general ! # output_files=$(find . -maxdepth 1 -name ${model}_compress_output"*.txt*" | sort -n) @@ -58,8 +54,9 @@ then command+="--inDataFile=\"${input_files}\"" command+=" --benchmarkDataFile=\"${output_files}\" " fi -if [ -f cfg_${model}.config ]; then - command+="--configFile=cfg_${model}.config " + +if [ -f cfg_${model_name}.config ]; then + command+="--configFile=cfg_${model_name}.config " fi command+="--device=GPU " #command+="--enableFp16=true" diff --git a/trc/transformer/models.txt b/trc/transformer/models.txt index 9661e38b816..95787701702 100755 --- a/trc/transformer/models.txt +++ b/trc/transformer/models.txt @@ -20,7 +20,7 @@ #-b 16 -l 24 -H 16 -S 1024 -s 128 -P 0 -m bert #-b 32 -l 24 -H 16 -S 1024 -s 128 -P 1 -m bert -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer -b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_encoder_layer @@ -52,7 +52,7 @@ #-b 1 -l 66 -s 20 -t 30 -H 3 -S 15 -p 0 -m mha_cross #-b 1 -l 66 -s 20 -H 4 -S 768 -p 0 -m mha_T5 #-b 1 -l 66 -s 20 -t 40 -H 4 -S 768 -p 0 -m mha_T5_cross --b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer +#-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer #-b 8 -l 12 -H 4 -S 512 -s 128 -P 0 -f 3072 -m transformer_encoder_layer #-b 16 -l 16 -H 8 -S 1024 -s 64 -P 1 -f 1024 -m transformer_encoder_layer #-b 32 -l 12 -H 4 -S 512 -s 128 -P 0 -f 3072 -m transformer_encoder_layer diff --git a/trc/transformer/train_transformer_export.py b/trc/transformer/train_transformer_export.py index d8b055d5f4b..30a3158797d 100755 --- a/trc/transformer/train_transformer_export.py +++ b/trc/transformer/train_transformer_export.py @@ -342,60 +342,43 @@ def transformer_encoder_layer_create(): saveT(y, name + "_output1.fp" + suffix) -def transformer_encoder_layer_T5_create(): +def transformer_encoder_layer_t5_create(): post_layernorm=False - name = "transformer_encoder_layer_T5" + name = "transformer_encoder_layer_t5" model = T5_TF.TransformerEncoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, seq_length=seq, num_heads=head_num, post_layernorm_residual=post_layernorm, has_bias=False) encoder_input_value = M.Tensor(np.random.normal(0., 0.5, (batch, seq, hid_size)), M.float32) encoder_input_mask = M.Tensor(np.random.normal(0., 0.5, (batch, seq, seq)), M.float32) pos = M.Tensor(np.random.normal(0., 0.5, (batch, head_num, seq, tgt_seq_len)), M.float32) - # q = model.attention.dense1.weight.asnumpy()#.transpose() # 
hid_size x hid_size - # k = model.attention.dense2.weight.asnumpy()#.transpose() - # v = model.attention.dense3.weight.asnumpy()#.transpose() + q = model.attention.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size + k = model.attention.dense2.weight.asnumpy()#.transpose() + v = model.attention.dense3.weight.asnumpy()#.transpose() - # w = np.concatenate((q, k, v)) # 3xhid_size x hid_size - # w = w.transpose() # hid_size x 3xhid_size - # wt = M.Tensor(w, w_compute_type) - # bq = model.attention.dense1.bias.asnumpy() - # bk = model.attention.dense2.bias.asnumpy() - # bv = model.attention.dense3.bias.asnumpy() - # bw = np.concatenate((bq, bk, bv)) #(3xhid) X 1 - # bt =M.Tensor(bw, w_compute_type) - # wp = model.attention.projection.weight - # bp = model.attention.projection.bias - # omw = model.output.mapping.weight - # opw = model.output.projection.weight - # omb = model.output.mapping.bias - # opb = model.output.projection.bias - # gl1 = model.layernorm1.gamma - # bl1 = model.layernorm1.beta - # gl2 = model.layernorm2.gamma - # bl2 = model.layernorm2.beta + w = np.concatenate((q, k, v)) # 3xhid_size x hid_size + w = w.transpose() # hid_size x 3xhid_size + wt = M.Tensor(w, w_compute_type) + wp = model.attention.projection.weight + omw = model.output.mapping.weight + opw = model.output.projection.weight + gl1 = model.layernorm1.weight + gl2 = model.layernorm2.weight suffix = str(compute_type) suffix = suffix[-2:] saveT(encoder_input_value, name + "_input1.fp" + suffix) saveT(encoder_input_mask, name + "_input2.fp" + suffix) saveT(pos, name + "_input3.fp" + suffix) - # saveT(gl1, name + "_weight1.fp" + suffix) - # saveT(bl1, name + "_weight2.fp" + suffix) - # saveT(wt, name + "_weight3.fp" + suffix) - # saveT(bt, name + "_weight4.fp" + suffix) - # saveT(wp, name + "_weight5.fp" + suffix) - # saveT(bp, name + "_weight6.fp" + suffix) - # saveT(gl2, name + "_weight7.fp" + suffix) - # saveT(bl2, name + "_weight8.fp" + suffix) - # if ffn_fp16 == True: - # saveTensorToHalf(omw, name + "_weight9.fp" + "16") - # saveTensorToHalf(omb, name + "_weight10.fp" + "16") - # saveTensorToHalf(opw, name + "_weight11.fp" + "16") - # else: - # saveT(omw, name + "_weight9.fp" + suffix) - # saveT(omb, name + "_weight10.fp" + suffix) - # saveT(opw, name + "_weight11.fp" + suffix) - # saveT(opb, name + "_weight12.fp" + suffix) + saveT(gl1, name + "_weight1.fp" + suffix) + saveT(wt, name + "_weight2.fp" + suffix) + saveT(wp, name + "_weight3.fp" + suffix) + saveT(gl2, name + "_weight4.fp" + suffix) + if ffn_fp16 == True: + saveTensorToHalf(omw, name + "_weight5.fp" + "16") + saveTensorToHalf(opw, name + "_weight6.fp" + "16") + else: + saveT(omw, name + "_weight5.fp" + suffix) + saveT(opw, name + "_weight6.fp" + suffix) _cell_graph_executor.compile(model, encoder_input_value, encoder_input_mask, -- Gitee From f94a1b17ca90c306dd6c687ac7edd68df6a2ee6d Mon Sep 17 00:00:00 2001 From: batya kroizer Date: Wed, 4 Jan 2023 10:48:28 +0200 Subject: [PATCH 13/39] for merge --- .../delegate/tensorrt/op/encoder_tensorrt.cc | 16 +++++++++------- trc/transformer/cfg_bert.config | 2 +- trc/transformer/models.txt | 7 ++++--- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc index ccec39912b6..94aa2ab9f1d 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc @@ -111,7 +111,7 
@@ int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.attn.head_num = encoder_op->get_head_num(); params.attn.head_size = encoder_op->get_head_size(); params.attn.cublas_handle = GetCublasHandle(); - params.attn.projection_bias = false; + params.attn.projection_bias = true; params.attn.hidden_size = params.head_num * params.head_size; params.attn.is_cross = false; params.attn.position_bias = encoder_op->get_position_bias(); @@ -120,6 +120,10 @@ int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { if (is_ffn_fp16_) { size_t start_fp16 = (params.layernorm_post) ? C7NUM : C9NUM; size_t end_fp16 = (params.layernorm_post) ? C11NUM : C13NUM; + if (params.position_bias) { + start_fp16 = C5NUM; + end_fp16 = C8NUM; + } for (size_t i = 0; i < in_tensors_.size(); i++) { auto in_tensor = input(ctx, i); if (in_tensors_[i].IsConst() || in_tensor.trt_tensor_ == nullptr) { @@ -180,12 +184,10 @@ int EncoderPlugin::RunCudaEncoder(const nvinfer1::PluginTensorDesc *inputDesc, params_.algo = algoId; params_.attn.stream = stream; params_.attn.algo = algoId; - void *inputs_forward[] = { - const_cast(inputs[0]), const_cast(inputs[1]), const_cast(inputs[2]), - const_cast(inputs[3]), const_cast(inputs[4]), const_cast(inputs[5]), - const_cast(inputs[6]), const_cast(inputs[7]), const_cast(inputs[8]), - const_cast(inputs[9]), const_cast(inputs[10]), const_cast(inputs[11]), - const_cast(inputs[12]), const_cast(inputs[13])}; + void *inputs_forward[num_of_inputs_]; + for (int i=0; i < num_of_inputs_; i++){ + inputs_forward[i]=const_cast(inputs[i]); + } void *outputs_forward[] = {outputs[0]}; fastertransformer::forwardEncoder(inputs_forward, num_of_inputs_, outputs_forward, num_of_outputs_, ¶ms_, workspace); diff --git a/trc/transformer/cfg_bert.config b/trc/transformer/cfg_bert.config index cc543ad3d77..99e4f5bd9ab 100755 --- a/trc/transformer/cfg_bert.config +++ b/trc/transformer/cfg_bert.config @@ -1,2 +1,2 @@ [gpu_context] -input_shape=input_ids:[transformer_decoder_layer,128];token_type_ids:[transformer_decoder_layer,128];input_mask:[transformer_decoder_layer,128] +input_shape=input_ids:[transformer_encoder_layer,128];token_type_ids:[transformer_encoder_layer,128];input_mask:[transformer_encoder_layer,128] diff --git a/trc/transformer/models.txt b/trc/transformer/models.txt index 95787701702..6749b2f6a88 100755 --- a/trc/transformer/models.txt +++ b/trc/transformer/models.txt @@ -20,10 +20,11 @@ #-b 16 -l 24 -H 16 -S 1024 -s 128 -P 0 -m bert #-b 32 -l 24 -H 16 -S 1024 -s 128 -P 1 -m bert -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_encoder_layer - +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_encoder_layer +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_encoder_layer #-b 1 -l 66 -s 1 -H 8 -S 512 -p 0 -m mha_x1 #-b 3 -l 66 -s 20 -H 3 -S 15 -p -m mha_x2 #-b 3 -l 66 -s 20 -t 40 -H 3 -S 15 -p 0 -m mha_x1 -- Gitee From 7eb392b052ad0bbd72e6acb39a832731cd2df757 Mon Sep 17 00:00:00 2001 From: batya kroizer Date: Wed, 4 Jan 2023 12:34:31 +0200 Subject: [PATCH 14/39] fix tensorrrt --- .../delegate/tensorrt/op/encoder_tensorrt.cc | 6 +++- .../delegate/tensorrt/op/encoder_tensorrt.h | 2 +- trc/transformer/cfg_bert.config | 2 +- trc/transformer/models.txt | 14 ++++---- trc/transformer/train_transformer_export.py | 35 ++++++++++--------- 5 files changed, 33 
insertions(+), 26 deletions(-) diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc index 94aa2ab9f1d..9873f42d56a 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc @@ -111,11 +111,15 @@ int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.attn.head_num = encoder_op->get_head_num(); params.attn.head_size = encoder_op->get_head_size(); params.attn.cublas_handle = GetCublasHandle(); - params.attn.projection_bias = true; + params.attn.projection_bias = !params.attn.position_bias; params.attn.hidden_size = params.head_num * params.head_size; params.attn.is_cross = false; params.attn.position_bias = encoder_op->get_position_bias(); params.attn.qkv_bias = !params.attn.position_bias; + params.has_beta = !params.attn.position_bias; + params.has_bias = !params.attn.position_bias; + params.ffn_bias = !params.attn.position_bias; + auto compute_type = runtime_->GetRuntimePrecisionMode(); if (is_ffn_fp16_) { size_t start_fp16 = (params.layernorm_post) ? C7NUM : C9NUM; diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h index 7538b078d16..031026a98b1 100755 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h @@ -22,7 +22,7 @@ #include "src/extendrt/delegate/tensorrt/op/tensorrt_op.h" #include "src/extendrt/delegate/tensorrt/op/tensorrt_plugin.h" #include "src/extendrt/delegate/tensorrt/cuda_impl/cudnn_utils.h" -#include "src/fastertransformer/layers/encoder_layers/encoder.h" +#include "src/fastertransformer/layers/ms_layers/encoder.h" #include "src/fastertransformer/layers/ms_layers/param.h" namespace mindspore::lite { diff --git a/trc/transformer/cfg_bert.config b/trc/transformer/cfg_bert.config index 99e4f5bd9ab..370f5e1aba9 100755 --- a/trc/transformer/cfg_bert.config +++ b/trc/transformer/cfg_bert.config @@ -1,2 +1,2 @@ [gpu_context] -input_shape=input_ids:[transformer_encoder_layer,128];token_type_ids:[transformer_encoder_layer,128];input_mask:[transformer_encoder_layer,128] +input_shape=input_ids:[transformer_encoder_layer_t5,128];token_type_ids:[transformer_encoder_layer_t5,128];input_mask:[transformer_encoder_layer_t5,128] diff --git a/trc/transformer/models.txt b/trc/transformer/models.txt index 75eeda80e0c..c9e06bb35eb 100755 --- a/trc/transformer/models.txt +++ b/trc/transformer/models.txt @@ -11,8 +11,8 @@ #-b 32 -l 12 -H 12 -S 768 -s 128 -P 0 -f 3072 -m bert #-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -f 3072 -m bert --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -m transformer_encoder_layer_t5 -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_encoder_layer_t5 +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer_t5 #-b 1 -l 66 -s 20 -H 3 -S 15 -p 0 -m mha_x1 #-b 1 -l 24 -H 16 -S 1024 -s 128 -P 1 -m bert @@ -20,11 +20,11 @@ #-b 16 -l 24 -H 16 -S 1024 -s 128 -P 0 -m bert #-b 32 -l 24 -H 16 -S 1024 -s 128 -P 1 -m bert --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer - --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_encoder_layer --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m 
transformer_encoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer +# +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_encoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_encoder_layer #-b 1 -l 66 -s 1 -H 8 -S 512 -p 0 -m mha_x1 #-b 3 -l 66 -s 20 -H 3 -S 15 -p -m mha_x2 #-b 3 -l 66 -s 20 -t 40 -H 3 -S 15 -p 0 -m mha_x1 diff --git a/trc/transformer/train_transformer_export.py b/trc/transformer/train_transformer_export.py index c5744e40ba8..2b8dcd72713 100755 --- a/trc/transformer/train_transformer_export.py +++ b/trc/transformer/train_transformer_export.py @@ -243,7 +243,6 @@ def build_transformer_encoder(batch_size = 2, seq_length = 16): export(model, encoder_input_value, encoder_input_mask, file_name= name + "_fwd", file_format='MINDIR') def transformer_encoder_layer_create(): - post_layernorm=False name = "transformer_encoder_layer" if (post_layernorm): model = TransformerEncoderLayerX(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, seq_length=seq, @@ -342,10 +341,14 @@ def transformer_encoder_layer_create(): saveT(y, name + "_output1.fp" + suffix) def transformer_encoder_layer_t5_create(): - post_layernorm=False name = "transformer_encoder_layer_t5" - model = T5_TF.TransformerEncoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, seq_length=seq, - num_heads=head_num, post_layernorm_residual=post_layernorm, has_bias=False) + if (post_layernorm): + print("post_layernorm") + model = T5_TF.TransformerEncoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, seq_length=seq, + num_heads=head_num, post_layernorm_residual=True, has_bias=False) + else: + model = T5_TF.TransformerEncoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, seq_length=seq, + num_heads=head_num, has_bias=False) encoder_input_value = M.Tensor(np.random.normal(0., 0.5, (batch, seq, hid_size)), M.float32) encoder_input_mask = M.Tensor(np.random.normal(0., 0.5, (batch, seq, seq)), M.float32) pos = M.Tensor(np.random.normal(0., 0.5, (batch, head_num, seq, tgt_seq_len)), M.float32) @@ -385,16 +388,16 @@ def transformer_encoder_layer_t5_create(): y = model(encoder_input_value, encoder_input_mask, position_bias = pos) export(model, encoder_input_value, encoder_input_mask, pos, file_name= name + "_fwd", file_format='MINDIR') - if app=="ch": - f_y=open(f'./{name}_output.txt','w') - out_name='output1' - print("name output:",out_name) - saveCalib(out_name, np.array(y), f_y) - print("y.shape",np.array(y).shape) - # saveCalib('Default/Add-op267', np.array(y), f_y)#2 dims + # if app=="ch": + f_y=open(f'./{name}_output.txt','w') + out_name='output1' + print("name output:",out_name) + saveCalib(out_name, np.array(y), f_y) + print("y.shape",np.array(y).shape) + # saveCalib('Default/Add-op267', np.array(y), f_y)#2 dims - elif app=="trc": - saveT(y, name + "_output1.fp" + suffix) + # elif app=="trc": + saveT(y, name + "_output1.fp" + suffix) def transformer_decoder_layer_t5_create(): @@ -422,14 +425,13 @@ def transformer_decoder_layer_t5_create(): w = w.transpose() # hid_size x 3xhid_size wt = M.Tensor(w, w_compute_type) wp = model.attention.projection.weight - bp = model.attention.projection.bias qt2 = model.cross_attention.dense1.weight#.transpose() # hid_size x hid_size k2 = model.cross_attention.dense2.weight.asnumpy()#.transpose() v2 = 
model.cross_attention.dense3.weight.asnumpy()#.transpose() - w2 = np.concatenate((k2, v2)) # 3xhid_size x hid_size - w2 = w.transpose() # hid_size x 3xhid_size + w2 = np.concatenate((k2, v2)) # 2xhid_size x hid_size + w2 = w2.transpose() # hid_size x 2xhid_size wt2 = M.Tensor(w2, w_compute_type) wp2 = model.cross_attention.projection.weight omw = model.output.mapping.weight @@ -468,6 +470,7 @@ def transformer_decoder_layer_t5_create(): saveT(encoder_pos, name + "_input6.fp" + suffix) _cell_graph_executor.compile(model, hidden_stats, decoder_mask, encoder_output, memory_mask, pos, encoder_pos) y = model(hidden_stats, decoder_mask, encoder_output, memory_mask , position_bias=pos, encoder_decoder_position_bias = encoder_pos) + print("omw.shape",np.array(omw).shape) export(model, hidden_stats, decoder_mask, encoder_output, memory_mask, pos, encoder_pos, file_name= name + "_fwd", file_format='MINDIR') f_y=open(f'./{name}_output.txt','w') saveCalib("output1", np.array(y), f_y)#2 dims -- Gitee From 2c88f85a1fe969d0588e749b75b64b779e7ec2b3 Mon Sep 17 00:00:00 2001 From: batya kroizer Date: Wed, 4 Jan 2023 12:47:11 +0200 Subject: [PATCH 15/39] fix decoder tensorrt --- .../src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc | 6 +++++- .../src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc | 4 ++-- trc/transformer/T5/transformer.py | 2 +- trc/transformer/cfg_bert.config | 2 +- trc/transformer/models.txt | 2 +- 5 files changed, 10 insertions(+), 6 deletions(-) diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc index a96c5951d9c..f79c9eb0c48 100755 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc @@ -112,6 +112,10 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.ffn_hidden_size = decoder_op->get_ffn_hidden_size(); params.ffn_fp16 = is_ffn_fp16_; params.cublas_handle=GetCublasHandle(); + params.is_act = !params.attn.position_bias; + params.has_beta = !params.attn.position_bias; + params.has_bias = !params.attn.position_bias; + params.ffn_bias = !params.attn.position_bias; params.attn1.head_num = params.head_num; params.attn1.head_size = params.head_size; @@ -128,7 +132,7 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.attn2.hidden_size = params.hidden_size; params.attn2.position_bias = decoder_op->get_position_bias2(); params.attn2.qkv_bias = !params.attn2.position_bias; - params.attn2.projection_bias = true; + params.attn2.projection_bias = !params.attn2.position_bias; params.attn2.is_cross = true; params.attn2.cublas_handle=GetCublasHandle(); diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc index 9873f42d56a..bf7c69a03df 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc @@ -119,12 +119,12 @@ int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.has_beta = !params.attn.position_bias; params.has_bias = !params.attn.position_bias; params.ffn_bias = !params.attn.position_bias; - + params.is_act = !params.attn.position_bias; auto compute_type = runtime_->GetRuntimePrecisionMode(); if (is_ffn_fp16_) { size_t start_fp16 = (params.layernorm_post) ? C7NUM : C9NUM; size_t end_fp16 = (params.layernorm_post) ? 
C11NUM : C13NUM; - if (params.position_bias) { + if (params.attn.position_bias) { start_fp16 = C5NUM; end_fp16 = C8NUM; } diff --git a/trc/transformer/T5/transformer.py b/trc/transformer/T5/transformer.py index 9c8d5337bf2..cd0ee0e66f4 100644 --- a/trc/transformer/T5/transformer.py +++ b/trc/transformer/T5/transformer.py @@ -444,7 +444,7 @@ class FeedForward(Cell): self.mapping = _Linear(in_channels=input_size, out_channels=output_size, has_bias=has_bias, - activation=hidden_act, + activation=None, transpose_b=False, expert_num=expert_num, param_init_type=param_init_type) diff --git a/trc/transformer/cfg_bert.config b/trc/transformer/cfg_bert.config index 370f5e1aba9..383ce7641ff 100755 --- a/trc/transformer/cfg_bert.config +++ b/trc/transformer/cfg_bert.config @@ -1,2 +1,2 @@ [gpu_context] -input_shape=input_ids:[transformer_encoder_layer_t5,128];token_type_ids:[transformer_encoder_layer_t5,128];input_mask:[transformer_encoder_layer_t5,128] +input_shape=input_ids:[transformer_decoder_layer_t5,128];token_type_ids:[transformer_decoder_layer_t5,128];input_mask:[transformer_decoder_layer_t5,128] diff --git a/trc/transformer/models.txt b/trc/transformer/models.txt index c9e06bb35eb..aac74d7d54d 100755 --- a/trc/transformer/models.txt +++ b/trc/transformer/models.txt @@ -12,7 +12,7 @@ #-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -f 3072 -m bert #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_encoder_layer_t5 --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer_t5 +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5 #-b 1 -l 66 -s 20 -H 3 -S 15 -p 0 -m mha_x1 #-b 1 -l 24 -H 16 -S 1024 -s 128 -P 1 -m bert -- Gitee From 5fccfd2f8f06de8210aead3e5551c657583d91f7 Mon Sep 17 00:00:00 2001 From: shira zaloshinki Date: Wed, 4 Jan 2023 12:48:10 +0200 Subject: [PATCH 16/39] encoder t5 --- .../src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc | 8 ++++++-- .../src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h | 4 ++-- trc/transformer/MultiHeadTester.py | 2 +- trc/transformer/T5/transformer.py | 4 ++-- trc/transformer/cfg_bert.config | 2 +- trc/transformer/ftBench.py | 2 -- trc/transformer/train_transformer_export.py | 7 +++---- 7 files changed, 15 insertions(+), 14 deletions(-) diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc index a528e460b4d..b41bfe2bd26 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc @@ -111,16 +111,20 @@ int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.attn.head_num = encoder_op->get_head_num(); params.attn.head_size = encoder_op->get_head_size(); params.attn.cublas_handle = GetCublasHandle(); - params.attn.projection_bias = false; + params.attn.projection_bias = !params.attn.position_bias; params.attn.hidden_size = params.head_num * params.head_size; params.attn.is_cross = false; params.attn.position_bias = encoder_op->get_position_bias(); params.attn.qkv_bias = !params.attn.position_bias; + params.has_beta = !params.attn.position_bias; + params.has_bias = !params.attn.position_bias; + params.ffn_bias = !params.attn.position_bias; + auto compute_type = runtime_->GetRuntimePrecisionMode(); if (is_ffn_fp16_) { size_t start_fp16 = (params.layernorm_post) ? C7NUM : C9NUM; size_t end_fp16 = (params.layernorm_post) ? 
C11NUM : C13NUM; - if (params.position_bias) { + if (params.attn.position_bias) { start_fp16 = C5NUM; end_fp16 = C8NUM; } diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h index 7538b078d16..5fce622ae63 100755 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h @@ -22,8 +22,8 @@ #include "src/extendrt/delegate/tensorrt/op/tensorrt_op.h" #include "src/extendrt/delegate/tensorrt/op/tensorrt_plugin.h" #include "src/extendrt/delegate/tensorrt/cuda_impl/cudnn_utils.h" -#include "src/fastertransformer/layers/encoder_layers/encoder.h" -#include "src/fastertransformer/layers/ms_layers/param.h" +#include "src/fastertransformer/layers/ms_layers/encoder.h" +// #include "src/fastertransformer/layers/ms_layers/param.h" namespace mindspore::lite { class EncoderTensorRT : public TensorRTOp { diff --git a/trc/transformer/MultiHeadTester.py b/trc/transformer/MultiHeadTester.py index 97feeecc433..cd71b57528a 100755 --- a/trc/transformer/MultiHeadTester.py +++ b/trc/transformer/MultiHeadTester.py @@ -887,7 +887,7 @@ class FeedForwardX(Cell): # Project to ffn_hidden_size self.mapping = _Linear(in_channels=input_size, out_channels=output_size, - activation=hidden_act, + activation=None, transpose_b=False, # expert_num=expert_num, # expert_group_size=expert_group_size, diff --git a/trc/transformer/T5/transformer.py b/trc/transformer/T5/transformer.py index 9c8d5337bf2..86fb98b14e1 100644 --- a/trc/transformer/T5/transformer.py +++ b/trc/transformer/T5/transformer.py @@ -415,7 +415,7 @@ class FeedForward(Cell): def __init__(self, hidden_size, ffn_hidden_size, dropout_rate, - hidden_act='gelu', + hidden_act=None, has_bias=True, expert_num=1, param_init_type=mstype.float32, @@ -444,7 +444,7 @@ class FeedForward(Cell): self.mapping = _Linear(in_channels=input_size, out_channels=output_size, has_bias=has_bias, - activation=hidden_act, + activation=None, transpose_b=False, expert_num=expert_num, param_init_type=param_init_type) diff --git a/trc/transformer/cfg_bert.config b/trc/transformer/cfg_bert.config index 99e4f5bd9ab..370f5e1aba9 100755 --- a/trc/transformer/cfg_bert.config +++ b/trc/transformer/cfg_bert.config @@ -1,2 +1,2 @@ [gpu_context] -input_shape=input_ids:[transformer_encoder_layer,128];token_type_ids:[transformer_encoder_layer,128];input_mask:[transformer_encoder_layer,128] +input_shape=input_ids:[transformer_encoder_layer_t5,128];token_type_ids:[transformer_encoder_layer_t5,128];input_mask:[transformer_encoder_layer_t5,128] diff --git a/trc/transformer/ftBench.py b/trc/transformer/ftBench.py index 8e80cbdf911..c5f93a1768f 100755 --- a/trc/transformer/ftBench.py +++ b/trc/transformer/ftBench.py @@ -102,8 +102,6 @@ for line_model_arg in models_arg: if ret != 0: exit() input_files='' output_file='' - # os.system(f"./convert_fp32.sh {model_name}_fwd.mindir") - # find_output_name(f'convv_{model_name}_fwd.mindir', f'{model_name}_output.txt') if app=='ch': ret=0 if act == 'be': diff --git a/trc/transformer/train_transformer_export.py b/trc/transformer/train_transformer_export.py index c5744e40ba8..d8eb3f1787a 100755 --- a/trc/transformer/train_transformer_export.py +++ b/trc/transformer/train_transformer_export.py @@ -342,7 +342,6 @@ def transformer_encoder_layer_create(): saveT(y, name + "_output1.fp" + suffix) def transformer_encoder_layer_t5_create(): - post_layernorm=False name = "transformer_encoder_layer_t5" 
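For reference, the QKV/KV weight packing this T5 export path relies on can be sketched as two small helpers; the names pack_qkv and pack_kv are illustrative, while the concatenate/transpose steps mirror the export code re-enabled in this series:

    import numpy as np

    def pack_qkv(q, k, v):
        # q, k, v: [hid_size, hid_size] arrays taken from attention.dense1/.dense2/.dense3.
        w = np.concatenate((q, k, v))   # [3 * hid_size, hid_size]
        return w.transpose()            # [hid_size, 3 * hid_size], fused QKV layout

    def pack_kv(k, v):
        # Cross attention packs only K and V; the query weight is passed separately.
        w2 = np.concatenate((k, v))     # [2 * hid_size, hid_size]
        return w2.transpose()           # [hid_size, 2 * hid_size]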
model = T5_TF.TransformerEncoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, seq_length=seq, num_heads=head_num, post_layernorm_residual=post_layernorm, has_bias=False) @@ -383,14 +382,14 @@ def transformer_encoder_layer_t5_create(): encoder_input_mask, pos) y = model(encoder_input_value, encoder_input_mask, position_bias = pos) - + print('name=',name) export(model, encoder_input_value, encoder_input_mask, pos, file_name= name + "_fwd", file_format='MINDIR') if app=="ch": f_y=open(f'./{name}_output.txt','w') out_name='output1' - print("name output:",out_name) + print(y.shape) saveCalib(out_name, np.array(y), f_y) - print("y.shape",np.array(y).shape) + f_y.close() # saveCalib('Default/Add-op267', np.array(y), f_y)#2 dims elif app=="trc": -- Gitee From 562889e43715d10f6aae188a161e73adfcfbdbfb Mon Sep 17 00:00:00 2001 From: batya kroizer Date: Mon, 9 Jan 2023 11:54:04 +0200 Subject: [PATCH 17/39] for merge --- .../delegate/tensorrt/op/decoder_tensorrt.cc | 46 ++++---- .../delegate/tensorrt/op/encoder_tensorrt.cc | 11 +- .../delegate/tensorrt/op/mha_tensorrt.cc | 4 +- .../optimizer/fusion/decoder_layer_fusion.cc | 103 +++++++++--------- .../optimizer/fusion/encoder_layer_fusion.cc | 10 +- trc/transformer/MultiHeadTester.py | 6 +- trc/transformer/T5/transformer.py | 29 +++-- trc/transformer/cfg_bert.config | 2 +- trc/transformer/deploy.sh | 2 +- trc/transformer/ftBench.py | 4 +- trc/transformer/models.txt | 31 +++++- trc/transformer/t.config | 2 +- trc/transformer/train_transformer_export.py | 21 ++-- 13 files changed, 150 insertions(+), 121 deletions(-) mode change 100755 => 100644 mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc old mode 100755 new mode 100644 index f79c9eb0c48..90ef99ee093 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc @@ -31,7 +31,6 @@ #include "src/fastertransformer/kernels/layernorm_kernels.h" namespace mindspore::lite { - namespace { constexpr std::size_t kTwo = 2; constexpr std::size_t kThree = 3; @@ -111,11 +110,7 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.eps3 = decoder_op->get_eps_layernorm3(); params.ffn_hidden_size = decoder_op->get_ffn_hidden_size(); params.ffn_fp16 = is_ffn_fp16_; - params.cublas_handle=GetCublasHandle(); - params.is_act = !params.attn.position_bias; - params.has_beta = !params.attn.position_bias; - params.has_bias = !params.attn.position_bias; - params.ffn_bias = !params.attn.position_bias; + params.cublas_handle = GetCublasHandle(); params.attn1.head_num = params.head_num; params.attn1.head_size = params.head_size; @@ -125,7 +120,7 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.attn1.projection_bias = !params.attn1.position_bias; params.attn1.is_cross = false; - params.attn1.cublas_handle=GetCublasHandle(); + params.attn1.cublas_handle = GetCublasHandle(); params.attn2.head_num = params.head_num; params.attn2.head_size = params.head_size; @@ -134,12 +129,17 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.attn2.qkv_bias = !params.attn2.position_bias; params.attn2.projection_bias = !params.attn2.position_bias; params.attn2.is_cross = true; - params.attn2.cublas_handle=GetCublasHandle(); - + params.attn2.cublas_handle = GetCublasHandle(); + params.is_act = !params.attn1.position_bias; 
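A condensed sketch of the flag wiring in DecoderTensorRT::AddInnerOp, assuming decoderParamT carries the fields the delegate code here assigns (DeriveT5Flags is an illustrative helper name, not a function in the patch): position-bias (T5-style) layers drop the QKV/projection biases, the layernorm betas, and the FFN activation.

    // Sketch only: every bias/beta/activation flag keys off position_bias.
    static void DeriveT5Flags(fastertransformer::decoderParamT *p) {
      p->attn1.qkv_bias = p->attn1.projection_bias = !p->attn1.position_bias;
      p->attn2.qkv_bias = p->attn2.projection_bias = !p->attn2.position_bias;
      const bool t5_style = p->attn1.position_bias;  // position bias marks a T5 layer
      p->is_act = !t5_style;    // FFN mapping runs without an activation for T5
      p->has_beta = !t5_style;  // T5LayerNorm has gamma only
      p->has_bias = !t5_style;
      p->ffn_bias = !t5_style;
    }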
+ params.has_beta = !params.attn1.position_bias; + params.has_bias = !params.attn1.position_bias; + params.ffn_bias = !params.attn1.position_bias; + std::cout << "params.attn1.position_bias: " << params.attn1.position_bias + << "params.attn2.position_bias: " << params.attn2.position_bias << std::endl; auto compute_type = runtime_->GetRuntimePrecisionMode(); if (is_ffn_fp16_) { - size_t start_fp16 = (params.attn1.position_bias)? C14NUM : C18NUM; - size_t end_fp16 = (params.attn1.position_bias)? C17NUM :C22NUM; + size_t start_fp16 = (params.attn1.position_bias) ? C13NUM : C18NUM; + size_t end_fp16 = (params.attn1.position_bias) ? C16NUM : C22NUM; for (size_t i = 0; i < in_tensors_.size(); i++) { auto in_tensor = input(ctx, i); @@ -155,7 +155,8 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { } } nvinfer1::ITensor *input_tensor = input(ctx, 0).trt_tensor_; - auto plugin = std::make_shared(input_tensor->getName(), compute_type, params, GetCublasLtHandle(), device_id_); + auto plugin = + std::make_shared(input_tensor->getName(), compute_type, params, GetCublasLtHandle(), device_id_); const int input_number = inputs().size(); nvinfer1::ITensor *inputTensors[input_number]; for (int i = 0; i < input_number; i++) { @@ -201,14 +202,14 @@ int DecoderPlugin::RunCudaDecoder(const nvinfer1::PluginTensorDesc *inputDesc, params_.attn1.algo = algoId; params_.attn2.stream = stream; params_.attn2.algo = algoId; - void *inputs_forward[num_of_inputs_]; - for (int i=0; i < num_of_inputs_; i++){ - inputs_forward[i]=const_cast(inputs[i]); + void *inputs_forward[num_of_inputs_]; + for (int i = 0; i < num_of_inputs_; i++) { + inputs_forward[i] = const_cast(inputs[i]); } void *outputs_forward[] = {outputs[0]}; - fastertransformer::forwardDecoder(inputs_forward, num_of_inputs_, outputs_forward, num_of_outputs_, ¶ms_, - workspace); -return RET_OK; + fastertransformer::forwardDecoder(inputs_forward, num_of_inputs_, outputs_forward, num_of_outputs_, ¶ms_, + workspace); + return RET_OK; } bool DecoderPlugin::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *tensorsDesc, int nbInputs, @@ -222,7 +223,7 @@ bool DecoderPlugin::supportsFormatCombination(int pos, const nvinfer1::PluginTen } void DecoderPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *out, int nbOutputs) noexcept { + const nvinfer1::DynamicPluginTensorDesc *out, int nbOutputs) noexcept { const int request_batch_size = static_cast(in[0].desc.dims.d[0]); const int request_src_seq_len = static_cast(in[0].desc.dims.d[1]); const int request_tgt_seq_len = request_src_seq_len; @@ -240,7 +241,6 @@ void DecoderPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, } size_t DecoderPlugin::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const noexcept { - if (compute_type_ == RuntimePrecisionMode_FP16) { return fastertransformer::GetDecoderLayerWorkspaceSize(¶ms_); } else { @@ -276,11 +276,13 @@ nvinfer1::IPluginV2DynamicExt *DecoderPlugin::clone() const noexcept { return plugin; } -size_t DecoderPlugin::getSerializationSize() const noexcept { return sizeof(int) + sizeof(fastertransformer::decoderParamT); } +size_t DecoderPlugin::getSerializationSize() const noexcept { + return sizeof(int) + sizeof(fastertransformer::decoderParamT); +} void DecoderPlugin::serialize(void *buffer) const noexcept { SerializeValue(&buffer, &compute_type_, sizeof(int)); SerializeValue(&buffer, 
¶ms_, sizeof(fastertransformer::decoderParamT)); } REGISTER_TENSORRT_CREATOR(ops::kNameDecoderLayer, DecoderTensorRT) -} // namespace mindspore::lite \ No newline at end of file +} // namespace mindspore::lite diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc index bf7c69a03df..740c03f9b33 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc @@ -111,10 +111,11 @@ int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.attn.head_num = encoder_op->get_head_num(); params.attn.head_size = encoder_op->get_head_size(); params.attn.cublas_handle = GetCublasHandle(); - params.attn.projection_bias = !params.attn.position_bias; params.attn.hidden_size = params.head_num * params.head_size; params.attn.is_cross = false; params.attn.position_bias = encoder_op->get_position_bias(); + params.attn.projection_bias = !params.attn.position_bias; + std::cout << "params.attn.position_bias" << params.attn.position_bias << std::endl; params.attn.qkv_bias = !params.attn.position_bias; params.has_beta = !params.attn.position_bias; params.has_bias = !params.attn.position_bias; @@ -125,8 +126,8 @@ int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { size_t start_fp16 = (params.layernorm_post) ? C7NUM : C9NUM; size_t end_fp16 = (params.layernorm_post) ? C11NUM : C13NUM; if (params.attn.position_bias) { - start_fp16 = C5NUM; - end_fp16 = C8NUM; + start_fp16 = C6NUM; + end_fp16 = C9NUM; } for (size_t i = 0; i < in_tensors_.size(); i++) { auto in_tensor = input(ctx, i); @@ -189,8 +190,8 @@ int EncoderPlugin::RunCudaEncoder(const nvinfer1::PluginTensorDesc *inputDesc, params_.attn.stream = stream; params_.attn.algo = algoId; void *inputs_forward[num_of_inputs_]; - for (int i=0; i < num_of_inputs_; i++){ - inputs_forward[i]=const_cast(inputs[i]); + for (int i = 0; i < num_of_inputs_; i++) { + inputs_forward[i] = const_cast(inputs[i]); } void *outputs_forward[] = {outputs[0]}; fastertransformer::forwardEncoder(inputs_forward, num_of_inputs_, outputs_forward, num_of_outputs_, ¶ms_, diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc index 199ac58176c..d2f3a69aded 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc @@ -220,9 +220,9 @@ void MhaPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int size_t MhaPlugin::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const noexcept { if (compute_type_ == RuntimePrecisionMode_FP16) { - return fastertransformer::GetAttnWorkspaceSize(¶ms_); + return fastertransformer::GetAttnWorkspaceSizeByOptAllocator(¶ms_); } else { - return fastertransformer::GetAttnWorkspaceSize(¶ms_); + return fastertransformer::GetAttnWorkspaceSizeByOptAllocator(¶ms_); } } diff --git a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc index ac30ffd3e7c..205a0482d0f 100644 --- a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc @@ -150,25 +150,24 @@ VectorRef DecoderLayerFusion::DefinePatternDecoderLayer(bool post_layernorm = tr auto var1 = 
std::make_shared("var1-reshape"); MS_CHECK_TRUE_RET(var1 != nullptr, {}); auto reshape1 = VectorRef({is_reshape1, hidden_stats_, var1}); - VectorRef attention, attention_cross, add2, tuple2, tuple3, layer_norm3, add3, reshape4, matmul2, tuple4, tuple5; + VectorRef attention, attention_cross, add2, tuple2, tuple3, add3, reshape4, matmul2, tuple4, tuple5; if (is_position_bias) { attention = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), getTuple(post_layernorm, layernorm_fusion, is_position_bias), getTuple(post_layernorm, layernorm_fusion, is_position_bias), weight_attn_qkv_, - weight_attn_o_, position_bias_,mask_}); + weight_attn_o_, position_bias_, mask_}); } else { attention = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), getTuple(post_layernorm, layernorm_fusion, is_position_bias), getTuple(post_layernorm, layernorm_fusion, is_position_bias), weight_attn_qkv_, weight_attn_o_, bias_attn_qkv_, bias_attn_o_, mask_}); } - if (is_position_bias){ - tuple4=attention; + if (is_position_bias) { + tuple4 = attention; } else { - auto is_tuple4 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_item4"); - auto var_tuple4 = std::make_shared("var_tuple4"); - tuple4 = VectorRef({is_tuple4, attention, var_tuple4}); - + auto is_tuple4 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_item4"); + auto var_tuple4 = std::make_shared("var_tuple4"); + tuple4 = VectorRef({is_tuple4, attention, var_tuple4}); } auto is_add2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is_add2"); if (post_layernorm) { @@ -194,15 +193,14 @@ VectorRef DecoderLayerFusion::DefinePatternDecoderLayer(bool post_layernorm = tr weight_attn_cross_o_, position_bias_cross_, cross_mask_}); } else { attention_cross = VectorRef({is_attention_cross_, tuple2, reshape2, reshape2, weight_attn_q_, weight_attn_kv_, - weight_attn_cross_o_,bias_attn_cross_qkv_, bias_attn_cross_o_, cross_mask_}); + weight_attn_cross_o_, bias_attn_cross_qkv_, bias_attn_cross_o_, cross_mask_}); } if (is_position_bias) { - tuple5 =attention_cross; + tuple5 = attention_cross; } else { - - auto is_tuple5 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_item5"); - auto var_tuple5 = std::make_shared("var_tuple5"); - tuple5 = VectorRef({is_tuple5, attention_cross, var_tuple5}); + auto is_tuple5 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_item5"); + auto var_tuple5 = std::make_shared("var_tuple5"); + tuple5 = VectorRef({is_tuple5, attention_cross, var_tuple5}); } auto is_add3 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is_add3"); MS_CHECK_TRUE_RET(is_add2 != nullptr, {}); @@ -577,44 +575,43 @@ CNodePtr DecoderLayerFusion::CreateMaskedDecoderLayerFusionNode(const FuncGraphP MS_CHECK_TRUE_RET(value_node != nullptr, nullptr); std::vector new_node_inputs; if (is_position_bias_) { - new_node_inputs = { value_node, input, gamma1, weight_qkv, position_bias, - input_mask, weight_attn_o, gamma2, encoder_output, weight_attn_q, weight_attn_kv, position_bias_cross, cross_mask, - weight_attn_cross_o, gamma3, weight_m, weight_p - }; -} -else { - new_node_inputs = {value_node, - input, - gamma1, - beta1, - weight_qkv, - bias_attn_qkv, - input_mask, - weight_attn_o, - bias_attn_o, - gamma2, - beta2, - encoder_output, - weight_attn_q, - weight_attn_kv, - bias_attn_cross_qkv, - cross_mask, - weight_attn_cross_o, - bias_attn_cross_o, - gamma3, - 
beta3, - weight_m, - bias_m, - weight_p, - bias_p}; -} -auto new_node = func_graph->NewCNode(new_node_inputs); -MS_CHECK_TRUE_RET(new_node != nullptr, nullptr); -auto old_node = node->cast(); -MS_CHECK_TRUE_RET(old_node->abstract() != nullptr, nullptr); -new_node->set_abstract(old_node->abstract()->Clone()); -new_node->set_fullname_with_scope(node->fullname_with_scope() + "/decoder_layer"); + new_node_inputs = { + value_node, input, gamma1, weight_qkv, input_mask, position_bias, weight_attn_o, + gamma2, encoder_output, weight_attn_q, weight_attn_kv, cross_mask, position_bias_cross, weight_attn_cross_o, + gamma3, weight_m, weight_p}; + } else { + new_node_inputs = {value_node, + input, + gamma1, + beta1, + weight_qkv, + bias_attn_qkv, + input_mask, + weight_attn_o, + bias_attn_o, + gamma2, + beta2, + encoder_output, + weight_attn_q, + weight_attn_kv, + bias_attn_cross_qkv, + cross_mask, + weight_attn_cross_o, + bias_attn_cross_o, + gamma3, + beta3, + weight_m, + bias_m, + weight_p, + bias_p}; + } + auto new_node = func_graph->NewCNode(new_node_inputs); + MS_CHECK_TRUE_RET(new_node != nullptr, nullptr); + auto old_node = node->cast(); + MS_CHECK_TRUE_RET(old_node->abstract() != nullptr, nullptr); + new_node->set_abstract(old_node->abstract()->Clone()); + new_node->set_fullname_with_scope(node->fullname_with_scope() + "/decoder_layer"); -return new_node; + return new_node; +} // namespace mindspore::opt } // namespace mindspore::opt -} // namespace mindspore::opt \ No newline at end of file diff --git a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc index 0bb89fe03a7..15453d4a962 100644 --- a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc @@ -242,7 +242,7 @@ AnfNodePtr EncoderLayerFusion::Process(const std::string &pattern_name, const mi if (func_graph == nullptr || node == nullptr || equiv == nullptr) { return nullptr; } - if (pattern_name == kPatternTEncoderLayerPostNorm || pattern_name == kPatternTEncoderLayerPreNorm) + if (pattern_name == kPatternTEncoderLayerPostNorm || pattern_name == kPatternTEncoderLayerPreNorm) is_layernorm_fusion_ = true; if (pattern_name == kPatternTEncoderLayerPost || pattern_name == kPatternTEncoderLayerPostNorm) { return CreateMaskedEncoderLayerFusionNode(func_graph, equiv, node, true); @@ -391,11 +391,11 @@ CNodePtr EncoderLayerFusion::CreateMaskedEncoderLayerFusionNode(const FuncGraphP if (is_position_bias_) { position_bias = utils::cast((*equiv)[position_bias_]); if (!post_layernorm) - new_node_inputs = {value_node, input, gamma1, weight_qkv, input_mask, - weight_attn_o, gamma2, weight_m, weight_p, position_bias}; + new_node_inputs = {value_node, input, gamma1, weight_qkv, input_mask, + position_bias, weight_attn_o, gamma2, weight_m, weight_p}; else - new_node_inputs = {value_node, input, weight_qkv, input_mask, weight_attn_o, - gamma1, weight_m, weight_p, gamma2, position_bias}; + new_node_inputs = {value_node, input, weight_qkv, input_mask, position_bias, + weight_attn_o, gamma1, weight_m, weight_p, gamma2}; } else { if (!post_layernorm) { new_node_inputs = {value_node, input, gamma1, beta1, weight_qkv, bias_attn_qkv, input_mask, weight_attn_o, diff --git a/trc/transformer/MultiHeadTester.py b/trc/transformer/MultiHeadTester.py index 97feeecc433..d9fce7a3bf1 100755 --- a/trc/transformer/MultiHeadTester.py +++ b/trc/transformer/MultiHeadTester.py @@ -858,9 +858,9 @@ class FeedForwardX(Cell): 
param_init_type=mstype.float32, parallel_config=default_dpmp_config): super(FeedForwardX, self).__init__() - if hidden_act is None or not (isinstance(hidden_act, str) or issubclass(hidden_act, nn.Cell)): - raise TypeError(f"For FeedForward cell, the hidden_act should str type or nn.Cell type, " - f"but got {hidden_act}.") + # if hidden_act is None or not (isinstance(hidden_act, str) or issubclass(hidden_act, nn.Cell)): + # raise TypeError(f"For FeedForward cell, the hidden_act should str type or nn.Cell type, " + # f"but got {hidden_act}.") if _get_parallel_mode() in (ParallelMode.AUTO_PARALLEL,) and _is_sharding_propagation(): _check_config(parallel_config) mp = parallel_config.model_parallel diff --git a/trc/transformer/T5/transformer.py b/trc/transformer/T5/transformer.py index cd0ee0e66f4..f762c39ee8a 100644 --- a/trc/transformer/T5/transformer.py +++ b/trc/transformer/T5/transformer.py @@ -407,7 +407,7 @@ class FeedForward(Cell): @_args_type_validator_check(hidden_size=Validator.check_positive_int, ffn_hidden_size=Validator.check_positive_int, dropout_rate=Validator.check_non_negative_float, - hidden_act=_valid_type_checks([str], "FeedForward"), + # hidden_act=_valid_type_checks([str], "FeedForward"), param_init_type=_valid_value_checks([mstype.float32, mstype.float16], "FeedForward"), parallel_config=_valid_type_checks([OpParallelConfig], @@ -444,7 +444,7 @@ class FeedForward(Cell): self.mapping = _Linear(in_channels=input_size, out_channels=output_size, has_bias=has_bias, - activation=None, + activation=hidden_act, transpose_b=False, expert_num=expert_num, param_init_type=param_init_type) @@ -804,7 +804,8 @@ class MultiHeadAttention(Cell): use_past=False, is_decoder=False, has_relative_attention_bias=False, - parallel_config=default_dpmp_config): + parallel_config=default_dpmp_config, + num_outputs=1): super(MultiHeadAttention, self).__init__() _check_config(parallel_config) self.is_parallel_mode = _get_parallel_mode() in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL) @@ -812,6 +813,7 @@ class MultiHeadAttention(Cell): self.tgt_seq_length = tgt_seq_length self.hidden_size = hidden_size self.batch_size = batch_size + self.num_outputs = num_outputs if hidden_dropout_rate < 0 or hidden_dropout_rate >= 1: raise ValueError("For 'MultiHeadAttention', the class variable 'hidden_dropout_rate' must be " "in range [0, 1.0), but got the value : {}.".format(hidden_dropout_rate)) @@ -1016,11 +1018,11 @@ class MultiHeadAttention(Cell): output = self.projection(attention) output = self.dropout(output) output = F.reshape(output, ori_shape) - # if self.app=="trc": + if self.num_outputs==1: + return output return output, layer_present, position_bias # else: - # return output - # return output + # return output def _check_inputs(self, query_tensor, key_tensor, value_tensor, attention_mask, key_past=None, value_past=None, batch_valid_length=None): @@ -1321,7 +1323,7 @@ class TransformerEncoderLayer(Cell): seq_length=Validator.check_positive_int, attention_dropout_rate=Validator.check_non_negative_float, hidden_dropout_rate=Validator.check_non_negative_float, - hidden_act=_valid_type_checks([str], "TransformerEncoderLayer"), + # hidden_act=_valid_type_checks([str], "TransformerEncoderLayer"), post_layernorm_residual=Validator.check_bool, layernorm_compute_type=_valid_value_checks([mstype.float32, mstype.float16], "TransformerEncoderLayer"), @@ -1387,7 +1389,8 @@ class TransformerEncoderLayer(Cell): use_past=use_past, is_decoder=False, 
has_relative_attention_bias=has_relative_attention_bias, - parallel_config=parallel_config) + parallel_config=parallel_config, + num_outputs=3) _check_moe_config(moe_config, parallel_config) self.use_moe = (moe_config.expert_num > 1) if self.use_moe is True: @@ -1464,6 +1467,7 @@ class TransformerEncoderLayer(Cell): mlp_logit, aux_loss = self.output(output_x) else: mlp_logit = self.output(output_x) + # return mlp_logit value_update = None key_update = None @@ -1627,7 +1631,7 @@ class TransformerDecoderLayer(Cell): tgt_seq_length=Validator.check_positive_int, attention_dropout_rate=Validator.check_non_negative_float, hidden_dropout_rate=Validator.check_non_negative_float, - hidden_act=_valid_type_checks([str], "TransformerDecoderLayer"), + # hidden_act=_valid_type_checks([str], "TransformerDecoderLayer"), post_layernorm_residual=Validator.check_bool, layernorm_compute_type=_valid_value_checks([mstype.float32, mstype.float16], "TransformerDecoderLayer"), @@ -1702,7 +1706,8 @@ class TransformerDecoderLayer(Cell): param_init_type=param_init_type, is_decoder=True, has_relative_attention_bias=has_relative_attention_bias, - parallel_config=parallel_config) + parallel_config=parallel_config, + num_outputs=3) # Cross attention with the output of encoder as memory tensor self.cross_attention = MultiHeadAttention(hidden_size=hidden_size, num_heads=num_heads, @@ -1717,7 +1722,8 @@ class TransformerDecoderLayer(Cell): param_init_type=param_init_type, is_decoder=True, has_relative_attention_bias=has_relative_attention_bias, - parallel_config=parallel_config) + parallel_config=parallel_config, + num_outputs=3) self.cross_attention_layernorm = T5LayerNorm((hidden_size,)).to_float(layernorm_compute_type) self.cross_attention_layernorm.shard(((parallel_config.data_parallel, 1),)) @@ -1788,6 +1794,7 @@ class TransformerDecoderLayer(Cell): attention, layer_present, position_bias = self.attention(input_x, input_x, input_x, decoder_mask, position_bias, self.key_past, self.value_past, batch_valid_length) + # return attention # For post-layernorm the inputs for residual path are output of self-attention and output of layernorm if self.post_layernorm_residual: x = self.add(input_x, attention) diff --git a/trc/transformer/cfg_bert.config b/trc/transformer/cfg_bert.config index 383ce7641ff..370f5e1aba9 100755 --- a/trc/transformer/cfg_bert.config +++ b/trc/transformer/cfg_bert.config @@ -1,2 +1,2 @@ [gpu_context] -input_shape=input_ids:[transformer_decoder_layer_t5,128];token_type_ids:[transformer_decoder_layer_t5,128];input_mask:[transformer_decoder_layer_t5,128] +input_shape=input_ids:[transformer_encoder_layer_t5,128];token_type_ids:[transformer_encoder_layer_t5,128];input_mask:[transformer_encoder_layer_t5,128] diff --git a/trc/transformer/deploy.sh b/trc/transformer/deploy.sh index 6a3cda3ef60..728b521ac2d 100755 --- a/trc/transformer/deploy.sh +++ b/trc/transformer/deploy.sh @@ -5,7 +5,7 @@ version=$(cat ${base}/version.txt) system=${base}/trc/system_test/release/ubuntu_x86/mindspore-lite-${version}-linux-x64 benchmark=${system}/tools/benchmark/benchmark server=caspi -gpu_id=2 +gpu_id=5 # move files to caspi model=${1%.mindir} model=${model#convv_} diff --git a/trc/transformer/ftBench.py b/trc/transformer/ftBench.py index 8e80cbdf911..a4a5c5f3b85 100755 --- a/trc/transformer/ftBench.py +++ b/trc/transformer/ftBench.py @@ -13,13 +13,13 @@ system = f'{base}/trc/system_test/release/ubuntu_x86/mindspore-lite-{version}-li benchmark = f'{system}/tools/benchmark' work_dir=f'{base}/trc/transformer' image = 
"private_transformer:0.1" -server = "10.10.10.174" +server = "caspi" enable_fp16 = "false" suffix="fp32" usage='enter the correct parameters: app=ch\\trc, act=runtime\\be, loop count=int>=0, server=local\\num of server\nif app=trc and act=be loop count must be 1' app='ch' act='be' -cuda_visible_dev=2 +cuda_visible_dev=5 loop_count=1 if len(sys.argv)>2 or len(sys.argv)==1: parameters=sys.argv[1:] diff --git a/trc/transformer/models.txt b/trc/transformer/models.txt index aac74d7d54d..553579330bd 100755 --- a/trc/transformer/models.txt +++ b/trc/transformer/models.txt @@ -1,9 +1,24 @@ +#not work +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_encoder_layer_t5 +# +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer_t5 +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer_t5 + +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_encoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer_t5 +# +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer_t5 +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5 + #run the following tests before push + #-b 1 -l 66 -s 128 -H 12 -S 768 -p 0 -m mha_x1 #-b 1 -l 66 -s 128 -t 256 -H 12 -S 768 -p 0 -m mha_cross #-b 1 -l 66 -s 20 -t 20 -H 3 -S 15 -p 0 -m mha_cross #-b 1 -l 66 -s 20 -H 4 -S 768 -p 0 -m mha_T5 #-b 1 -l 66 -s 20 -t 40 -H 4 -S 768 -p 0 -m mha_T5_cross + #-b 1 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer #-b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer #-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer @@ -11,8 +26,18 @@ #-b 32 -l 12 -H 12 -S 768 -s 128 -P 0 -f 3072 -m bert #-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -f 3072 -m bert +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_encoder_layer_t5 --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5 + +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer_t5 + +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer #-b 1 -l 66 -s 20 -H 3 -S 15 -p 0 -m mha_x1 #-b 1 -l 24 -H 16 -S 1024 -s 128 -P 1 -m bert @@ -21,9 +46,11 @@ #-b 32 -l 24 -H 16 -S 1024 -s 128 -P 1 -m bert #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer # #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_encoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_encoder_layer #-b 1 -l 66 -s 1 -H 8 -S 512 -p 0 -m mha_x1 #-b 3 -l 66 -s 20 -H 3 -S 15 -p -m mha_x2 diff --git a/trc/transformer/t.config b/trc/transformer/t.config index 0ecc92cc5ec..d719bfb8f65 100755 --- 
a/trc/transformer/t.config +++ b/trc/transformer/t.config @@ -1,4 +1,4 @@ [registry] #fusion_blacklists="MultiHeadAttentionFusion" -#fusion_blacklists="EncoderLayerFusion" +fusion_blacklists="EncoderLayerFusion" #fusion_blacklists="DecoderLayerFusion" diff --git a/trc/transformer/train_transformer_export.py b/trc/transformer/train_transformer_export.py index 2b8dcd72713..f5815623846 100755 --- a/trc/transformer/train_transformer_export.py +++ b/trc/transformer/train_transformer_export.py @@ -218,7 +218,7 @@ def read_args(): else: if sys.argv[i + 1]=='0': ffn_fp16 = False - else: + elif sys.argv[i + 1]=='1': ffn_fp16 = True size_per_head=hid_size//head_num tgt_seq_len = tgt_seq_len if (tgt_seq_len != -1) else seq @@ -345,10 +345,10 @@ def transformer_encoder_layer_t5_create(): if (post_layernorm): print("post_layernorm") model = T5_TF.TransformerEncoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, seq_length=seq, - num_heads=head_num, post_layernorm_residual=True, has_bias=False) + num_heads=head_num, post_layernorm_residual=True, has_bias=False, hidden_act=None) else: model = T5_TF.TransformerEncoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, seq_length=seq, - num_heads=head_num, has_bias=False) + num_heads=head_num, has_bias=False, hidden_act=None) encoder_input_value = M.Tensor(np.random.normal(0., 0.5, (batch, seq, hid_size)), M.float32) encoder_input_mask = M.Tensor(np.random.normal(0., 0.5, (batch, seq, seq)), M.float32) pos = M.Tensor(np.random.normal(0., 0.5, (batch, head_num, seq, tgt_seq_len)), M.float32) @@ -395,7 +395,7 @@ def transformer_encoder_layer_t5_create(): saveCalib(out_name, np.array(y), f_y) print("y.shape",np.array(y).shape) # saveCalib('Default/Add-op267', np.array(y), f_y)#2 dims - + f_y.close() # elif app=="trc": saveT(y, name + "_output1.fp" + suffix) @@ -405,11 +405,11 @@ def transformer_decoder_layer_t5_create(): if (post_layernorm): print("post_layernorm true") model = T5_TF.TransformerDecoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, - tgt_seq_length=tgt_seq_len,num_heads=head_num, post_layernorm_residual=True, use_past=False, has_bias=False) + tgt_seq_length=tgt_seq_len,num_heads=head_num, post_layernorm_residual=True, use_past=False, has_bias=False, hidden_act=None) else: print("post_layernorm false") model = T5_TF.TransformerDecoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, - tgt_seq_length=tgt_seq_len,num_heads=head_num,use_past=False, has_bias=False) + tgt_seq_length=tgt_seq_len,num_heads=head_num,use_past=False, has_bias=False, hidden_act=None) hidden_stats = M.Tensor(np.random.normal(0., 0.5, (batch, tgt_seq_len, hid_size)), M.float32) decoder_mask = M.Tensor(np.random.normal(0., 0.5, (batch, seq, seq)), M.float32) encoder_output = M.Tensor(np.random.normal(0., 0.5, (batch, seq, hid_size)), M.float32) @@ -438,27 +438,22 @@ def transformer_decoder_layer_t5_create(): opw = model.output.projection.weight gl1 = model.layernorm1.weight - # bl1 = model.layernorm1.beta gl2 = model.layernorm2.weight - # bl2 = model.layernorm2.beta gl3 = model.cross_attention_layernorm.weight - # bl3 = model.cross_attention_layernorm.beta suffix = str(compute_type) suffix = suffix[-2:] saveT(gl1, name + "_weight1.fp" + suffix) - # saveT(bl1, name + "_weight2.fp" + suffix) saveT(wt, name + "_weight2.fp" + suffix) saveT(wp, name + "_weight3.fp" + suffix) saveT(gl2, name + "_weight4.fp" + suffix) - # 
saveT(bl2, name + "_weight8.fp" + suffix) saveT(qt2, name + "_weight5.fp" + suffix) saveT(wt2, name + "_weight6.fp" + suffix) saveT(wp2, name + "_weight7.fp" + suffix) saveT(gl3, name + "_weight8.fp" + suffix) if(ffn_fp16): - saveT(omw, name + "_weight9.fp" + "16") - saveT(opw, name + "_weight10.fp" + "16") + saveTensorToHalf(omw, name + "_weight9.fp" + "16") + saveTensorToHalf(opw, name + "_weight10.fp" + "16") else: saveT(omw, name + "_weight9.fp" + suffix) saveT(opw, name + "_weight10.fp" + suffix) -- Gitee From 1ba1637358ec5977d67709213d3c7eeddf34aaf4 Mon Sep 17 00:00:00 2001 From: shira zaloshinki Date: Mon, 9 Jan 2023 17:59:46 +0200 Subject: [PATCH 18/39] get eps from layernorm --- .../delegate/tensorrt/op/decoder_tensorrt.cc | 2 +- .../optimizer/fusion/decoder_layer_fusion.cc | 393 +++++++----------- .../optimizer/fusion/decoder_layer_fusion.h | 9 +- .../optimizer/fusion/encoder_layer_fusion.cc | 43 +- .../optimizer/fusion/encoder_layer_fusion.h | 5 +- trc/transformer/cfg_bert.config | 2 +- trc/transformer/models.txt | 24 +- 7 files changed, 202 insertions(+), 276 deletions(-) diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc index d76cab1d512..e24a2b30522 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc @@ -285,4 +285,4 @@ void DecoderPlugin::serialize(void *buffer) const noexcept { SerializeValue(&buffer, ¶ms_, sizeof(fastertransformer::decoderParamT)); } REGISTER_TENSORRT_CREATOR(ops::kNameDecoderLayer, DecoderTensorRT) -} // namespace mindspore::lite \ No newline at end of file +} // namespace mindspore::lite diff --git a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc index 19b50030c05..96afb3dcb65 100644 --- a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc @@ -132,8 +132,6 @@ VectorRef DecoderLayerFusion::DefineLayerNorm(VectorRef input, VarPtr gamma, Var auto is_reduce = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReduceFusion), "reduce"); MS_CHECK_TRUE_RET(is_reduce != nullptr, {}); auto reduce = VectorRef({is_reduce, sqr, var1}); - // auto var2 = std::make_shared("var3"); - // MS_CHECK_TRUE_RET(var2 != nullptr, {}); auto is_add = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is-add"); MS_CHECK_TRUE_RET(is_add != nullptr, {}); auto add = VectorRef({is_add, reduce, eps}); @@ -156,8 +154,7 @@ VectorRef DecoderLayerFusion::DefinePatternDecoderLayer(bool post_layernorm = tr auto var1 = std::make_shared("var1-reshape"); MS_CHECK_TRUE_RET(var1 != nullptr, {}); auto reshape1 = VectorRef({is_reshape1, hidden_stats_, var1}); - return getTuple(post_layernorm, layernorm_fusion, is_position_bias); - VectorRef attention, attention_cross, add2, tuple2, tuple3, add3, reshape4, matmul2, tuple4, tuple5; + VectorRef attention, attention_cross, tuple2, tuple3, matmul2, tuple4, tuple5; if (is_position_bias) { attention = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), getTuple(post_layernorm, layernorm_fusion, is_position_bias), @@ -177,11 +174,9 @@ VectorRef DecoderLayerFusion::DefinePatternDecoderLayer(bool post_layernorm = tr tuple4 = VectorRef({is_tuple4, attention, var_tuple4}); } auto is_add2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), 
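  // (Explanatory comment, not part of the original patch.) The add2 pattern built
  // just below is the residual connection around the self-attention match: its
  // first operand depends on whether the model is post- or pre-layernorm, while
  // the second operand (tuple4) comes from the matched attention output.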
"is_add2"); - if (post_layernorm) { - add2 = VectorRef({is_add2, getTuple(post_layernorm, layernorm_fusion, is_position_bias), tuple4}); - } else { - add2 = VectorRef({is_add2, reshape1, tuple4}); - } + auto add2 = (post_layernorm) + ? VectorRef({is_add2, getTuple(post_layernorm, layernorm_fusion, is_position_bias), tuple4}) + : VectorRef({is_add2, reshape1, tuple4}); if (layernorm_fusion) { auto layer_norm2 = VectorRef({is_layernorm2_, add2, gamma2_, beta2_}); auto is_tuple2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_item2"); @@ -211,12 +206,7 @@ VectorRef DecoderLayerFusion::DefinePatternDecoderLayer(bool post_layernorm = tr } auto is_add3 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is_add3"); MS_CHECK_TRUE_RET(is_add2 != nullptr, {}); - if (post_layernorm) { - add3 = VectorRef({is_add3, tuple2, tuple5}); - } else { - add3 = VectorRef({is_add3, add2, tuple5}); - } - // return add3; + auto add3 = (post_layernorm) ? VectorRef({is_add3, tuple2, tuple5}) : VectorRef({is_add3, add2, tuple5}); if (layernorm_fusion) { auto layer_norm3 = VectorRef({is_layernorm3_, add3, gamma3_, beta3_}); auto is_tuple3 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_item3"); @@ -235,7 +225,6 @@ VectorRef DecoderLayerFusion::DefinePatternDecoderLayer(bool post_layernorm = tr matmul2 = VectorRef({is_matmul2, act, weight_p_, bias_p_}); } else { auto matmul1 = VectorRef({is_matmul1, tuple3, weight_m_}); - auto act = VectorRef({is_act_, matmul1}); matmul2 = VectorRef({is_matmul2, matmul1, weight_p_}); } auto is_reshape3 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-decoder3"); @@ -247,39 +236,10 @@ VectorRef DecoderLayerFusion::DefinePatternDecoderLayer(bool post_layernorm = tr MS_CHECK_TRUE_RET(is_reshape4 != nullptr, {}); auto var4 = std::make_shared("var4"); MS_CHECK_TRUE_RET(var4 != nullptr, {}); - if (post_layernorm) { - reshape4 = VectorRef({is_reshape4, tuple3, var4}); - } else { - reshape4 = VectorRef({is_reshape4, add3, var4}); - } + auto reshape4 = (post_layernorm) ? 
VectorRef({is_reshape4, tuple3, var4}) : VectorRef({is_reshape4, add3, var4}); auto is_add4 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is_add4"); auto add4 = VectorRef({is_add4, reshape4, reshape3}); - // if (!post_layernorm || layernorm_fusion) { - // return add4; - // } return add4; - // auto is_reshape4 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-decoder"); - // MS_CHECK_TRUE_RET(is_reshape4 != nullptr, {}); - // auto var4 = std::make_shared("var4"); - // MS_CHECK_TRUE_RET(var4 != nullptr, {}); - // auto reshape4 = VectorRef({is_reshape4, add3, var4}); - // VectorRef layer_norm, tuple; - // if (layernorm_fusion) { - // layer_norm = DefineLayerNorm(reshape4, gamma1_, beta1_); - // tuple = layer_norm; - // } else { - // layer_norm = VectorRef({is_layernorm1_, reshape4, gamma1_, beta1_}); - - // auto is_tuple = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_itme"); - // auto var_tuple = std::make_shared("var_tuple"); - // tuple = VectorRef({is_tuple, layer_norm, var_tuple}); - // } - // auto is_reshape5 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-decoder"); - // MS_CHECK_TRUE_RET(is_reshape5 != nullptr, {}); - // auto var5 = std::make_shared("var5"); - // MS_CHECK_TRUE_RET(var5 != nullptr, {}); - // auto reshape5 = VectorRef({is_reshape5, tuple, var5}); - // return reshape5; } std::unordered_map DecoderLayerFusion::DefinePatterns() const { @@ -300,7 +260,6 @@ AnfNodePtr DecoderLayerFusion::Process(const std::string &pattern_name, const mi if (func_graph == nullptr || node == nullptr || equiv == nullptr) { return nullptr; } - std::cout << "found pattern=" << pattern_name << std::endl; if (pattern_name == kPatternDecoderT5Pre || pattern_name == kPatternDecoderT5Post) { is_position_bias_ = true; } else if (pattern_name == kPatternDecoderLayerPre || pattern_name == kPatternDecoderLayerPost) { @@ -313,6 +272,7 @@ AnfNodePtr DecoderLayerFusion::Process(const std::string &pattern_name, const mi } return nullptr; } + bool DecoderLayerFusion::IsActGELU(const FuncGraphPtr &func_graph, const EquivPtr &equiv) const { auto act_input = GetAttribute(func_graph, equiv, is_act_); MS_ASSERT(act_input != nullptr); @@ -325,6 +285,7 @@ bool DecoderLayerFusion::IsActGELU(const FuncGraphPtr &func_graph, const EquivPt } return true; } + AnfNodePtr DecoderLayerFusion::GetAttribute(const FuncGraphPtr &func_graph, const EquivPtr &equiv, VarPtr node_name) const { if ((*equiv)[node_name] == nullptr || !utils::isa((*equiv)[node_name])) { @@ -353,73 +314,51 @@ AnfNodePtr DecoderLayerFusion::GetAttribute(const FuncGraphPtr &func_graph, cons return input; } -STATUS GetIntParameterData(const ParameterPtr ¶m_ptr, std::vector *result) { - if (param_ptr == nullptr || !param_ptr->has_default()) { - MS_LOG(DEBUG) << "param not have default"; - return RET_ERROR; - } - auto default_param = param_ptr->default_param(); - if (default_param == nullptr || !utils::isa(default_param)) { - MS_LOG(DEBUG) << "tensor_info is not tensor::TensorPtr"; - return RET_ERROR; - } - auto default_param_ptr = utils::cast(default_param); - if (default_param_ptr->data_type() != kNumberTypeInt32 && default_param_ptr->data_type() != kNumberTypeInt) { - MS_LOG(DEBUG) << "default param is not int"; +STATUS DecoderLayerFusion::GetEps(const EquivPtr &equiv, VarPtr node_name, float *eps) const{ + if ((*equiv)[node_name] == nullptr || !utils::isa((*equiv)[node_name])) { + MS_LOG(ERROR) << node_name << " is not anfnodeptr"; return RET_ERROR; } - auto 
ptr = reinterpret_cast(default_param_ptr->data_c()); - int64_t shape_size = - std::accumulate(default_param_ptr->shape().begin(), default_param_ptr->shape().end(), 1, std::multiplies<>()); - for (int64_t i = 0; i < shape_size; i++) { - result->emplace_back(ptr[i]); - } - return RET_OK; -} - -STATUS GetAxis(const ValueNodePtr n, std::vector *axes) { - if (utils::isa(n)) { - auto axes_value_node = utils::cast(n); - *axes = CastToFloat(axes_value_node->value()); - std::cout << "eps=" << *axes << std::endl; - return lite::RET_OK; - } else { - // auto reshape = utils::cast(n); - // if (reshape != nullptr) { - // if (GetIntParameterData(reshape, axes) == lite::RET_OK) { - // return lite::RET_OK; - // } - // } + AnfNodePtr node = utils::cast((*equiv)[node_name]); + MS_ASSERT(node != nullptr); + if (utils::isa(node)) { + auto value_ptr_node = utils::cast(node); + auto value_node = utils::cast(value_ptr_node->value()); + if (value_node->isa()) { + auto tensor = value_node->cast(); + MS_EXCEPTION_IF_NULL(tensor); + *eps = *reinterpret_cast(tensor->data().data()); + return RET_OK; + } } - MS_LOG(ERROR) << " cannot get axes data"; - return lite::RET_ERROR; + return RET_ERROR; } STATUS DecoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const EquivPtr &equiv, int *head_num, int *head_size, float *eps1, float *eps2, float *eps3, bool *is_position_bias1, bool *is_position_bias2) const { - // auto attn_input = GetAttribute(func_graph, equiv, is_attention_); - // MS_ASSERT(attn_input != nullptr); - // auto attn_prim = ops::GetOperator(attn_input); - // if (attn_prim->GetAttr(ops::kEncoderLayerNumHeads) != nullptr) { - // *head_num = attn_prim->get_head_num(); - // } - // if (attn_prim->GetAttr(ops::kAttentionSizePerHead) != nullptr) { - // *head_size = attn_prim->get_head_size(); - // } - // if (attn_prim->GetAttr(ops::kPositionBias) != nullptr) { - // *is_position_bias1 = attn_prim->get_position_bias(); - // } - // if ((*equiv)[is_attention_] == nullptr || !utils::isa((*equiv)[is_attention_])) { - // MS_LOG(ERROR) << "is_attention_ is not AnfNodePtr"; - // return RET_ERROR; - // } - // auto attn_cross_input = GetAttribute(func_graph, equiv, is_attention_cross_); - // MS_ASSERT(attn_cross_input != nullptr); - // auto attn_cross_prim = ops::GetOperator(attn_cross_input); - // if (attn_cross_prim->GetAttr(ops::kPositionBias) != nullptr) { - // *is_position_bias2 = attn_cross_prim->get_position_bias(); - // } + auto attn_input = GetAttribute(func_graph, equiv, is_attention_); + MS_ASSERT(attn_input != nullptr); + auto attn_prim = ops::GetOperator(attn_input); + if (attn_prim->GetAttr(ops::kEncoderLayerNumHeads) != nullptr) { + *head_num = attn_prim->get_head_num(); + } + if (attn_prim->GetAttr(ops::kAttentionSizePerHead) != nullptr) { + *head_size = attn_prim->get_head_size(); + } + if (attn_prim->GetAttr(ops::kPositionBias) != nullptr) { + *is_position_bias1 = attn_prim->get_position_bias(); + } + if ((*equiv)[is_attention_] == nullptr || !utils::isa((*equiv)[is_attention_])) { + MS_LOG(ERROR) << "is_attention_ is not AnfNodePtr"; + return RET_ERROR; + } + auto attn_cross_input = GetAttribute(func_graph, equiv, is_attention_cross_); + MS_ASSERT(attn_cross_input != nullptr); + auto attn_cross_prim = ops::GetOperator(attn_cross_input); + if (attn_cross_prim->GetAttr(ops::kPositionBias) != nullptr) { + *is_position_bias2 = attn_cross_prim->get_position_bias(); + } if (is_layernorm_fusion_) { auto layrn1_input = GetAttribute(func_graph, equiv, is_layernorm1_); auto layrn1_prim = 
ops::GetOperator(layrn1_input); @@ -437,70 +376,23 @@ STATUS DecoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const Eq *eps3 = layrn3_prim->get_epsilon(); } } else { - // auto eps1_value_ptr = utils::cast(GetAttribute(func_graph, equiv, eps1_)); - std::vector epss; - std::cout << eps1_->ToString() << std::endl; - MS_ASSERT(axes != nullptr); - if ((*equiv)[eps1_] == nullptr || !utils::isa((*equiv)[eps1_])) { - std::cout << "is not AnfNodePtr1"; + if (GetEps(equiv, eps1_, eps1) != RET_OK) { + MS_LOG(ERROR) << "not found eps1"; return RET_ERROR; } - AnfNodePtr node = utils::cast((*equiv)[eps1_]); - MS_ASSERT(node != nullptr); - if (node == nullptr || !utils::isa(node)) { - auto manager = func_graph->manager(); - if (manager == nullptr) { - std::cout << "is not AnfNodePtr2"; - - return RET_ERROR; - } - auto users = manager->node_users(); - auto it = users.find(node); - if (it != users.end()) { - node = it->second.front().first; - } - if (node == nullptr) { // || !utils::isa(node)) { - std::cout << "is not AnfNodePtr3"; - return RET_ERROR; - } - if (node->isa()) { - auto value_node = node->cast(); - MS_EXCEPTION_IF_NULL(value_node); - auto value = value_node->value(); - MS_EXCEPTION_IF_NULL(value); - auto axes = CastToFloat(value); - std::cout << "eps=" << axes.at(0) << std::endl; - // if (value->isa()) { - // auto tensor = value->cast(); - // MS_EXCEPTION_IF_NULL(tensor); - // if (tensor->is_forward_output()) { - // std::cout<<"tensor"<data().data<(eps1_value_ptr) <(eps1_value_ptr); - // auto eps2_value_ptr = utils::cast(GetAttribute(func_graph, equiv, eps2_)); - // std::cout << GetValue(eps2_value_ptr) <(eps2_value_ptr); - - // auto eps3_value_ptr = utils::cast(GetAttribute(func_graph, equiv, eps3_)); - // *eps3=GetValue(eps3_value_ptr); } if (!is_position_bias_) { if (!IsActGELU(func_graph, equiv)) { - return false; + return RET_ERROR; } } return RET_OK; @@ -522,8 +414,6 @@ std::shared_ptr DecoderLayerFusion::CreatePrim(const FuncGrap bool is_position_bias2 = false; if (CheckPattern(func_graph, equiv, &head_num, &head_size, &eps1, &eps2, &eps3, &is_position_bias1, &is_position_bias2)) { - std::cout << "nullptr"; - return nullptr; } decoder_layer_prim->Init(head_num, head_size, eps1, eps2, eps3, ffn_hidden_size, is_position_bias1, is_position_bias2, @@ -537,93 +427,90 @@ CNodePtr DecoderLayerFusion::CreateMaskedDecoderLayerFusionNode(const FuncGraphP MS_ASSERT(func_graph != nullptr); MS_ASSERT(equiv != nullptr); MS_ASSERT(node != nullptr); - auto decoder_layer_prim = CreatePrim(func_graph, equiv, post_layernorm, 3072); - // auto input = utils::cast((*equiv)[hidden_stats_]); - // MS_ASSERT(input != nullptr); - // auto encoder_output = utils::cast((*equiv)[encoder_output_]); - // MS_ASSERT(encoder_output != nullptr); - // AnfNodePtr position_bias, input_mask, bias_attn_o, bias_attn_qkv, beta1, beta2, bias_m, bias_p, beta3, - // bias_attn_cross_qkv, bias_attn_cross_o, position_bias_cross; - // auto weight_qkv = utils::cast((*equiv)[weight_attn_qkv_]); - // auto weight_attn_o = utils::cast((*equiv)[weight_attn_o_]); - // auto weight_attn_q = utils::cast((*equiv)[weight_attn_q_]); - // auto weight_attn_kv = utils::cast((*equiv)[weight_attn_kv_]); - // auto weight_attn_cross_o = utils::cast((*equiv)[weight_attn_cross_o_]); - // auto weight_m = utils::cast((*equiv)[weight_m_]); - // auto weight_p = utils::cast((*equiv)[weight_p_]); - // if (is_position_bias_) { - // position_bias = utils::cast((*equiv)[position_bias_]); - // position_bias_cross = 
utils::cast((*equiv)[position_bias_cross_]); - // } else { - // bias_attn_o = utils::cast((*equiv)[bias_attn_o_]); - // bias_attn_qkv = utils::cast((*equiv)[bias_attn_qkv_]); - // bias_attn_cross_qkv = utils::cast((*equiv)[bias_attn_cross_qkv_]); - // bias_attn_cross_o = utils::cast((*equiv)[bias_attn_cross_o_]); - // bias_m = utils::cast((*equiv)[bias_m_]); - // bias_p = utils::cast((*equiv)[bias_p_]); - // beta1 = utils::cast((*equiv)[beta1_]); - // beta2 = utils::cast((*equiv)[beta2_]); - // beta3 = utils::cast((*equiv)[beta3_]); - // } - // auto gamma1 = utils::cast((*equiv)[gamma1_]); - // auto gamma2 = utils::cast((*equiv)[gamma2_]); - // auto gamma3 = utils::cast((*equiv)[gamma3_]); - // input_mask = utils::cast((*equiv)[mask_]); - // auto cross_mask = utils::cast((*equiv)[cross_mask_]); - // auto base_shape_ptr = weight_m->Shape(); - // MS_EXCEPTION_IF_NULL(base_shape_ptr); - // auto input_shape_ptr = base_shape_ptr->cast(); - // MS_EXCEPTION_IF_NULL(input_shape_ptr); - // auto input_shape = input_shape_ptr->shape(); - // MS_ASSERT(input_shape != nullptr); - // // int ffn_hidden_size = (int64_t)input_shape[1]; - // auto decoder_layer_prim = CreatePrim(func_graph, equiv, post_layernorm, ffn_hidden_size); - // MS_CHECK_TRUE_RET(decoder_layer_prim != nullptr, nullptr); - // auto decoder_layer_prim_c = decoder_layer_prim->GetPrim(); - // MS_CHECK_TRUE_RET(decoder_layer_prim_c != nullptr, nullptr); - // auto value_node = NewValueNode(decoder_layer_prim_c); - // MS_CHECK_TRUE_RET(value_node != nullptr, nullptr); - // std::vector new_node_inputs; - // if (is_position_bias_) { - // new_node_inputs = { value_node, input, gamma1, weight_qkv,input_mask, position_bias, - // weight_attn_o, gamma2, encoder_output, weight_attn_q, weight_attn_kv,cross_mask, position_bias_cross, - // weight_attn_cross_o, gamma3, weight_m, weight_p - // }; - // } - // else { - // new_node_inputs = {value_node, - // input, - // gamma1, - // beta1, - // weight_qkv, - // bias_attn_qkv, - // input_mask, - // weight_attn_o, - // bias_attn_o, - // gamma2, - // beta2, - // encoder_output, - // weight_attn_q, - // weight_attn_kv, - // bias_attn_cross_qkv, - // cross_mask, - // weight_attn_cross_o, - // bias_attn_cross_o, - // gamma3, - // beta3, - // weight_m, - // bias_m, - // weight_p, - // bias_p}; - // } - // auto new_node = func_graph->NewCNode(new_node_inputs); - // MS_CHECK_TRUE_RET(new_node != nullptr, nullptr); - // auto old_node = node->cast(); - // MS_CHECK_TRUE_RET(old_node->abstract() != nullptr, nullptr); - // new_node->set_abstract(old_node->abstract()->Clone()); - // new_node->set_fullname_with_scope(node->fullname_with_scope() + "/decoder_layer"); + auto input = utils::cast((*equiv)[hidden_stats_]); + MS_ASSERT(input != nullptr); + auto encoder_output = utils::cast((*equiv)[encoder_output_]); + MS_ASSERT(encoder_output != nullptr); + AnfNodePtr position_bias, input_mask, bias_attn_o, bias_attn_qkv, beta1, beta2, bias_m, bias_p, beta3, + bias_attn_cross_qkv, bias_attn_cross_o, position_bias_cross; + auto weight_qkv = utils::cast((*equiv)[weight_attn_qkv_]); + auto weight_attn_o = utils::cast((*equiv)[weight_attn_o_]); + auto weight_attn_q = utils::cast((*equiv)[weight_attn_q_]); + auto weight_attn_kv = utils::cast((*equiv)[weight_attn_kv_]); + auto weight_attn_cross_o = utils::cast((*equiv)[weight_attn_cross_o_]); + auto weight_m = utils::cast((*equiv)[weight_m_]); + auto weight_p = utils::cast((*equiv)[weight_p_]); + if (is_position_bias_) { + position_bias = utils::cast((*equiv)[position_bias_]); + 
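    // (Summary comment, not part of the original patch.) Counting the lists
    // assembled below, the fused DecoderLayer node takes 16 input tensors in the
    // position-bias (T5) case and 23 in the standard case, plus the primitive
    // value node, in exactly the order pushed into new_node_inputs.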
position_bias_cross = utils::cast((*equiv)[position_bias_cross_]); + } else { + bias_attn_o = utils::cast((*equiv)[bias_attn_o_]); + bias_attn_qkv = utils::cast((*equiv)[bias_attn_qkv_]); + bias_attn_cross_qkv = utils::cast((*equiv)[bias_attn_cross_qkv_]); + bias_attn_cross_o = utils::cast((*equiv)[bias_attn_cross_o_]); + bias_m = utils::cast((*equiv)[bias_m_]); + bias_p = utils::cast((*equiv)[bias_p_]); + beta1 = utils::cast((*equiv)[beta1_]); + beta2 = utils::cast((*equiv)[beta2_]); + beta3 = utils::cast((*equiv)[beta3_]); + } + auto gamma1 = utils::cast((*equiv)[gamma1_]); + auto gamma2 = utils::cast((*equiv)[gamma2_]); + auto gamma3 = utils::cast((*equiv)[gamma3_]); + input_mask = utils::cast((*equiv)[mask_]); + auto cross_mask = utils::cast((*equiv)[cross_mask_]); + auto base_shape_ptr = weight_m->Shape(); + MS_EXCEPTION_IF_NULL(base_shape_ptr); + auto input_shape_ptr = base_shape_ptr->cast(); + MS_EXCEPTION_IF_NULL(input_shape_ptr); + auto input_shape = input_shape_ptr->shape(); + MS_ASSERT(input_shape != nullptr); + int ffn_hidden_size = (int64_t)input_shape[1]; + auto decoder_layer_prim = CreatePrim(func_graph, equiv, post_layernorm, ffn_hidden_size); + MS_CHECK_TRUE_RET(decoder_layer_prim != nullptr, nullptr); + auto decoder_layer_prim_c = decoder_layer_prim->GetPrim(); + MS_CHECK_TRUE_RET(decoder_layer_prim_c != nullptr, nullptr); + auto value_node = NewValueNode(decoder_layer_prim_c); + MS_CHECK_TRUE_RET(value_node != nullptr, nullptr); + std::vector new_node_inputs; + if (is_position_bias_) { + new_node_inputs = { + value_node, input, gamma1, weight_qkv, input_mask, position_bias, weight_attn_o, + gamma2, encoder_output, weight_attn_q, weight_attn_kv, cross_mask, position_bias_cross, weight_attn_cross_o, + gamma3, weight_m, weight_p}; + } else { + new_node_inputs = {value_node, + input, + gamma1, + beta1, + weight_qkv, + bias_attn_qkv, + input_mask, + weight_attn_o, + bias_attn_o, + gamma2, + beta2, + encoder_output, + weight_attn_q, + weight_attn_kv, + bias_attn_cross_qkv, + cross_mask, + weight_attn_cross_o, + bias_attn_cross_o, + gamma3, + beta3, + weight_m, + bias_m, + weight_p, + bias_p}; + } + auto new_node = func_graph->NewCNode(new_node_inputs); + MS_CHECK_TRUE_RET(new_node != nullptr, nullptr); + auto old_node = node->cast(); + MS_CHECK_TRUE_RET(old_node->abstract() != nullptr, nullptr); + new_node->set_abstract(old_node->abstract()->Clone()); + new_node->set_fullname_with_scope(node->fullname_with_scope() + "/decoder_layer"); - // return new_node; - return nullptr; + return new_node; +} // namespace mindspore::opt } // namespace mindspore::opt -} // namespace mindspore::opt \ No newline at end of file diff --git a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h index 73bb8b15db9..edd8395bcc0 100644 --- a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h +++ b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h @@ -13,8 +13,8 @@ // * See the License for the specific language governing permissions and // * limitations under the License. 
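// (Explanatory note, not part of the original patch.) GetEps(), declared further
// down, supports the non-fused layer-norm path: DefineLayerNorm() matches the
// decomposed T5-style normalization, roughly
//   y = gamma * x / sqrt(mean(x * x) + eps)
// with `eps` left as a VarPtr placeholder, so CheckPattern() can later read the
// epsilon constant back out of the bound ValueNode.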
// */ -#ifndef MINDSPORE_LITE_TOOLS_OPTIMIZER_FUSION_DECODERLAYER_FUSION_H_ -#define MINDSPORE_LITE_TOOLS_OPTIMIZER_FUSION_DECODERLAYER_FUSION_H_ +#ifndef MINDSPORE_LITE_TOOLS_OPTIMIZER_FUSION_DECODER_LAYER_FUSION_H_ +#define MINDSPORE_LITE_TOOLS_OPTIMIZER_FUSION_DECODER_LAYER_FUSION_H_ #include #include @@ -24,9 +24,9 @@ #include "include/common/utils/utils.h" #include "include/errorcode.h" #include "ops/decoder_layer.h" -#include "multi_head_attention_fusion.h" #include "ops/fusion/layer_norm_fusion.h" #include "ops/fusion/activation.h" +#include "tools/optimizer/fusion/multi_head_attention_fusion.h" namespace mindspore { namespace opt { @@ -57,6 +57,7 @@ class DecoderLayerFusion : public MultiplePatternProcessPass { bool *is_position_bias2) const; AnfNodePtr GetAttribute(const FuncGraphPtr &func_graph, const EquivPtr &equiv, VarPtr node_name) const; bool IsActGELU(const FuncGraphPtr &func_graph, const EquivPtr &equiv) const; + lite::STATUS GetEps(const EquivPtr &equiv, VarPtr node_name, float *eps) const; protected: const std::string kPatternDecoderLayerPre = "PatternDecoderLayerPre"; @@ -105,4 +106,4 @@ class DecoderLayerFusion : public MultiplePatternProcessPass { }; } // namespace opt } // namespace mindspore -#endif // MINDSPORE_LITE_TOOLS_OPTIMIZER_FUSION_DECODERLAYER_FUSION_H_ \ No newline at end of file +#endif // MINDSPORE_LITE_TOOLS_OPTIMIZER_FUSION_DECODER_LAYER_FUSION_H_ diff --git a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc index c683f84ab4c..bd316868350 100644 --- a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc @@ -84,7 +84,7 @@ VectorRef EncoderLayerFusion::getTuple(bool post_layernorm, bool layernorm_fusio return reshape1; } if (!layernorm_fusion) { - return DefineLayerNorm(is_position_bias, reshape1, gamma1_, beta1_); + return DefineLayerNorm(is_position_bias, reshape1, gamma1_, beta1_, eps1_); } auto layer_norm = VectorRef({is_layernorm1_, reshape1, gamma1_, beta1_}); auto is_tuple = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_itme"); @@ -93,7 +93,8 @@ VectorRef EncoderLayerFusion::getTuple(bool post_layernorm, bool layernorm_fusio return tuple; } -VectorRef EncoderLayerFusion::DefineLayerNorm(bool is_position_bias, VectorRef input, VarPtr gamma, VarPtr beta) const { +VectorRef EncoderLayerFusion::DefineLayerNorm(bool is_position_bias, VectorRef input, VarPtr gamma, VarPtr beta, + VarPtr eps) const { auto var1 = std::make_shared("var1"); MS_CHECK_TRUE_RET(var1 != nullptr, {}); auto is_reduce = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReduceFusion), "reduce"); @@ -110,11 +111,9 @@ VectorRef EncoderLayerFusion::DefineLayerNorm(bool is_position_bias, VectorRef i auto is_reduce2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReduceFusion), "reduce2"); MS_CHECK_TRUE_RET(is_reduce2 != nullptr, {}); auto reduce2 = VectorRef({is_reduce2, sqr, var2}); - auto var3 = std::make_shared("var3"); - MS_CHECK_TRUE_RET(var3 != nullptr, {}); auto is_add = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is-add"); MS_CHECK_TRUE_RET(is_add != nullptr, {}); - auto add = VectorRef({is_add, reduce2, var3}); + auto add = VectorRef({is_add, reduce2, eps}); auto is_sqr2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimSqrt), "sqr2"); MS_CHECK_TRUE_RET(is_sqr2 != nullptr, {}); auto sqr2 = VectorRef({is_sqr2, add}); @@ -171,7 +170,7 @@ VectorRef 
EncoderLayerFusion::DefinePatternEncoderLayer(bool post_layernorm = tr auto var_tuple2 = std::make_shared("var_tuple2"); tuple2 = VectorRef({is_tuple2, layer_norm2, var_tuple2}); } else { - tuple2 = DefineLayerNorm(is_position_bias, add, gamma2_, beta2_); + tuple2 = DefineLayerNorm(is_position_bias, add, gamma2_, beta2_, eps2_); } auto is_reshape2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-encoder2"); MS_CHECK_TRUE_RET(is_reshape2 != nullptr, {}); @@ -219,7 +218,7 @@ VectorRef EncoderLayerFusion::DefinePatternEncoderLayer(bool post_layernorm = tr auto var_tuple3 = std::make_shared("var_tuple3"); tuple3 = VectorRef({is_tuple3, layer_norm, var_tuple3}); } else { - tuple3 = DefineLayerNorm(is_position_bias, reshape4, gamma1_, beta1_); + tuple3 = DefineLayerNorm(is_position_bias, reshape4, gamma1_, beta1_, eps1_); } auto is_reshape5 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-encoder"); MS_CHECK_TRUE_RET(is_reshape5 != nullptr, {}); @@ -276,6 +275,26 @@ bool EncoderLayerFusion::IsActGELU(const FuncGraphPtr &func_graph, const EquivPt return true; } +STATUS EncoderLayerFusion::GetEps(const EquivPtr &equiv, VarPtr node_name, float *eps) const{ + if ((*equiv)[node_name] == nullptr || !utils::isa((*equiv)[node_name])) { + MS_LOG(ERROR) << node_name << " is not anfnodeptr"; + return RET_ERROR; + } + AnfNodePtr node = utils::cast((*equiv)[node_name]); + MS_ASSERT(node != nullptr); + if (utils::isa(node)) { + auto value_ptr_node = utils::cast(node); + auto value_node = utils::cast(value_ptr_node->value()); + if (value_node->isa()) { + auto tensor = value_node->cast(); + MS_EXCEPTION_IF_NULL(tensor); + *eps = *reinterpret_cast(tensor->data().data()); + return RET_OK; + } + } + return RET_ERROR; +} + AnfNodePtr EncoderLayerFusion::GetAttribute(const FuncGraphPtr &func_graph, const EquivPtr &equiv, VarPtr node_name) const { if ((*equiv)[node_name] == nullptr || !utils::isa((*equiv)[node_name])) { @@ -328,6 +347,16 @@ STATUS EncoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const Eq if (layrn2_prim->GetAttr(ops::kEpsilon) != nullptr) { *eps2 = layrn2_prim->get_epsilon(); } + } else { + if (GetEps(equiv, eps1_, eps1) != RET_OK) { + MS_LOG(ERROR) << "not found eps1"; + return RET_ERROR; + } + + if (GetEps(equiv, eps2_, eps2) != RET_OK) { + MS_LOG(ERROR) << "not found eps2"; + return RET_ERROR; + } } if (!is_position_bias_) { if (!IsActGELU(func_graph, equiv, is_act_)) { diff --git a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h index a2aef8e12b0..5bf5d191d36 100644 --- a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h +++ b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h @@ -53,11 +53,12 @@ class EncoderLayerFusion : public MultiplePatternProcessPass { const std::string kPatternEncoderLayerT5Post = "PatternEncoderLayerT5Post"; VectorRef DefinePatternEncoderLayer(bool post_layernorm, bool layernorm_fusion, bool is_position_bias_) const; VectorRef getTuple(bool post_layernorm, bool layernorm_fusion, bool is_position_bias) const; - VectorRef DefineLayerNorm(bool is_position_bias, VectorRef input, VarPtr gamma, VarPtr beta) const; + VectorRef DefineLayerNorm(bool is_position_bias, VectorRef input, VarPtr gamma, VarPtr beta, VarPtr eps) const; CNodePtr CreateMaskedEncoderLayerFusionNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv, const AnfNodePtr &node, bool post_layernorm) const; AnfNodePtr GetAttribute(const FuncGraphPtr 
&func_graph, const EquivPtr &equiv, VarPtr node_name) const; bool IsActGELU(const FuncGraphPtr &func_graph, const EquivPtr &equiv, const VarPtr &input_prim) const; + lite::STATUS GetEps(const EquivPtr &equiv, VarPtr node_name, float *eps) const; lite::STATUS CheckPattern(const FuncGraphPtr &func_graph, const EquivPtr &equiv, int *head_num, int *head_size, float *eps1, float *eps2) const; std::shared_ptr CreatePrim(const FuncGraphPtr &func_graph, const EquivPtr &equiv, @@ -86,6 +87,8 @@ class EncoderLayerFusion : public MultiplePatternProcessPass { mutable bool is_position_bias_{false}; mutable bool is_layernorm_fusion_{false}; mutable VarPtr is_act_{nullptr}; + mutable VarPtr eps1_{nullptr}; + mutable VarPtr eps2_{nullptr}; }; } // namespace opt } // namespace mindspore diff --git a/trc/transformer/cfg_bert.config b/trc/transformer/cfg_bert.config index 383ce7641ff..099dd20effc 100755 --- a/trc/transformer/cfg_bert.config +++ b/trc/transformer/cfg_bert.config @@ -1,2 +1,2 @@ [gpu_context] -input_shape=input_ids:[transformer_decoder_layer_t5,128];token_type_ids:[transformer_decoder_layer_t5,128];input_mask:[transformer_decoder_layer_t5,128] +input_shape=input_ids:[T5,128];token_type_ids:[T5,128];input_mask:[T5,128] diff --git a/trc/transformer/models.txt b/trc/transformer/models.txt index fcaf716d925..12cedb2812e 100755 --- a/trc/transformer/models.txt +++ b/trc/transformer/models.txt @@ -1,18 +1,24 @@ #run the following tests before push -#-b 1 -l 66 -s 128 -H 12 -S 768 -p 0 -m mha_x1 -#-b 1 -l 66 -s 128 -t 256 -H 12 -S 768 -p 0 -m mha_cross -#-b 1 -l 66 -s 20 -t 20 -H 3 -S 15 -p 0 -m mha_cross -#-b 1 -l 66 -s 20 -H 4 -S 768 -p 0 -m mha_T5 -#-b 1 -l 66 -s 20 -t 40 -H 4 -S 768 -p 0 -m mha_T5_cross -#-b 1 -l 12 -H 12 -S 768 -s 128 -P 0 -x 1 -m transformer_decoder_layer -#-b 1 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_decoder_layer +-b 1 -l 66 -s 128 -H 12 -S 768 -p 0 -m mha_x1 +-b 1 -l 66 -s 128 -t 256 -H 12 -S 768 -p 0 -m mha_cross +-b 1 -l 66 -s 20 -t 20 -H 3 -S 15 -p 0 -m mha_cross +-b 1 -l 66 -s 20 -H 4 -S 768 -p 0 -m mha_T5 +-b 1 -l 66 -s 20 -t 40 -H 4 -S 768 -p 0 -m mha_T5_cross +-b 1 -l 12 -H 12 -S 768 -s 128 -P 0 -x 1 -m transformer_decoder_layer +-b 1 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_decoder_layer +-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_decoder_layer +-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer +-b 1 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer +-b 1 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_decoder_layer_t5 +-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_decoder_layer_t5 +-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer_t5 +-b 1 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer_t5 #-b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer -#-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer #-b 8 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer #-b 32 -l 12 -H 12 -S 768 -s 128 -P 0 -f 3072 -m bert #-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -f 3072 -m bert --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -x 1 -P 1 -m transformer_decoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -x 1 -P 1 -m transformer_decoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -m transformer_encoder_layer #-b 1 -l 66 -s 20 -H 3 -S 15 -p 0 -m mha_x1 -- Gitee From efb75893f2772c96043f0894d913ee7d73ccf671 Mon Sep 17 00:00:00 2001 From: shira zaloshinki Date: Tue, 10 Jan 2023 09:23:10 +0200 Subject: [PATCH 19/39] fix the tensrrt files --- .../delegate/tensorrt/op/decoder_tensorrt.cc | 14 
++++---------- .../delegate/tensorrt/op/decoder_tensorrt.h | 7 +++---- .../delegate/tensorrt/op/encoder_tensorrt.cc | 13 ++++++++----- .../delegate/tensorrt/op/encoder_tensorrt.h | 2 -- 4 files changed, 15 insertions(+), 21 deletions(-) diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc index 4e7d9d2b96f..fa595874e02 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc @@ -33,8 +33,6 @@ namespace mindspore::lite { namespace { constexpr std::size_t kTwo = 2; -constexpr std::size_t kThree = 3; - } // namespace int DecoderTensorRT::IsSupport(const BaseOperatorPtr &base_operator, const std::vector &in_tensors, @@ -43,10 +41,10 @@ int DecoderTensorRT::IsSupport(const BaseOperatorPtr &base_operator, const std:: MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); return RET_ERROR; } - // if (out_tensors.size() != 1) { - // MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); - // return RET_ERROR; - // } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return RET_ERROR; + } return RET_OK; } nvinfer1::ITensor *DecoderTensorRT::castTensor(TensorRTContext *ctx, const TensorInfo &ms_tensor, @@ -134,13 +132,10 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.has_beta = !params.attn1.position_bias; params.has_bias = !params.attn1.position_bias; params.ffn_bias = !params.attn1.position_bias; - std::cout << "params.attn1.position_bias: " << params.attn1.position_bias - << "params.attn2.position_bias: " << params.attn2.position_bias << std::endl; auto compute_type = runtime_->GetRuntimePrecisionMode(); if (is_ffn_fp16_) { size_t start_fp16 = (params.attn1.position_bias) ? C13NUM : C18NUM; size_t end_fp16 = (params.attn1.position_bias) ? 
C16NUM : C22NUM; - for (size_t i = 0; i < in_tensors_.size(); i++) { auto in_tensor = input(ctx, i); if (in_tensors_[i].IsConst() || in_tensor.trt_tensor_ == nullptr) { @@ -280,7 +275,6 @@ size_t DecoderPlugin::getSerializationSize() const noexcept { return sizeof(int) + sizeof(fastertransformer::decoderParamT); } - void DecoderPlugin::serialize(void *buffer) const noexcept { SerializeValue(&buffer, &compute_type_, sizeof(int)); SerializeValue(&buffer, ¶ms_, sizeof(fastertransformer::decoderParamT)); diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h index 5060992be89..2bf6cc645fd 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h @@ -25,6 +25,7 @@ #include "src/fastertransformer/layers/ms_layers/decoder.h" #include "src/fastertransformer/layers/ms_layers/param.h" #include "src/extendrt/delegate/tensorrt/tensorrt_utils.h" + namespace mindspore::lite { class DecoderTensorRT : public TensorRTOp { public: @@ -52,8 +53,7 @@ class DecoderPlugin : public TensorRTPlugin { : TensorRTPlugin(name, std::string(DECODER_PLUGIN_NAME), device_id), compute_type_(compute_type), params_(params), - cublaslt_handle_(cublaslt_handle) - {} + cublaslt_handle_(cublaslt_handle) {} DecoderPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) : TensorRTPlugin(std::string(name), std::string(DECODER_PLUGIN_NAME)) { @@ -87,14 +87,13 @@ class DecoderPlugin : public TensorRTPlugin { int nbOutputs) noexcept override; private: - const std::string layer_name_; std::string name_space_; int compute_type_; mutable fastertransformer::decoderParamT params_; cublasLtHandle_t cublaslt_handle_; int num_of_inputs_; int num_of_outputs_; - + template int RunCudaDecoder(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream, diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc index cfe05de35d2..1b4a8ac12fb 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc @@ -34,18 +34,21 @@ namespace mindspore::lite { namespace { constexpr std::size_t kTwo = 2; -constexpr std::size_t kThree = 3; } // namespace -// Multi Head Attention TensorRT op int EncoderTensorRT::IsSupport(const BaseOperatorPtr &base_operator, const std::vector &in_tensors, const std::vector &out_tensors) { if (in_tensors.size() != C14NUM && in_tensors.size() != C9NUM) { MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); return RET_ERROR; } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return RET_ERROR; + } return RET_OK; } + nvinfer1::ITensor *EncoderTensorRT::castTensor(TensorRTContext *ctx, const TensorInfo &ms_tensor, const std::string &op_name) { if (ctx == nullptr || ctx->network() == nullptr) { @@ -86,6 +89,7 @@ nvinfer1::ITensor *EncoderTensorRT::castTensor(TensorRTContext *ctx, const Tenso auto tensor_ptr = constant_tensor->getOutput(0); return tensor_ptr; } + int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { if (ctx == nullptr || ctx->network() == nullptr) { MS_LOG(ERROR) << "context or network is invalid"; @@ -115,9 +119,7 
@@ int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.attn.is_cross = false; params.attn.position_bias = encoder_op->get_position_bias(); params.attn.projection_bias = !params.attn.position_bias; - std::cout << "params.attn.position_bias" << params.attn.position_bias << std::endl; params.attn.qkv_bias = !params.attn.position_bias; - params.attn.projection_bias = !params.attn.position_bias; params.has_beta = !params.attn.position_bias; params.has_bias = !params.attn.position_bias; params.ffn_bias = !params.attn.position_bias; @@ -138,7 +140,7 @@ int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { ctx->RegisterTensor(in_tensor, in_tensors_[i].Name()); } else { in_tensor.trt_tensor_ = lite::ConvertConstantTensor(ctx, in_tensors_[i], op_name_); - ctx->RegisterTensor(in_tensor, in_tensors_[i].Name()); + compute_type_ ctx->RegisterTensor(in_tensor, in_tensors_[i].Name()); } } } @@ -209,6 +211,7 @@ bool EncoderPlugin::supportsFormatCombination(int pos, const nvinfer1::PluginTen bool res = (tensorsDesc[pos].format == nvinfer1::TensorFormat::kLINEAR) && (tensorsDesc[pos].type == type); return res; } + void EncoderPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc *out, int nbOutputs) noexcept { const int request_batch_size = static_cast(in[0].desc.dims.d[0]); diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h index 4364410f9be..4e593b2bf02 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h @@ -23,7 +23,6 @@ #include "src/extendrt/delegate/tensorrt/op/tensorrt_plugin.h" #include "src/extendrt/delegate/tensorrt/cuda_impl/cudnn_utils.h" #include "src/fastertransformer/layers/ms_layers/encoder.h" -// #include "src/fastertransformer/layers/ms_layers/param.h" namespace mindspore::lite { class EncoderTensorRT : public TensorRTOp { @@ -87,7 +86,6 @@ class EncoderPlugin : public TensorRTPlugin { int nbOutputs) noexcept override; private: - const std::string layer_name_; std::string name_space_; int compute_type_; mutable fastertransformer::encoderParamT params_; -- Gitee From d2c771ee28ba7a1ca62decfa56e3c87db80ce8eb Mon Sep 17 00:00:00 2001 From: shira zaloshinki Date: Tue, 10 Jan 2023 11:29:24 +0200 Subject: [PATCH 20/39] fix for code review --- .../delegate/tensorrt/op/encoder_tensorrt.cc | 2 +- .../optimizer/fusion/decoder_layer_fusion.cc | 2 -- .../optimizer/fusion/encoder_layer_fusion.cc | 23 +++++++++---------- .../fusion/multi_head_attention_fusion.cc | 2 +- .../fusion/multi_head_attention_fusion.h | 1 - 5 files changed, 13 insertions(+), 17 deletions(-) diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc index 1b4a8ac12fb..e2a641f40dc 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc @@ -140,7 +140,7 @@ int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { ctx->RegisterTensor(in_tensor, in_tensors_[i].Name()); } else { in_tensor.trt_tensor_ = lite::ConvertConstantTensor(ctx, in_tensors_[i], op_name_); - compute_type_ ctx->RegisterTensor(in_tensor, in_tensors_[i].Name()); + ctx->RegisterTensor(in_tensor, in_tensors_[i].Name()); } } } diff --git a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc 
b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc index 60620e366c6..0c7b4872aa4 100644 --- a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc @@ -29,8 +29,6 @@ namespace mindspore::opt { namespace { const auto &p1 = std::placeholders::_1; -const size_t kWeightShapeSize = 2; -const int kDecoderLayerOutputs = 1; } // namespace bool DecoderLayerFusion::Init() const { diff --git a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc index 6210403b197..348655b7d60 100644 --- a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc @@ -67,9 +67,13 @@ bool EncoderLayerFusion::Init() const { is_layernorm2_ = std::make_shared(std::bind(IsOpType, p1, prim::kPrimLayerNormFusion), "layer_norm2"); MS_CHECK_TRUE_RET(is_layernorm2_ != nullptr, false); position_bias_ = std::make_shared("position_bias"); - MS_CHECK_TRUE_RET(is_layernorm2_ != nullptr, false); + MS_CHECK_TRUE_RET(position_bias_ != nullptr, false); is_act_ = std::make_shared(std::bind(IsOpType, p1, prim::kPrimActivation), "activation"); MS_CHECK_TRUE_RET(is_act_ != nullptr, {}); + eps1_ = std::make_shared("position_bias"); + MS_CHECK_TRUE_RET(eps1_ != nullptr, false); + eps2_ = std::make_shared("position_bias"); + MS_CHECK_TRUE_RET(eps2_ != nullptr, false); return true; } @@ -248,20 +252,15 @@ AnfNodePtr EncoderLayerFusion::Process(const std::string &pattern_name, const mi if (func_graph == nullptr || node == nullptr || equiv == nullptr) { return nullptr; } - std::cout << "found pattern=" << pattern_name << std::endl; - if (pattern_name == kPatternTEncoderLayerPostNorm || pattern_name == kPatternTEncoderLayerPreNorm) is_layernorm_fusion_ = true; - if (pattern_name == kPatternTEncoderLayerPost || pattern_name == kPatternTEncoderLayerPostNorm) { + if (pattern_name == kPatternEncoderLayerT5Pre || pattern_name == kPatternEncoderLayerT5Post) is_position_bias_ = true; + if (pattern_name == kPatternTEncoderLayerPost || pattern_name == kPatternTEncoderLayerPostNorm || + pattern_name == kPatternEncoderLayerT5Post) return CreateMaskedEncoderLayerFusionNode(func_graph, equiv, node, true); - } else if (pattern_name == kPatternTEncoderLayerPre || pattern_name == kPatternTEncoderLayerPreNorm) { - return CreateMaskedEncoderLayerFusionNode(func_graph, equiv, node, false); - } else if (pattern_name == kPatternEncoderLayerT5Pre || pattern_name == kPatternEncoderLayerT5Post) { - is_position_bias_ = true; - if (pattern_name == kPatternEncoderLayerT5Post) - return CreateMaskedEncoderLayerFusionNode(func_graph, equiv, node, true); + else if (pattern_name == kPatternTEncoderLayerPre || pattern_name == kPatternTEncoderLayerPreNorm || + pattern_name == kPatternEncoderLayerT5Pre) return CreateMaskedEncoderLayerFusionNode(func_graph, equiv, node, false); - } return nullptr; } @@ -326,6 +325,7 @@ AnfNodePtr EncoderLayerFusion::GetAttribute(const FuncGraphPtr &func_graph, cons auto input = cnode->input(0); return input; } + STATUS EncoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const EquivPtr &equiv, int *head_num, int *head_size, float *eps1, float *eps2) const { auto attn_input = GetAttribute(func_graph, equiv, is_attention_); @@ -427,7 +427,6 @@ CNodePtr EncoderLayerFusion::CreateMaskedEncoderLayerFusionNode(const FuncGraphP auto value_node = NewValueNode(encoder_layer_prim_c); MS_CHECK_TRUE_RET(value_node != 
nullptr, nullptr); std::vector new_node_inputs; - ParameterPtr c_bias_m_param, c_weight_p_param, c_bias_p_param, c_weight_m_param; if (is_position_bias_) { position_bias = utils::cast((*equiv)[position_bias_]); new_node_inputs = {value_node, input, gamma1, weight_qkv, input_mask, diff --git a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc index cc5db31c7fd..d666ae02b4d 100644 --- a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc @@ -613,7 +613,7 @@ AnfNodePtr MultiHeadAttentionFusion::Process(const std::string &pattern_name, co if (func_graph == nullptr || node == nullptr || equiv == nullptr) { return nullptr; } - std::cout << "found pattern=" << pattern_name << std::endl; + std::cout << "found pattern=" << pattern_name << std::endl; if ((pattern_name == kMPAWithMaskPatternName) || (pattern_name == kMPAWithMaskPatternNamePA) || (pattern_name == kMPAWithMaskPatternNameT5) || (pattern_name == kMPAWithMaskPatternNameT5New) || diff --git a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.h b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.h index 980ab035119..ebe365273de 100644 --- a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.h +++ b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.h @@ -116,7 +116,6 @@ class MultiHeadAttentionFusion : public MultiplePatternProcessPass { mutable VarPtr reshape_v_{nullptr}; mutable VarPtr reshape_axis_{nullptr}; - // mutable VarPtr q_transpose_{nullptr}; mutable VarPtr v_transpose_{nullptr}; mutable VarPtr k_transpose_{nullptr}; -- Gitee From e656fb85b18842911842a11eaa3890e331969c72 Mon Sep 17 00:00:00 2001 From: batya kroizer Date: Thu, 12 Jan 2023 11:05:01 +0200 Subject: [PATCH 21/39] add param to enoder and decoder op --- .../cpu/kernel/nnacl/attention_parameter.h | 1 + .../kernel/nnacl/decoder_layer_parameter.h | 3 + .../kernel/nnacl/encoder_layer_parameter.h | 2 + mindspore/core/ops/attention.cc | 12 ++- mindspore/core/ops/attention.h | 4 +- mindspore/core/ops/decoder_layer.cc | 26 +++++- mindspore/core/ops/decoder_layer.h | 15 +++- mindspore/core/ops/encoder_layer.cc | 19 +++- mindspore/core/ops/encoder_layer.h | 10 ++- mindspore/core/ops/op_name.h | 3 + mindspore/lite/schema/ops.fbs | 17 ++-- mindspore/lite/src/common/ops/ops_def.cc | 6 ++ .../ops/populate/decoder_layer_populate.cc | 5 +- .../ops/populate/encoder_layer_populate.cc | 2 + .../delegate/tensorrt/op/decoder_tensorrt.cc | 6 +- .../delegate/tensorrt/op/decoder_tensorrt.h | 2 +- .../delegate/tensorrt/op/encoder_tensorrt.cc | 4 +- .../delegate/tensorrt/op/encoder_tensorrt.h | 2 +- .../delegate/tensorrt/op/mha_tensorrt.cc | 4 + .../optimizer/fusion/decoder_layer_fusion.cc | 26 ++++-- .../optimizer/fusion/decoder_layer_fusion.h | 3 +- .../optimizer/fusion/encoder_layer_fusion.cc | 19 ++-- .../optimizer/fusion/encoder_layer_fusion.h | 3 +- .../fusion/multi_head_attention_fusion.cc | 3 +- .../fusion/multi_head_attention_fusion.h | 1 + trc/transformer/cfg_bert.config | 2 +- trc/transformer/ftBench.py | 1 + trc/transformer/models.txt | 16 ++-- trc/transformer/t.config | 2 +- trc/transformer/train_transformer_export.py | 89 ++++++++++++------- 30 files changed, 225 insertions(+), 83 deletions(-) diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/attention_parameter.h 
b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/attention_parameter.h index c3b600b234f..bcfb0ab38a7 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/attention_parameter.h +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/attention_parameter.h @@ -23,6 +23,7 @@ typedef struct AttentionParameter { int head_num_; int head_size_; bool cross_; + bool scalar_; } AttentionParameter; typedef struct RelativePositionAttentionParameter { diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/decoder_layer_parameter.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/decoder_layer_parameter.h index 05872f3a240..3535e5f55e2 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/decoder_layer_parameter.h +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/decoder_layer_parameter.h @@ -30,6 +30,9 @@ typedef struct DecoderLayerParameter { int ffn_hidden_size_; bool position_bias1_; bool position_bias2_; + bool scalar1; + bool scalar2; + char* act; } DecoderLayerParameter; #endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_NNACL_DECODER_LAYER_PARAMETER_H_ diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/encoder_layer_parameter.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/encoder_layer_parameter.h index 30b2412993c..df6c97d132f 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/encoder_layer_parameter.h +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/encoder_layer_parameter.h @@ -28,6 +28,8 @@ typedef struct EncoderLayerParameter { float eps_layernorm2_; int ffn_hidden_size_; bool position_bias_; + bool scalar; + const char *act; } EncoderLayerParameter; #endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_NNACL_ENCODER_LAYER_PARAMETER_H_ diff --git a/mindspore/core/ops/attention.cc b/mindspore/core/ops/attention.cc index e6f26cbb28a..464ad1b4abc 100644 --- a/mindspore/core/ops/attention.cc +++ b/mindspore/core/ops/attention.cc @@ -34,7 +34,9 @@ void Attention::set_cross(bool cross) { (void)this->AddAttr(kCross, api::MakeVal void Attention::set_position_bias(bool position_bias) { (void)this->AddAttr(kPositionBias, api::MakeValue(position_bias)); } - +void Attention::set_scalar(bool scalar) { + (void)this->AddAttr(kScalar, api::MakeValue(scalar)); +} int64_t Attention::get_head_num() const { auto value_ptr = this->GetAttr(kAttentionNumHeads); return GetValue(value_ptr); @@ -54,12 +56,16 @@ bool Attention::get_position_bias() const { auto value_ptr = this->GetAttr(kPositionBias); return GetValue(value_ptr); } - -void Attention::Init(int64_t head_num, int64_t head_size, bool position_bias, bool cross) { +bool Attention::get_scalar() const { + auto value_ptr = this->GetAttr(kScalar); + return GetValue(value_ptr); +} +void Attention::Init(int64_t head_num, int64_t head_size, bool position_bias, bool cross, bool scalar) { this->set_head_num(head_num); this->set_head_size(head_size); this->set_cross(cross); this->set_position_bias(position_bias); + this->set_scalar(scalar); } REGISTER_PRIMITIVE_C(kNameAttention, Attention); } // namespace mindspore::ops diff --git a/mindspore/core/ops/attention.h b/mindspore/core/ops/attention.h index 24b0a98f3f6..0d0cccc5d81 100644 --- a/mindspore/core/ops/attention.h +++ b/mindspore/core/ops/attention.h @@ -41,15 +41,17 @@ class MIND_API Attention : public BaseOperator { /// \param[in] head_size Define size per head. /// \param[in] cross Define is cross attention. Default false. /// \param[in] position_bias Define is position bias attention. 
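  // Illustrative usage sketch (values are examples, not taken from the patch):
  // the extended Init() stores the new flag via set_scalar(), i.e. under the
  // kScalar attribute, and scalar defaults to true.
  //   ops::Attention attn;
  //   attn.Init(/*head_num=*/12, /*head_size=*/64,
  //             /*position_bias=*/false, /*cross=*/false, /*scalar=*/true);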
- void Init(int64_t head_num, int64_t head_size, bool position_bias, bool cross = false); + void Init(int64_t head_num, int64_t head_size, bool position_bias, bool cross = false, bool scalar = true); void set_head_num(int64_t head_num); void set_head_size(int64_t head_size); void set_cross(bool cross); void set_position_bias(bool position_bias); + void set_scalar(bool scalar); int64_t get_head_num() const; int64_t get_head_size() const; bool get_cross() const; bool get_position_bias() const; + bool get_scalar() const; }; } // namespace ops } // namespace mindspore diff --git a/mindspore/core/ops/decoder_layer.cc b/mindspore/core/ops/decoder_layer.cc index 22c47ff8d89..ea64ed3710f 100644 --- a/mindspore/core/ops/decoder_layer.cc +++ b/mindspore/core/ops/decoder_layer.cc @@ -48,6 +48,15 @@ void DecoderLayer::set_eps_layernorm3(float eps_layernorm3) { } void DecoderLayer::set_position_bias1(bool position_bias1) { (void)this->AddAttr(kDecoderLayerPositionBias1, api::MakeValue(position_bias1)); } void DecoderLayer::set_position_bias2(bool position_bias2) { (void)this->AddAttr(kDecoderLayerPositionBias2, api::MakeValue(position_bias2)); } +void DecoderLayer::set_scalar1(bool scalar1) { + (void)this->AddAttr(kDecoderLayerScalar1, api::MakeValue(scalar1)); +} +void DecoderLayer::set_scalar2(bool scalar2) { + (void)this->AddAttr(kDecoderLayerScalar2, api::MakeValue(scalar2)); +} +void DecoderLayer::set_act(std::string act) { + (void)this->AddAttr(kActivation, api::MakeValue(act)); +} int64_t DecoderLayer::get_head_num() const { auto value_ptr = this->GetAttr(kDecoderLayerNumHeads); return GetValue(value_ptr); @@ -87,9 +96,21 @@ bool DecoderLayer::get_position_bias2() const { auto value_ptr = this->GetAttr(kDecoderLayerPositionBias2); return GetValue(value_ptr); } +bool DecoderLayer::get_scalar1() const { + auto value_ptr = this->GetAttr(kDecoderLayerScalar1); + return GetValue(value_ptr); +} +bool DecoderLayer::get_scalar2() const { + auto value_ptr = this->GetAttr(kDecoderLayerScalar2); + return GetValue(value_ptr); +} +std::string DecoderLayer::get_act() const { + auto value_ptr = this->GetAttr(kActivation); + return GetValue(value_ptr); +} void DecoderLayer::Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, float eps_layernorm3, int64_t ffn_hidden_size, - bool position_bias1, bool position_bias2, bool post_layernorm = false) { + bool position_bias1, bool position_bias2, bool post_layernorm = false, bool scalar1, bool scalar2, std::string act) { this->set_head_num(head_num); this->set_head_size(head_size); this->set_post_layernorm(post_layernorm); @@ -99,6 +120,9 @@ void DecoderLayer::Init(int64_t head_num, int64_t head_size, float eps_layernorm this->set_ffn_hidden_size(ffn_hidden_size); this->set_position_bias1(position_bias1); this->set_position_bias2(position_bias2); + this->set_act(act); + this->set_scalar1(scalar1); + this->set_scalar2(scalar2); } REGISTER_PRIMITIVE_C(kNameDecoderLayer, DecoderLayer); } // namespace mindspore::ops diff --git a/mindspore/core/ops/decoder_layer.h b/mindspore/core/ops/decoder_layer.h index 71425ab63d1..493f24965ad 100644 --- a/mindspore/core/ops/decoder_layer.h +++ b/mindspore/core/ops/decoder_layer.h @@ -64,11 +64,14 @@ class MIND_API DecoderLayer : public BaseOperator { /// \param[in] eps_layernorm2 Define eps layernorm2. /// \param[in] eps_layernorm3 Define eps layernorm3. /// \param[in] ffn_hidden_size Define ffn hidden size. - /// \param[in] position_bias1 Define ffn position_bias1. 
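In the same spirit, a hypothetical call building a DecoderLayer primitive with the new trailing attributes, following the declaration in decoder_layer.h (scalar1/scalar2 default to true and act to "gelu"; the defaults live on that declaration, so this sketch passes every argument explicitly). The numeric values are placeholders, not values taken from the patch.

// Illustrative sketch only -- argument values are placeholders.
auto decoder = std::make_shared<mindspore::ops::DecoderLayer>();
decoder->Init(/*head_num=*/12, /*head_size=*/64,
              /*eps_layernorm1=*/1e-5f, /*eps_layernorm2=*/1e-5f, /*eps_layernorm3=*/1e-5f,
              /*ffn_hidden_size=*/3072,
              /*position_bias1=*/false, /*position_bias2=*/false,
              /*post_layernorm=*/false,
              /*scalar1=*/true, /*scalar2=*/true, /*act=*/"relu");
// The TensorRT decoder op reads these attributes back, as in decoder_tensorrt.cc:
//   params.is_act       = decoder_op->get_act().c_str();
//   params.attn1.scalar = decoder_op->get_scalar1();
//   params.attn2.scalar = decoder_op->get_scalar2();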
- /// \param[in] position_bias2 Define ffn position_bias2. + /// \param[in] position_bias1 Define position_bias1. + /// \param[in] position_bias2 Define position_bias2. + /// \param[in] scalar1 Define scalar1. + /// \param[in] scalar2 Define scalar2. + /// \param[in] act Define act void Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, float eps_layernorm3, int64_t ffn_hidden_size, bool position_bias1, bool position_bias2, - bool post_layernorm); + bool post_layernorm, bool scalar1 = true, bool scalar2 = true, std::string act = "gelu"); void set_head_num(int64_t head_num); void set_head_size(int64_t head_size); void set_post_layernorm(bool post_layernorm); @@ -78,6 +81,9 @@ class MIND_API DecoderLayer : public BaseOperator { void set_ffn_hidden_size(int64_t ffn_hidden_size); void set_position_bias1(bool position_bias1); void set_position_bias2(bool position_bias2); + void set_scalar1(bool scalar1); + void set_scalar2(bool scalar2); + void set_act(std::string act); int64_t get_head_num() const; int64_t get_head_size() const; bool get_post_layernorm() const; @@ -87,6 +93,9 @@ class MIND_API DecoderLayer : public BaseOperator { int64_t get_ffn_hidden_size() const; bool get_position_bias1() const; bool get_position_bias2() const; + bool get_scalar1() const; + bool get_scalar2() const; + std::string get_act() const; }; } // namespace ops } // namespace mindspore diff --git a/mindspore/core/ops/encoder_layer.cc b/mindspore/core/ops/encoder_layer.cc index 19f40d706e7..3416213f8bf 100644 --- a/mindspore/core/ops/encoder_layer.cc +++ b/mindspore/core/ops/encoder_layer.cc @@ -46,7 +46,12 @@ void EncoderLayer::set_ffn_hidden_size(int64_t ffn_hidden_size) { void EncoderLayer::set_position_bias(bool position_bias) { (void)this->AddAttr(kPositionBias, api::MakeValue(position_bias)); } - +void EncoderLayer::set_scalar(bool scalar) { + (void)this->AddAttr(kScalar, api::MakeValue(scalar)); +} +void EncoderLayer::set_act(std::string act) { + (void)this->AddAttr(kActivation, api::MakeValue(act)); +} int64_t EncoderLayer::get_head_num() const { auto value_ptr = this->GetAttr(kEncoderLayerNumHeads); return GetValue(value_ptr); @@ -77,8 +82,16 @@ bool EncoderLayer::get_position_bias() const { auto value_ptr = this->GetAttr(kPositionBias); return GetValue(value_ptr); } +bool EncoderLayer::get_scalar() const { + auto value_ptr = this->GetAttr(kScalar); + return GetValue(value_ptr); +} +std::string EncoderLayer::get_act() const { + auto value_ptr = this->GetAttr(kActivation); + return GetValue(value_ptr); +} void EncoderLayer::Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, - int64_t ffn_hidden_size, bool position_bias, bool post_layernorm = false) { + int64_t ffn_hidden_size, bool position_bias, bool post_layernorm = false, bool scalar = true, std::string act = "gelu") { this->set_head_num(head_num); this->set_head_size(head_size); this->set_post_layernorm(post_layernorm); @@ -86,6 +99,8 @@ void EncoderLayer::Init(int64_t head_num, int64_t head_size, float eps_layernorm this->set_eps_layernorm2(eps_layernorm2); this->set_ffn_hidden_size(ffn_hidden_size); this->set_position_bias(position_bias); + this->set_act(act); + this->set_scalar(scalar); } REGISTER_PRIMITIVE_C(kNameEncoderLayer, EncoderLayer); } // namespace mindspore::ops diff --git a/mindspore/core/ops/encoder_layer.h b/mindspore/core/ops/encoder_layer.h index 728d02a3576..8ba624d19e4 100644 --- a/mindspore/core/ops/encoder_layer.h +++ b/mindspore/core/ops/encoder_layer.h @@ -42,9 
+42,11 @@ class MIND_API EncoderLayer : public BaseOperator { /// \param[in] eps_layernorm1 Define eps layernorm1. /// \param[in] eps_layernorm2 Define eps layernorm2. /// \param[in] ffn_hidden_size Define ffn hidden size. - /// \param[in] position_bias Define ffn position_bias. + /// \param[in] position_bias Define ffn. + /// \param[in] scalar Define scalar. + /// \param[in] act Define act. void Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, int64_t ffn_hidden_size, - bool position_bias, bool post_layernorm); + bool position_bias, bool post_layernorm, bool scalar, std::string act); void set_head_num(int64_t head_num); void set_head_size(int64_t head_size); void set_post_layernorm(bool post_layernorm); @@ -52,6 +54,8 @@ class MIND_API EncoderLayer : public BaseOperator { void set_eps_layernorm2(float eps_layernorm2); void set_ffn_hidden_size(int64_t ffn_hidden_size); void set_position_bias(bool position_bias); + void set_scalar(bool scalar); + void set_act(std::string act); int64_t get_head_num() const; int64_t get_head_size() const; bool get_post_layernorm() const; @@ -59,6 +63,8 @@ class MIND_API EncoderLayer : public BaseOperator { float get_eps_layernorm2() const; int64_t get_ffn_hidden_size() const; bool get_position_bias() const; + bool get_scalar() const; + std::string get_act() const; }; } // namespace ops } // namespace mindspore diff --git a/mindspore/core/ops/op_name.h b/mindspore/core/ops/op_name.h index f333be41b25..1f8d61e0e4e 100644 --- a/mindspore/core/ops/op_name.h +++ b/mindspore/core/ops/op_name.h @@ -379,6 +379,7 @@ constexpr auto kSampleNum = "sample_num"; constexpr auto kRoiEndMode = "roi_end_mode"; constexpr auto kUpper = "upper"; constexpr auto kConjugate = "conjugate"; +constexpr auto kScalar = "scalar"; constexpr auto kEncoderLayerNumHeads = "head_num"; constexpr auto kEncoderLayerSizePerHead = "head_size"; constexpr auto kEncoderLayerPostLayernorm = "post_layernorm"; @@ -394,6 +395,8 @@ constexpr auto kDecoderLayerEpsLayerNorm2 = "eps_layernorm2"; constexpr auto kDecoderLayerEpsLayerNorm3 = "eps_layernorm3"; constexpr auto kDecoderLayerPositionBias1 = "position_bias1"; constexpr auto kDecoderLayerPositionBias2 = "position_bias2"; +constexpr auto kDecoderLayerScalar1 = "scalar"; +constexpr auto kDecoderLayerScalar2 = "scalar"; constexpr auto kPositionBias = "position_bias"; constexpr auto KExclusive = "exclusive"; constexpr auto KReverse = "reverse"; diff --git a/mindspore/lite/schema/ops.fbs b/mindspore/lite/schema/ops.fbs index d1ac11c31e9..e5c447204c4 100644 --- a/mindspore/lite/schema/ops.fbs +++ b/mindspore/lite/schema/ops.fbs @@ -397,7 +397,7 @@ table Attention { head_num: long; head_size: long; cross: bool; - position_bias: bool; + scalar: bool; } table Conv2DBackpropFilterFusion { @@ -1302,6 +1302,10 @@ table Log1p { table TensorScatterAdd { } +table ScatterElements { + axis: long; +} + table EncoderLayer { head_num: long; head_size: long; @@ -1310,8 +1314,10 @@ table EncoderLayer { eps_layernorm2: float; ffn_hidden_size: long; position_bias: bool; + scalar: bool; + act: string; } - + table DecoderLayer { head_num: long; head_size: long; @@ -1322,8 +1328,7 @@ table DecoderLayer { ffn_hidden_size: long; position_bias1: bool; position_bias2: bool; -} - -table ScatterElements { - axis: long; + scalar1: bool; + scalar2: bool; + act: string; } diff --git a/mindspore/lite/src/common/ops/ops_def.cc b/mindspore/lite/src/common/ops/ops_def.cc index 12ae6875a61..910bfa99f3b 100644 --- a/mindspore/lite/src/common/ops/ops_def.cc 
+++ b/mindspore/lite/src/common/ops/ops_def.cc @@ -397,6 +397,7 @@ OP_SCHEMA_DEF(Attention) OP_ATTR(head_num, long) OP_ATTR(head_size, long); OP_ATTR(cross, bool) +OP_ATTR(scalar, bool) OP_SCHEMA_DEF_END(Attention) OP_SCHEMA_DEF(Conv2DBackpropFilterFusion) @@ -1317,6 +1318,8 @@ OP_ATTR(eps_layernorm1, float) OP_ATTR(eps_layernorm2, float) OP_ATTR(ffn_hidden_size, long) OP_ATTR(position_bias, bool) +OP_ATTR(scalar, bool) +OP_ATTR(act, string) OP_SCHEMA_DEF_END(EncoderLayer) OP_SCHEMA_DEF(DecoderLayer) @@ -1329,4 +1332,7 @@ OP_ATTR(eps_layernorm3, float) OP_ATTR(ffn_hidden_size, long) OP_ATTR(position_bias1, bool) OP_ATTR(position_bias2, bool) +OP_ATTR(scalar1, bool) +OP_ATTR(scalar2, bool) +OP_ATTR(act, string) OP_SCHEMA_DEF_END(DecoderLayer) diff --git a/mindspore/lite/src/common/ops/populate/decoder_layer_populate.cc b/mindspore/lite/src/common/ops/populate/decoder_layer_populate.cc index 125f5949de9..bbafa50d8b6 100644 --- a/mindspore/lite/src/common/ops/populate/decoder_layer_populate.cc +++ b/mindspore/lite/src/common/ops/populate/decoder_layer_populate.cc @@ -38,8 +38,11 @@ OpParameter *PopulateDecoderLayerParameter(const void *prim) { param->eps_layernorm1_ = value->eps_layernorm1(); param->eps_layernorm2_ = value->eps_layernorm2(); param->eps_layernorm3_ = value->eps_layernorm3(); - param->position_bias1_ = value->position_bias2(); + param->position_bias1_ = value->position_bias1(); param->position_bias2_ = value->position_bias2(); + param->scalar1 = value->scalar1(); + param->scalar2 = value->scalar2(); + // param->act = value->act()->c_str(); return reinterpret_cast(param); } diff --git a/mindspore/lite/src/common/ops/populate/encoder_layer_populate.cc b/mindspore/lite/src/common/ops/populate/encoder_layer_populate.cc index 3b636ad5d7f..0aedd56fdc8 100644 --- a/mindspore/lite/src/common/ops/populate/encoder_layer_populate.cc +++ b/mindspore/lite/src/common/ops/populate/encoder_layer_populate.cc @@ -39,6 +39,8 @@ OpParameter *PopulateEncoderLayerParameter(const void *prim) { param->eps_layernorm2_ = value->eps_layernorm2(); param->ffn_hidden_size_ = value->ffn_hidden_size(); param->position_bias_ = value->position_bias(); + param->scalar = value->scalar(); + param->act = value->act()->c_str(); return reinterpret_cast(param); } diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc index 4e7d9d2b96f..c7bbc110876 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc @@ -130,12 +130,12 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.attn2.projection_bias = !params.attn2.position_bias; params.attn2.is_cross = true; params.attn2.cublas_handle = GetCublasHandle(); - params.is_act = !params.attn1.position_bias; + params.is_act = decoder_op->get_act().c_str();//params.attn1.position_bias ? "relu" : "gelu" ; params.has_beta = !params.attn1.position_bias; params.has_bias = !params.attn1.position_bias; params.ffn_bias = !params.attn1.position_bias; - std::cout << "params.attn1.position_bias: " << params.attn1.position_bias - << "params.attn2.position_bias: " << params.attn2.position_bias << std::endl; + params.attn1.scalar = decoder_op->get_scalar1(); + params.attn2.scalar = decoder_op->get_scalar2(); auto compute_type = runtime_->GetRuntimePrecisionMode(); if (is_ffn_fp16_) { size_t start_fp16 = (params.attn1.position_bias) ? 
C13NUM : C18NUM; diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h index 5060992be89..a7006edad2d 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h @@ -41,7 +41,7 @@ class DecoderTensorRT : public TensorRTOp { private: nvinfer1::ITensor *castTensor(TensorRTContext *ctx, const TensorInfo &ms_tensor, const std::string &op_name); - bool is_ffn_fp16_ = true; + bool is_ffn_fp16_ = false; }; constexpr auto DECODER_PLUGIN_NAME{"DecoderPlugin"}; diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc index cfe05de35d2..868ac4d11ca 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc @@ -115,13 +115,13 @@ int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.attn.is_cross = false; params.attn.position_bias = encoder_op->get_position_bias(); params.attn.projection_bias = !params.attn.position_bias; - std::cout << "params.attn.position_bias" << params.attn.position_bias << std::endl; params.attn.qkv_bias = !params.attn.position_bias; params.attn.projection_bias = !params.attn.position_bias; params.has_beta = !params.attn.position_bias; params.has_bias = !params.attn.position_bias; params.ffn_bias = !params.attn.position_bias; - params.is_act = !params.attn.position_bias; + params.is_act = encoder_op->get_act().c_str();//params.attn.position_bias ? "relu" : "gelu";//encoder_op->get_act(); + params.attn.scalar = encoder_op->get_scalar();//!params.attn.position_bias; auto compute_type = runtime_->GetRuntimePrecisionMode(); if (is_ffn_fp16_) { size_t start_fp16 = (params.layernorm_post) ? 
C7NUM : C9NUM; diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h index 4364410f9be..f73beb598da 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h @@ -41,7 +41,7 @@ class EncoderTensorRT : public TensorRTOp { private: nvinfer1::ITensor *castTensor(TensorRTContext *ctx, const TensorInfo &ms_tensor, const std::string &op_name); - bool is_ffn_fp16_ = true; + bool is_ffn_fp16_ = false; }; constexpr auto ENCODER_PLUGIN_NAME{"EncoderPlugin"}; diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc index d2f3a69aded..dbeb5ed610f 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc @@ -81,6 +81,7 @@ int MhaTensorRT::AddInnerOp(TensorRTContext *ctx) { params.projection_bias = !is_position_bias; params.is_cross = is_cross; params.position_bias = is_position_bias; + params.scalar = !is_position_bias; auto plugin = std::make_shared(input_tensor->getName(), compute_type, params, GetCublasLtHandle(), device_id_); const int input_number = inputs().size(); @@ -93,6 +94,9 @@ int MhaTensorRT::AddInnerOp(TensorRTContext *ctx) { MS_LOG(ERROR) << "add mha op failed for TensorRT."; return RET_ERROR; } + std::cout<<"params.is_cross: "<setName((op_name_ + "plugin_attention").c_str()); nvinfer1::ITensor *attn_tensor = mha_layer->getOutput(0); #ifndef TEST_ diff --git a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc index 51a32e643c9..9d13010a06c 100644 --- a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc @@ -396,7 +396,7 @@ STATUS GetAxis(const ValueNodePtr n, std::vector *axes) { STATUS DecoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const EquivPtr &equiv, int *head_num, int *head_size, float *eps1, float *eps2, float *eps3, bool *is_position_bias1, - bool *is_position_bias2) const { + bool *is_position_bias2, bool *scalar1, bool *scalar2) const { auto attn_input = GetAttribute(func_graph, equiv, is_attention_); MS_ASSERT(attn_input != nullptr); auto attn_prim = ops::GetOperator(attn_input); @@ -409,6 +409,9 @@ STATUS DecoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const Eq if (attn_prim->GetAttr(ops::kPositionBias) != nullptr) { *is_position_bias1 = attn_prim->get_position_bias(); } + if (attn_prim->GetAttr(ops::kScalar) != nullptr) { + *scalar1 = attn_prim->get_scalar(); + } if ((*equiv)[is_attention_] == nullptr || !utils::isa((*equiv)[is_attention_])) { MS_LOG(ERROR) << "is_attention_ is not AnfNodePtr"; return RET_ERROR; @@ -418,6 +421,9 @@ STATUS DecoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const Eq auto attn_cross_prim = ops::GetOperator(attn_cross_input); if (attn_cross_prim->GetAttr(ops::kPositionBias) != nullptr) { *is_position_bias2 = attn_cross_prim->get_position_bias(); + } + if (attn_cross_prim->GetAttr(ops::kScalar) != nullptr) { + *scalar2 = attn_cross_prim->get_scalar(); } if (is_layernorm_fusion_) { auto layrn1_input = GetAttribute(func_graph, equiv, is_layernorm1_); @@ -465,19 +471,24 @@ STATUS DecoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const Eq // return false; // } // 
std::cout << GetValue(eps1_value_ptr) <(GetAttribute(func_graph, equiv, eps2_)); // std::cout << GetValue(eps2_value_ptr) <(GetAttribute(func_graph, equiv, eps3_)); - *eps3=1e-6; + *eps3=1e-5; + } } if (!is_position_bias_) { if (!IsActGELU(func_graph, equiv)) { return false; } + act_ = "gelu"; + } + else{ + act_= "relu"; } return RET_OK; } @@ -496,14 +507,17 @@ std::shared_ptr DecoderLayerFusion::CreatePrim(const FuncGrap float eps3 = 1e-6; bool is_position_bias1 = false; bool is_position_bias2 = false; + bool scalar1 = true; + bool scalar2 = true; if (CheckPattern(func_graph, equiv, &head_num, &head_size, &eps1, &eps2, &eps3, &is_position_bias1, - &is_position_bias2)) { + &is_position_bias2, &scalar1, &scalar2)) { + std::cout<<"act fusion"<Init(head_num, head_size, eps1, eps2, eps3, ffn_hidden_size, is_position_bias1, is_position_bias2, - post_layernorm); + post_layernorm, scalar1, scalar2, act_); return decoder_layer_prim; } diff --git a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h index 73bb8b15db9..7fd40c8cd7c 100644 --- a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h +++ b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h @@ -54,7 +54,7 @@ class DecoderLayerFusion : public MultiplePatternProcessPass { bool post_layernorm, int64_t ffn_hidden_size) const; lite::STATUS CheckPattern(const FuncGraphPtr &func_graph, const EquivPtr &equiv, int *head_num, int *head_size, float *eps1, float *eps2, float *eps3, bool *is_position_bias1, - bool *is_position_bias2) const; + bool *is_position_bias2, bool *scalar1, bool *scalar2) const; AnfNodePtr GetAttribute(const FuncGraphPtr &func_graph, const EquivPtr &equiv, VarPtr node_name) const; bool IsActGELU(const FuncGraphPtr &func_graph, const EquivPtr &equiv) const; @@ -102,6 +102,7 @@ class DecoderLayerFusion : public MultiplePatternProcessPass { mutable VarPtr eps3_{nullptr}; mutable bool is_position_bias_{false}; mutable bool is_layernorm_fusion_{false}; + mutable std::string act_{"gelu"}; }; } // namespace opt } // namespace mindspore diff --git a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc index ac761e2a546..cc2c8f25fc2 100644 --- a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc @@ -308,7 +308,7 @@ AnfNodePtr EncoderLayerFusion::GetAttribute(const FuncGraphPtr &func_graph, cons return input; } STATUS EncoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const EquivPtr &equiv, int *head_num, - int *head_size, float *eps1, float *eps2) const { + int *head_size, float *eps1, float *eps2, bool *scalar) const { auto attn_input = GetAttribute(func_graph, equiv, is_attention_); MS_ASSERT(attn_input != nullptr); auto attn_prim = ops::GetOperator(attn_input); @@ -320,6 +320,9 @@ STATUS EncoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const Eq } if (attn_prim->GetAttr(ops::kPositionBias) != nullptr) { is_position_bias_ = attn_prim->get_position_bias(); + } + if (attn_prim->GetAttr(ops::kScalar) != nullptr) { + *scalar = attn_prim->get_scalar(); } if (is_layernorm_fusion_) { auto layrn1_input = GetAttribute(func_graph, equiv, is_layernorm1_); @@ -337,6 +340,10 @@ STATUS EncoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const Eq if (!IsActGELU(func_graph, equiv, is_act_)) { return RET_ERROR; } + act_= "gelu"; + } + else{ + act_= "relu"; } return 
RET_OK; } @@ -350,12 +357,14 @@ std::shared_ptr EncoderLayerFusion::CreatePrim(const FuncGrap } int head_num = 0; int head_size = 0; - float eps1 = 1e-6; - float eps2 = 1e-6; - if (CheckPattern(func_graph, equiv, &head_num, &head_size, &eps1, &eps2)) { + float eps1 = 1e-5; + float eps2 = 1e-5; + bool scalar = true; + if (CheckPattern(func_graph, equiv, &head_num, &head_size, &eps1, &eps2, &scalar)) { return nullptr; } - encoder_layer_prim->Init(head_num, head_size, eps1, eps2, ffn_hidden_size, is_position_bias_, post_layernorm); + std::cout<<"act fusion"<Init(head_num, head_size, eps1, eps2, ffn_hidden_size, is_position_bias_, post_layernorm, scalar ,act_); return encoder_layer_prim; } diff --git a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h index a2aef8e12b0..3525095a67b 100644 --- a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h +++ b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h @@ -59,7 +59,7 @@ class EncoderLayerFusion : public MultiplePatternProcessPass { AnfNodePtr GetAttribute(const FuncGraphPtr &func_graph, const EquivPtr &equiv, VarPtr node_name) const; bool IsActGELU(const FuncGraphPtr &func_graph, const EquivPtr &equiv, const VarPtr &input_prim) const; lite::STATUS CheckPattern(const FuncGraphPtr &func_graph, const EquivPtr &equiv, int *head_num, int *head_size, - float *eps1, float *eps2) const; + float *eps1, float *eps2, bool *scalar) const; std::shared_ptr CreatePrim(const FuncGraphPtr &func_graph, const EquivPtr &equiv, bool post_layernorm, int64_t ffn_hidden_size) const; @@ -85,6 +85,7 @@ class EncoderLayerFusion : public MultiplePatternProcessPass { mutable VarPtr is_layernorm2_{nullptr}; mutable bool is_position_bias_{false}; mutable bool is_layernorm_fusion_{false}; + mutable std::string act_{"gelu"}; mutable VarPtr is_act_{nullptr}; }; } // namespace opt diff --git a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc index cc5db31c7fd..54297ee34dc 100644 --- a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc @@ -621,6 +621,7 @@ AnfNodePtr MultiHeadAttentionFusion::Process(const std::string &pattern_name, co if (pattern_name == kMPAWithMaskPatternNameT5New || pattern_name == kMPAWithMaskTransposePatternNameT5New || pattern_name == kMPAWithMaskPatternNameT5New2) { t5_x_ = true; + scalar_ = (pattern_name == kMPAWithMaskPatternNameT5New2) ? 
false : true; } return CreateMaskedMultiHeadAttentionNode(func_graph, equiv, node->fullname_with_scope(), true); } @@ -759,7 +760,7 @@ std::shared_ptr MultiHeadAttentionFusion::CreatePrim(const Equiv if (!CheckPattern(equiv, &head_num, &head_size)) { return nullptr; } - attention_prim->Init(head_num, head_size, t5_x_, cross); + attention_prim->Init(head_num, head_size, t5_x_, cross, scalar_); return attention_prim; } diff --git a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.h b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.h index 980ab035119..5ff33aa3a74 100644 --- a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.h +++ b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.h @@ -121,6 +121,7 @@ class MultiHeadAttentionFusion : public MultiplePatternProcessPass { mutable VarPtr k_transpose_{nullptr}; mutable bool t5_x_{false}; + mutable bool scalar_{true}; }; } // namespace opt } // namespace mindspore diff --git a/trc/transformer/cfg_bert.config b/trc/transformer/cfg_bert.config index 99e4f5bd9ab..cc543ad3d77 100755 --- a/trc/transformer/cfg_bert.config +++ b/trc/transformer/cfg_bert.config @@ -1,2 +1,2 @@ [gpu_context] -input_shape=input_ids:[transformer_encoder_layer,128];token_type_ids:[transformer_encoder_layer,128];input_mask:[transformer_encoder_layer,128] +input_shape=input_ids:[transformer_decoder_layer,128];token_type_ids:[transformer_decoder_layer,128];input_mask:[transformer_decoder_layer,128] diff --git a/trc/transformer/ftBench.py b/trc/transformer/ftBench.py index 106d3763d07..5037376a671 100755 --- a/trc/transformer/ftBench.py +++ b/trc/transformer/ftBench.py @@ -98,6 +98,7 @@ for line_model_arg in models_arg: if batch_size!='1': model_name+=batch_size os.system(f"rm -f {base}/trc/transformer/{model_name}* {base}/trc/transformer/convv_{model_name}*") + os.system(f"cp /home/batya/git-proj/transformer_repo/transformer/models/t5/T5Transformer.py .") ret = os.system(f"docker run --user \"$(id -u):$(id -g)\" -w {base}/trc/transformer --runtime=nvidia -v {base}/../:{base}/../ -v /opt/share:/opt/share --privileged=true {image} python {base}/trc/transformer/train_transformer_export.py {line_model_arg} " ) ret=0 if ret != 0: exit() diff --git a/trc/transformer/models.txt b/trc/transformer/models.txt index 2f547114d30..b19856ddd68 100755 --- a/trc/transformer/models.txt +++ b/trc/transformer/models.txt @@ -1,15 +1,14 @@ -#not work #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_encoder_layer_t5 # --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer_t5 --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_encoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer_t5 # --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer_t5 --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5 #run the following tests before push @@ -24,10 +23,11 @@ #-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer #-b 8 -l 12 -H 12 -S 768 -s 128 -P 1 -m 
transformer_encoder_layer #-b 32 -l 12 -H 12 -S 768 -s 128 -P 0 -f 3072 -m bert -#-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -f 3072 -m bert + +#-b 1 -l 12 -H 12 -S 768 -s 128 -m T5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer_t5 -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_encoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -f 3072 -x 0 -m transformer_encoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer_t5 @@ -56,7 +56,7 @@ #-b 3 -l 66 -s 20 -H 3 -S 15 -p -m mha_x2 #-b 3 -l 66 -s 20 -t 40 -H 3 -S 15 -p 0 -m mha_x1 #-b 1 -l 66 -s 128 -H 4 -S 1024 -p 0 -m mha_x1 -#-b 1 -l 2 -s 128 -H 2 -S 8 -m T5 +-b 1 -l 2 -s 12 -t 12 -H 2 -S 4 -m T5 #-b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -m transformer_encoder_layer #-b 1 -l 12 -H 4 -S 512 -s 128 -f 3072 -P 1 -m transformer_encoder_layer diff --git a/trc/transformer/t.config b/trc/transformer/t.config index 0ecc92cc5ec..5cfcbc353ed 100755 --- a/trc/transformer/t.config +++ b/trc/transformer/t.config @@ -1,4 +1,4 @@ [registry] #fusion_blacklists="MultiHeadAttentionFusion" -#fusion_blacklists="EncoderLayerFusion" +#fusion_blacklists="EncoderLayerFusion","DecoderLayerFusion" #fusion_blacklists="DecoderLayerFusion" diff --git a/trc/transformer/train_transformer_export.py b/trc/transformer/train_transformer_export.py index ae295928e0e..6f7d8b6f900 100755 --- a/trc/transformer/train_transformer_export.py +++ b/trc/transformer/train_transformer_export.py @@ -12,6 +12,7 @@ sys.path.append(model_zoo_path) sys.path.append("../../../transformer/transformer/models") sys.path.append("./T5") from MultiHeadTester import MultiHeadAttentionX, TransformerDecoderLayerX,TransformerEncoderLayerX,FeedForwardX +import T5Transformer as T from mindspore.common.parameter import Parameter from mindspore.common.initializer import Tensor import mindspore as M @@ -344,15 +345,16 @@ def transformer_encoder_layer_t5_create(): name = "transformer_encoder_layer_t5" if (post_layernorm): print("post_layernorm") - model = T5_TF.TransformerEncoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, seq_length=seq, - num_heads=head_num, post_layernorm_residual=True, has_bias=False, hidden_act=None) + model = T.TransformerEncoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, seq_length=seq, + num_heads=head_num, post_layernorm_residual=True, has_bias=True, hidden_act=None,attention_dropout_rate=0.0,hidden_dropout_rate=0.0) else: - model = T5_TF.TransformerEncoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, seq_length=seq, - num_heads=head_num, has_bias=False, hidden_act=None) + model = T.TransformerEncoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, seq_length=seq, + num_heads=head_num, has_bias=True, hidden_act=None,attention_dropout_rate=0.0,hidden_dropout_rate=0.0) encoder_input_value = M.Tensor(np.random.normal(0., 0.5, (batch, seq, hid_size)), M.float32) encoder_input_mask = M.Tensor(np.random.normal(0., 0.5, (batch, seq, seq)), M.float32) - pos = M.Tensor(np.random.normal(0., 0.5, (batch, head_num, seq, tgt_seq_len)), M.float32) - + # pos = M.Tensor(np.random.normal(0., 0.5, (batch, head_num, seq, tgt_seq_len)), M.float32) + # encoder_input_value = M.Tensor(np.zeros((batch, seq, hid_size)), M.float32) + # encoder_input_mask = 
M.Tensor(np.zeros((batch, seq, seq)), M.float32) q = model.attention.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size k = model.attention.dense2.weight.asnumpy()#.transpose() v = model.attention.dense3.weight.asnumpy()#.transpose() @@ -363,31 +365,30 @@ def transformer_encoder_layer_t5_create(): wp = model.attention.projection.weight omw = model.output.mapping.weight opw = model.output.projection.weight - gl1 = model.layernorm1.weight - gl2 = model.layernorm2.weight + gl1 = model.layernorm1.gamma + gl2 = model.layernorm2.gamma suffix = str(compute_type) suffix = suffix[-2:] saveT(encoder_input_value, name + "_input1.fp" + suffix) saveT(encoder_input_mask, name + "_input2.fp" + suffix) - saveT(pos, name + "_input3.fp" + suffix) - saveT(gl1, name + "_weight1.fp" + suffix) - saveT(wt, name + "_weight2.fp" + suffix) - saveT(wp, name + "_weight3.fp" + suffix) - saveT(gl2, name + "_weight4.fp" + suffix) - if ffn_fp16 == True: - saveTensorToHalf(omw, name + "_weight5.fp" + "16") - saveTensorToHalf(opw, name + "_weight6.fp" + "16") - else: - saveT(omw, name + "_weight5.fp" + suffix) - saveT(opw, name + "_weight6.fp" + suffix) + # saveT(pos, name + "_input3.fp" + suffix) + # saveT(gl1, name + "_weight1.fp" + suffix) + # saveT(wt, name + "_weight2.fp" + suffix) + # saveT(wp, name + "_weight3.fp" + suffix) + # saveT(gl2, name + "_weight4.fp" + suffix) + # if ffn_fp16 == True: + # saveTensorToHalf(omw, name + "_weight5.fp" + "16") + # saveTensorToHalf(opw, name + "_weight6.fp" + "16") + # else: + # saveT(omw, name + "_weight5.fp" + suffix) + # saveT(opw, name + "_weight6.fp" + suffix) _cell_graph_executor.compile(model, encoder_input_value, - encoder_input_mask, - pos) - y = model(encoder_input_value, encoder_input_mask, position_bias = pos) + encoder_input_mask) + y = model(encoder_input_value, encoder_input_mask) print('name=',name) - export(model, encoder_input_value, encoder_input_mask, pos, file_name= name + "_fwd", file_format='MINDIR') + export(model, encoder_input_value, encoder_input_mask, file_name= name + "_fwd", file_format='MINDIR') # if app=="ch": f_y=open(f'./{name}_output.txt','w') out_name='output1' @@ -404,19 +405,38 @@ def transformer_decoder_layer_t5_create(): name = "transformer_decoder_layer_t5" if (post_layernorm): print("post_layernorm true") - model = T5_TF.TransformerDecoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, - tgt_seq_length=tgt_seq_len,num_heads=head_num, post_layernorm_residual=True, use_past=False, has_bias=False, hidden_act=None) + model = T.TransformerDecoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, + tgt_seq_length=tgt_seq_len,num_heads=head_num, post_layernorm_residual=True, use_past=False, has_bias=False, hidden_act="relu") else: print("post_layernorm false") - model = T5_TF.TransformerDecoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, - tgt_seq_length=tgt_seq_len,num_heads=head_num,use_past=False, has_bias=False, hidden_act=None) + model = T.TransformerDecoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, + tgt_seq_length=tgt_seq_len,num_heads=head_num,use_past=False, has_bias=False, hidden_act="relu") hidden_stats = M.Tensor(np.random.normal(0., 0.5, (batch, tgt_seq_len, hid_size)), M.float32) decoder_mask = M.Tensor(np.random.normal(0., 0.5, (batch, seq, seq)), M.float32) encoder_output = M.Tensor(np.random.normal(0., 0.5, (batch, 
seq, hid_size)), M.float32) memory_mask = M.Tensor(np.random.normal(0., 0.5, (batch, tgt_seq_len,seq)), M.float32) pos = M.Tensor(np.random.normal(0., 0.5, (batch, head_num, seq, tgt_seq_len)), M.float32) encoder_pos = M.Tensor(np.random.normal(0., 0.5, (batch, head_num, seq, tgt_seq_len)), M.float32) - + actual_seq = seq // 2 + if compress: + input_value = hidden_stats.asnumpy() + input_value[:,actual_seq:,:] = 0 + hidden_stats = M.Tensor.from_numpy(input_value) + decoder_input_mask_value = decoder_mask.asnumpy() + decoder_input_mask_value[:,:,actual_seq:] = 0 + decoder_mask = M.Tensor.from_numpy(decoder_input_mask_value) + encoder_output_value = encoder_output.asnumpy() + encoder_output_value[:,:,actual_seq:] = 0 + encoder_output = M.Tensor.from_numpy(encoder_output_value) + memory_mask_value = memory_mask.asnumpy() + memory_mask_value[:,:,actual_seq:] = 0 + memory_mask = M.Tensor.from_numpy(memory_mask_value) + pos_value = pos.asnumpy() + pos_value[:,:,actual_seq:] = 0 + pos = M.Tensor.from_numpy(pos_value) + encoder_pos_value = encoder_pos.asnumpy() + encoder_pos_value[:,:,actual_seq:] = 0 + encoder_pos = M.Tensor.from_numpy(encoder_pos_value) q = model.attention.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size k = model.attention.dense2.weight.asnumpy()#.transpose() v = model.attention.dense3.weight.asnumpy()#.transpose() @@ -438,9 +458,9 @@ def transformer_decoder_layer_t5_create(): print('omw.asnumpy().shape',omw.asnumpy().shape) opw = model.output.projection.weight - gl1 = model.layernorm1.weight - gl2 = model.layernorm2.weight - gl3 = model.cross_attention_layernorm.weight + gl1 = model.layernorm1.gamma + gl2 = model.layernorm2.gamma + gl3 = model.cross_attention_layernorm.gamma suffix = str(compute_type) suffix = suffix[-2:] @@ -465,9 +485,12 @@ def transformer_decoder_layer_t5_create(): saveT(pos, name + "_input5.fp" + suffix) saveT(encoder_pos, name + "_input6.fp" + suffix) _cell_graph_executor.compile(model, hidden_stats, decoder_mask, encoder_output, memory_mask, pos, encoder_pos) - y = model(hidden_stats, decoder_mask, encoder_output, memory_mask , position_bias=pos, encoder_decoder_position_bias = encoder_pos) - print("omw.shape",np.array(omw).shape) + y = model(hidden_stats, decoder_mask, encoder_output, memory_mask , self_bias=pos, encoder_attention_bias = encoder_pos) export(model, hidden_stats, decoder_mask, encoder_output, memory_mask, pos, encoder_pos, file_name= name + "_fwd", file_format='MINDIR') + if compress: + y_num = y.asnumpy() + y_num[:,actual_seq:,:] = 0 + y = M.Tensor.from_numpy(y_num) f_y=open(f'./{name}_output.txt','w') saveCalib("output1", np.array(y), f_y)#2 dims f_y.close() -- Gitee From ea84063507ef011f9df29f16894b2e96acb35516 Mon Sep 17 00:00:00 2001 From: batya kroizer Date: Sun, 15 Jan 2023 13:00:26 +0200 Subject: [PATCH 22/39] add transformer pretrain -not work --- .../delegate/tensorrt/op/decoder_tensorrt.cc | 10 ++-- .../delegate/tensorrt/op/encoder_tensorrt.cc | 8 ++- .../delegate/tensorrt/op/mha_tensorrt.cc | 3 +- .../optimizer/fusion/decoder_layer_fusion.cc | 54 +++++++++-------- .../optimizer/fusion/decoder_layer_fusion.h | 6 +- .../optimizer/fusion/encoder_layer_fusion.cc | 59 ++++++++++++------- .../optimizer/fusion/encoder_layer_fusion.h | 9 ++- .../fusion/multi_head_attention_fusion.cc | 29 +++++---- .../fusion/multi_head_attention_fusion.h | 4 +- trc/transformer/cfg_bert.config | 2 +- trc/transformer/ftBench.py | 5 ++ trc/transformer/models.txt | 28 +++++---- trc/transformer/train_transformer_export.py | 14 +++++ 13 files 
changed, 150 insertions(+), 81 deletions(-) diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc index 51cd0bf8e49..91d21774a2b 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc @@ -117,9 +117,9 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.attn1.qkv_bias = !params.attn1.position_bias; params.attn1.projection_bias = !params.attn1.position_bias; params.attn1.is_cross = false; - params.attn1.cublas_handle = GetCublasHandle(); - + params.attn1.scalar = decoder_op->get_scalar1(); + params.attn1.mask = true; params.attn2.head_num = params.head_num; params.attn2.head_size = params.head_size; params.attn2.hidden_size = params.hidden_size; @@ -128,12 +128,12 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.attn2.projection_bias = !params.attn2.position_bias; params.attn2.is_cross = true; params.attn2.cublas_handle = GetCublasHandle(); - params.is_act = decoder_op->get_act().c_str();//params.attn1.position_bias ? "relu" : "gelu" ; + params.attn2.scalar = decoder_op->get_scalar2(); + params.attn2.mask = true; + params.is_act = decoder_op->get_act().c_str(); params.has_beta = !params.attn1.position_bias; params.has_bias = !params.attn1.position_bias; params.ffn_bias = !params.attn1.position_bias; - params.attn1.scalar = decoder_op->get_scalar1(); - params.attn2.scalar = decoder_op->get_scalar2(); auto compute_type = runtime_->GetRuntimePrecisionMode(); if (is_ffn_fp16_) { size_t start_fp16 = (params.attn1.position_bias) ? C13NUM : C18NUM; diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc index 7be2fb4f8a5..233b25d8ea7 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc @@ -38,7 +38,7 @@ constexpr std::size_t kTwo = 2; int EncoderTensorRT::IsSupport(const BaseOperatorPtr &base_operator, const std::vector &in_tensors, const std::vector &out_tensors) { - if (in_tensors.size() != C14NUM && in_tensors.size() != C9NUM) { + if (in_tensors.size() != C14NUM && in_tensors.size() != C9NUM && in_tensors.size() != C13NUM) { MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); return RET_ERROR; } @@ -123,8 +123,9 @@ int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.has_beta = !params.attn.position_bias; params.has_bias = !params.attn.position_bias; params.ffn_bias = !params.attn.position_bias; - params.is_act = encoder_op->get_act().c_str();//params.attn.position_bias ? "relu" : "gelu";//encoder_op->get_act(); - params.attn.scalar = encoder_op->get_scalar();//!params.attn.position_bias; + params.attn.mask = true; + params.is_act = encoder_op->get_act().c_str(); + params.attn.scalar = encoder_op->get_scalar(); auto compute_type = runtime_->GetRuntimePrecisionMode(); if (is_ffn_fp16_) { size_t start_fp16 = (params.layernorm_post) ? 
C7NUM : C9NUM; @@ -226,6 +227,7 @@ void EncoderPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, params_.attn.tgt_seq_len = request_tgt_seq_len; num_of_inputs_ = nbInputs; num_of_outputs_ = nbOutputs; + if(num_of_inputs_ == C13NUM) params_.attn.mask = false; } size_t EncoderPlugin::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const noexcept { diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc index dbeb5ed610f..3a6d394a4ca 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc @@ -81,7 +81,8 @@ int MhaTensorRT::AddInnerOp(TensorRTContext *ctx) { params.projection_bias = !is_position_bias; params.is_cross = is_cross; params.position_bias = is_position_bias; - params.scalar = !is_position_bias; + params.scalar = mha_op->get_scalar(); + params.mask = true; auto plugin = std::make_shared(input_tensor->getName(), compute_type, params, GetCublasLtHandle(), device_id_); const int input_number = inputs().size(); diff --git a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc index 22f459d7881..84252f52359 100644 --- a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc @@ -146,24 +146,26 @@ VectorRef DecoderLayerFusion::DefineLayerNorm(VectorRef input, VarPtr gamma, Var } VectorRef DecoderLayerFusion::DefinePatternDecoderLayer(bool post_layernorm = true, bool layernorm_fusion = false, - bool is_position_bias = false) const { + bool is_position_bias = false, bool mask =true) const { auto is_reshape1 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-decoder"); MS_CHECK_TRUE_RET(is_reshape1 != nullptr, {}); auto var1 = std::make_shared("var1-reshape"); MS_CHECK_TRUE_RET(var1 != nullptr, {}); auto reshape1 = VectorRef({is_reshape1, hidden_stats_, var1}); - VectorRef attention, attention_cross, tuple2, tuple3, matmul2, tuple4, tuple5; + VectorRef inputs, attention_cross, tuple2, tuple3, matmul2, tuple4, tuple5; if (is_position_bias) { - attention = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), + auto inputs = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), getTuple(post_layernorm, layernorm_fusion, is_position_bias), getTuple(post_layernorm, layernorm_fusion, is_position_bias), weight_attn_qkv_, - weight_attn_o_, position_bias_, mask_}); + weight_attn_o_, position_bias_}); } else { - attention = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), + auto inputs = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), getTuple(post_layernorm, layernorm_fusion, is_position_bias), getTuple(post_layernorm, layernorm_fusion, is_position_bias), weight_attn_qkv_, - weight_attn_o_, bias_attn_qkv_, bias_attn_o_, mask_}); + weight_attn_o_, bias_attn_qkv_, bias_attn_o_}); } + if(mask)inputs.push_back(mask_); + auto attention = VectorRef(inputs); if (is_position_bias) { tuple4 = attention; } else { @@ -250,6 +252,8 @@ std::unordered_map DecoderLayerFusion::DefinePatterns() patterns[kPatternDecoderLayerPost] = DefinePatternDecoderLayer(true, true, false); patterns[kPatternDecoderT5Pre] = 
DefinePatternDecoderLayer(false, false, true); patterns[kPatternDecoderT5Post] = DefinePatternDecoderLayer(true, false, true); + patterns[kPatternDecoderLayerWhitoutMaskPre] = DefinePatternDecoderLayer(false, true, false, false); + patterns[kPatternDecoderLayerWhitoutMaskPost] = DefinePatternDecoderLayer(true, true, false, false); return patterns; } @@ -260,14 +264,19 @@ AnfNodePtr DecoderLayerFusion::Process(const std::string &pattern_name, const mi } if (pattern_name == kPatternDecoderT5Pre || pattern_name == kPatternDecoderT5Post) { is_position_bias_ = true; - } else if (pattern_name == kPatternDecoderLayerPre || pattern_name == kPatternDecoderLayerPost) { + } else if (pattern_name == kPatternDecoderLayerPre || pattern_name == kPatternDecoderLayerPost || + pattern_name == kPatternDecoderLayerWhitoutMaskPre|| pattern_name == kPatternDecoderLayerWhitoutMaskPost) { is_layernorm_fusion_ = true; } if (pattern_name == kPatternDecoderLayerPre || pattern_name == kPatternDecoderT5Pre) { - return CreateMaskedDecoderLayerFusionNode(func_graph, equiv, node, false); + return CreateMaskedDecoderLayerFusionNode(func_graph, equiv, node, false, true); } else if (pattern_name == kPatternDecoderLayerPost || pattern_name == kPatternDecoderT5Post) { - return CreateMaskedDecoderLayerFusionNode(func_graph, equiv, node, true); + return CreateMaskedDecoderLayerFusionNode(func_graph, equiv, node, true, true); } + if (pattern_name == kPatternDecoderLayerWhitoutMaskPre) + return CreateMaskedDecoderLayerFusionNode(func_graph, equiv, node, false, false); + if (pattern_name == kPatternDecoderLayerWhitoutMaskPost) + return CreateMaskedDecoderLayerFusionNode(func_graph, equiv, node, true, false); return nullptr; } @@ -432,7 +441,7 @@ std::shared_ptr DecoderLayerFusion::CreatePrim(const FuncGrap CNodePtr DecoderLayerFusion::CreateMaskedDecoderLayerFusionNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv, const AnfNodePtr &node, - bool post_layernorm = true) const { + bool post_layernorm = true, bool mask = true) const { MS_ASSERT(func_graph != nullptr); MS_ASSERT(equiv != nullptr); MS_ASSERT(node != nullptr); @@ -466,7 +475,7 @@ CNodePtr DecoderLayerFusion::CreateMaskedDecoderLayerFusionNode(const FuncGraphP auto gamma1 = utils::cast((*equiv)[gamma1_]); auto gamma2 = utils::cast((*equiv)[gamma2_]); auto gamma3 = utils::cast((*equiv)[gamma3_]); - input_mask = utils::cast((*equiv)[mask_]); + input_mask = mask ? 
utils::cast((*equiv)[mask_]) : nullptr; auto cross_mask = utils::cast((*equiv)[cross_mask_]); auto base_shape_ptr = weight_m->Shape(); MS_EXCEPTION_IF_NULL(base_shape_ptr); @@ -481,20 +490,17 @@ CNodePtr DecoderLayerFusion::CreateMaskedDecoderLayerFusionNode(const FuncGraphP MS_CHECK_TRUE_RET(decoder_layer_prim_c != nullptr, nullptr); auto value_node = NewValueNode(decoder_layer_prim_c); MS_CHECK_TRUE_RET(value_node != nullptr, nullptr); - std::vector new_node_inputs; + std::vector new_node_inputs = {value_node, input, gamma1}; if (is_position_bias_) { - new_node_inputs = { - value_node, input, gamma1, weight_qkv, input_mask, position_bias, weight_attn_o, - gamma2, encoder_output, weight_attn_q, weight_attn_kv, cross_mask, position_bias_cross, weight_attn_cross_o, - gamma3, weight_m, weight_p}; + new_node_inputs.insert(new_node_inputs.end(),{weight_qkv, input_mask}); + if(mask)new_node_inputs.push_back(input_mask); + new_node_inputs.insert(new_node_inputs.end(),{position_bias, weight_attn_o, + gamma2, encoder_output, weight_attn_q, weight_attn_kv, cross_mask, position_bias_cross, weight_attn_cross_o, + gamma3, weight_m, weight_p}); } else { - new_node_inputs = {value_node, - input, - gamma1, - beta1, - weight_qkv, - bias_attn_qkv, - input_mask, + new_node_inputs.insert(new_node_inputs.end(),{beta1, weight_qkv, bias_attn_qkv}); + if(mask)new_node_inputs.push_back(input_mask); + new_node_inputs.insert(new_node_inputs.end(),{input_mask, weight_attn_o, bias_attn_o, gamma2, @@ -511,7 +517,7 @@ CNodePtr DecoderLayerFusion::CreateMaskedDecoderLayerFusionNode(const FuncGraphP weight_m, bias_m, weight_p, - bias_p}; + bias_p}); } auto new_node = func_graph->NewCNode(new_node_inputs); MS_CHECK_TRUE_RET(new_node != nullptr, nullptr); diff --git a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h index f8ebc223fa6..03cc90f2fbc 100644 --- a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h +++ b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h @@ -45,11 +45,11 @@ class DecoderLayerFusion : public MultiplePatternProcessPass { virtual bool Init() const; private: - VectorRef DefinePatternDecoderLayer(bool post_layernorm, bool layernorm_fusion, bool is_position_bias) const; + VectorRef DefinePatternDecoderLayer(bool post_layernorm, bool layernorm_fusion, bool is_position_bias, bool mask) const; VectorRef getTuple(bool post_layernorm, bool layernorm_fusion, bool is_position_bias) const; VectorRef DefineLayerNorm(VectorRef input, VarPtr gamma, VarPtr beta, VarPtr eps) const; CNodePtr CreateMaskedDecoderLayerFusionNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv, - const AnfNodePtr &node, bool post_layernorm) const; + const AnfNodePtr &node, bool post_layernorm, bool mask) const; std::shared_ptr CreatePrim(const FuncGraphPtr &func_graph, const EquivPtr &equiv, bool post_layernorm, int64_t ffn_hidden_size) const; lite::STATUS CheckPattern(const FuncGraphPtr &func_graph, const EquivPtr &equiv, int *head_num, int *head_size, @@ -64,6 +64,8 @@ class DecoderLayerFusion : public MultiplePatternProcessPass { const std::string kPatternDecoderLayerPost = "PatternDecoderLayerPost"; const std::string kPatternDecoderT5Pre = "PatternDecoderT5Pre"; const std::string kPatternDecoderT5Post = "PatternDecoderT5Post"; + const std::string kPatternDecoderLayerWhitoutMaskPre = "kPatternDecoderLayerWhitoutMaskPre"; + const std::string kPatternDecoderLayerWhitoutMaskPost = "kPatternDecoderLayerWhitoutMaskPost"; mutable VarPtr 
hidden_stats_{nullptr}; mutable VarPtr encoder_output_{nullptr}; mutable VarPtr position_bias_{nullptr}; diff --git a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc index 8aea7f7c043..f722513b7f8 100644 --- a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc @@ -139,24 +139,27 @@ VectorRef EncoderLayerFusion::DefineLayerNorm(bool is_position_bias, VectorRef i } VectorRef EncoderLayerFusion::DefinePatternEncoderLayer(bool post_layernorm = true, bool layernorm_fusion = false, - bool is_position_bias = false) const { - VectorRef attention, tuple, tuple2, tuple3, reshape2, matmul1; + bool is_position_bias = false, bool mask =true) const { + VectorRef tuple, tuple2, tuple3, reshape2, matmul1,inputs; auto is_reshape1 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-encoder"); MS_CHECK_TRUE_RET(is_reshape1 != nullptr, {}); auto var1 = std::make_shared("var1"); MS_CHECK_TRUE_RET(var1 != nullptr, {}); auto reshape1 = VectorRef({is_reshape1, input_, var1}); if (!is_position_bias) { - attention = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), + inputs = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), getTuple(post_layernorm, layernorm_fusion, is_position_bias), getTuple(post_layernorm, layernorm_fusion, is_position_bias), weight_attn_qkv_, - weight_attn_o_, bias_attn_qkv_, bias_attn_o_, mask_}); + weight_attn_o_, bias_attn_qkv_, bias_attn_o_}); } else { - attention = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), + inputs = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), getTuple(post_layernorm, layernorm_fusion, is_position_bias), getTuple(post_layernorm, layernorm_fusion, is_position_bias), weight_attn_qkv_, - weight_attn_o_, position_bias_, mask_}); + weight_attn_o_, position_bias_}); } + if(mask)inputs.push_back(mask_); + auto attention = VectorRef(inputs); + // return attention; if (!is_position_bias) { auto is_tuple = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_itme"); auto var_tuple = std::make_shared("var_tuple"); @@ -244,6 +247,10 @@ std::unordered_map EncoderLayerFusion::DefinePatterns() patterns[kPatternTEncoderLayerPreNorm] = DefinePatternEncoderLayer(false, true); patterns[kPatternEncoderLayerT5Pre] = DefinePatternEncoderLayer(false, false, true); patterns[kPatternEncoderLayerT5Post] = DefinePatternEncoderLayer(true, false, true); + patterns[kPatternTEncoderLayerWhitoutMaskPre] = DefinePatternEncoderLayer(false, false, false, false); + patterns[kPatternTEncoderLayerWhitoutMaskPost] = DefinePatternEncoderLayer(true, false, false, false); + patterns[kPatternTEncoderLayerWhitoutMaskPostNorm] = DefinePatternEncoderLayer(true, true, false, false); + patterns[kPatternTEncoderLayerWhitoutMaskPreNorm] = DefinePatternEncoderLayer(false, true, false, false); return patterns; } @@ -252,15 +259,24 @@ AnfNodePtr EncoderLayerFusion::Process(const std::string &pattern_name, const mi if (func_graph == nullptr || node == nullptr || equiv == nullptr) { return nullptr; } + std::cout << "found pattern=" << pattern_name << std::endl; if (pattern_name == kPatternTEncoderLayerPostNorm || pattern_name == kPatternTEncoderLayerPreNorm) is_layernorm_fusion_ = true; + if (pattern_name == kPatternTEncoderLayerWhitoutMaskPostNorm || pattern_name 
== kPatternTEncoderLayerWhitoutMaskPreNorm) + is_layernorm_fusion_ = true; if (pattern_name == kPatternEncoderLayerT5Pre || pattern_name == kPatternEncoderLayerT5Post) is_position_bias_ = true; if (pattern_name == kPatternTEncoderLayerPost || pattern_name == kPatternTEncoderLayerPostNorm || pattern_name == kPatternEncoderLayerT5Post) - return CreateMaskedEncoderLayerFusionNode(func_graph, equiv, node, true); + return CreateMaskedEncoderLayerFusionNode(func_graph, equiv, node,true, true); + if (pattern_name == kPatternTEncoderLayerWhitoutMaskPostNorm || pattern_name == kPatternTEncoderLayerWhitoutMaskPost) + { + return CreateMaskedEncoderLayerFusionNode(func_graph, equiv, node, true, false); + } + if( pattern_name == kPatternTEncoderLayerWhitoutMaskPre || pattern_name == kPatternTEncoderLayerWhitoutMaskPreNorm) + return CreateMaskedEncoderLayerFusionNode(func_graph, equiv, node, false, false); else if (pattern_name == kPatternTEncoderLayerPre || pattern_name == kPatternTEncoderLayerPreNorm || pattern_name == kPatternEncoderLayerT5Pre) - return CreateMaskedEncoderLayerFusionNode(func_graph, equiv, node, false); + return CreateMaskedEncoderLayerFusionNode(func_graph, equiv, node,false, true); return nullptr; } @@ -392,14 +408,13 @@ std::shared_ptr EncoderLayerFusion::CreatePrim(const FuncGrap if (CheckPattern(func_graph, equiv, &head_num, &head_size, &eps1, &eps2, &scalar)) { return nullptr; } - std::cout<<"act fusion"<Init(head_num, head_size, eps1, eps2, ffn_hidden_size, is_position_bias_, post_layernorm, scalar ,act_); return encoder_layer_prim; } CNodePtr EncoderLayerFusion::CreateMaskedEncoderLayerFusionNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv, const AnfNodePtr &node, - bool post_layernorm = true) const { + bool post_layernorm, bool mask) const { MS_ASSERT(func_graph != nullptr); MS_ASSERT(equiv != nullptr); MS_ASSERT(node != nullptr); @@ -419,9 +434,7 @@ CNodePtr EncoderLayerFusion::CreateMaskedEncoderLayerFusionNode(const FuncGraphP } auto gamma1 = utils::cast((*equiv)[gamma1_]); auto gamma2 = utils::cast((*equiv)[gamma2_]); - if (mask_) { - input_mask = utils::cast((*equiv)[mask_]); - } + input_mask = mask ? 
utils::cast((*equiv)[mask_]) : nullptr; auto base_shape_ptr = weight_m->Shape(); MS_EXCEPTION_IF_NULL(base_shape_ptr); auto input_shape_ptr = base_shape_ptr->cast(); @@ -435,19 +448,23 @@ CNodePtr EncoderLayerFusion::CreateMaskedEncoderLayerFusionNode(const FuncGraphP MS_CHECK_TRUE_RET(encoder_layer_prim_c != nullptr, nullptr); auto value_node = NewValueNode(encoder_layer_prim_c); MS_CHECK_TRUE_RET(value_node != nullptr, nullptr); - std::vector new_node_inputs; + std::vector new_node_inputs = {value_node, input}; if (is_position_bias_) { position_bias = utils::cast((*equiv)[position_bias_]); - new_node_inputs = {value_node, input, gamma1, weight_qkv, input_mask, - position_bias, weight_attn_o, gamma2, weight_m, weight_p}; + new_node_inputs.insert(new_node_inputs.end(),{gamma1, weight_qkv}); + if(mask)new_node_inputs.push_back(input_mask); + new_node_inputs.insert(new_node_inputs.end(),{position_bias, weight_attn_o, gamma2, weight_m, weight_p}); } else { if (!post_layernorm) { - new_node_inputs = {value_node, input, gamma1, beta1, weight_qkv, bias_attn_qkv, input_mask, weight_attn_o, - bias_attn_o, gamma2, beta2, weight_m, bias_m, weight_p, bias_p}; + new_node_inputs.insert(new_node_inputs.end(),{gamma1, beta1,weight_qkv, bias_attn_qkv}); + if(mask)new_node_inputs.push_back(input_mask); + new_node_inputs.insert(new_node_inputs.end(),{weight_attn_o, + bias_attn_o, gamma2, beta2, weight_m, bias_m,weight_p, bias_p}); } else { - new_node_inputs = {value_node, input, weight_qkv, bias_attn_qkv, input_mask, - weight_attn_o, bias_attn_o, gamma1, beta1, weight_m, - bias_m, weight_p, bias_p, gamma2, beta2}; + new_node_inputs.insert(new_node_inputs.end(),{weight_qkv, bias_attn_qkv}); + if(mask)new_node_inputs.push_back(input_mask); + new_node_inputs.insert(new_node_inputs.end(),{weight_attn_o, bias_attn_o, gamma1,beta1, weight_m, + bias_m, weight_p, bias_p, gamma2, beta2}); } } auto new_node = func_graph->NewCNode(new_node_inputs); diff --git a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h index 81f9074e555..4cdd4d65640 100644 --- a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h +++ b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h @@ -51,11 +51,16 @@ class EncoderLayerFusion : public MultiplePatternProcessPass { const std::string kPatternTEncoderLayerPreNorm = "PatternTEncoderLayerPreNorm"; const std::string kPatternEncoderLayerT5Pre = "PatternEncoderLayerT5Pre"; const std::string kPatternEncoderLayerT5Post = "PatternEncoderLayerT5Post"; - VectorRef DefinePatternEncoderLayer(bool post_layernorm, bool layernorm_fusion, bool is_position_bias_) const; + const std::string kPatternTEncoderLayerWhitoutMaskPre = "kPatternTEncoderLayerWhitoutMaskPre"; + const std::string kPatternTEncoderLayerWhitoutMaskPost = "kPatternTEncoderLayerWhitoutMaskPost"; + const std::string kPatternTEncoderLayerWhitoutMaskPostNorm = "kPatternTEncoderLayerWhitoutMaskPostNorm"; + const std::string kPatternTEncoderLayerWhitoutMaskPreNorm = "kPatternTEncoderLayerWhitoutMaskPreNorm"; + + VectorRef DefinePatternEncoderLayer(bool post_layernorm, bool layernorm_fusion, bool is_position_bias_, bool mask) const; VectorRef getTuple(bool post_layernorm, bool layernorm_fusion, bool is_position_bias) const; VectorRef DefineLayerNorm(bool is_position_bias, VectorRef input, VarPtr gamma, VarPtr beta, VarPtr eps) const; CNodePtr CreateMaskedEncoderLayerFusionNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv, - const AnfNodePtr &node, 
bool post_layernorm) const; + const AnfNodePtr &node, bool post_layernorm = true, bool mask = true) const; AnfNodePtr GetAttribute(const FuncGraphPtr &func_graph, const EquivPtr &equiv, VarPtr node_name) const; bool IsActGELU(const FuncGraphPtr &func_graph, const EquivPtr &equiv, const VarPtr &input_prim) const; lite::STATUS GetEps(const EquivPtr &equiv, VarPtr node_name, float *eps) const; diff --git a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc index 638bb854637..565b6102d8d 100644 --- a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc @@ -385,7 +385,7 @@ VectorRef MultiHeadAttentionFusion::DefineMPWithMaskPatternT5New(bool transpose, return matmul3; } -VectorRef MultiHeadAttentionFusion::DefineMPWithMaskPatternPA() const { +VectorRef MultiHeadAttentionFusion::DefineMPWithMaskPatternPA(bool mask) const { VectorRef k_embedding, v_embedding; auto q_transpose = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTranspose)); MS_CHECK_TRUE_RET(q_transpose != nullptr, {}); @@ -398,14 +398,21 @@ VectorRef MultiHeadAttentionFusion::DefineMPWithMaskPatternPA() const { auto is_matmul1 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimMatMulFusion)); MS_CHECK_TRUE_RET(is_matmul1 != nullptr, {}); auto matmul1 = VectorRef({is_matmul1, q_embedding, k_embedding}); - auto is_add = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion)); - MS_CHECK_TRUE_RET(is_add != nullptr, {}); - auto mask = DefineMask(mask_); - MS_CHECK_TRUE_RET(!mask.empty(), {}); - auto add = VectorRef({is_add, mask, matmul1}); - auto is_softmax = std::make_shared(std::bind(IsOpType, p1, prim::kPrimSoftmax)); - MS_CHECK_TRUE_RET(is_softmax != nullptr, {}); - auto softmax = VectorRef({is_softmax, add}); + VectorRef softmax; + if (mask) { + auto is_add = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion)); + MS_CHECK_TRUE_RET(is_add != nullptr, {}); + auto mask = DefineMask(mask_); + MS_CHECK_TRUE_RET(!mask.empty(), {}); + auto add = VectorRef({is_add, mask, matmul1}); + auto is_softmax = std::make_shared(std::bind(IsOpType, p1, prim::kPrimSoftmax)); + MS_CHECK_TRUE_RET(is_softmax != nullptr, {}); + softmax = VectorRef({is_softmax, add}); + } else { + auto is_softmax = std::make_shared(std::bind(IsOpType, p1, prim::kPrimSoftmax)); + MS_CHECK_TRUE_RET(is_softmax != nullptr, {}); + softmax = VectorRef({is_softmax, matmul1}); + } auto is_matmul2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimMatMulFusion)); MS_CHECK_TRUE_RET(is_matmul2 != nullptr, {}); auto matmul2 = VectorRef({is_matmul2, softmax, v_embedding}); @@ -573,6 +580,7 @@ std::unordered_map MultiHeadAttentionFusion::DefinePatte patterns[kMPAWithMaskPatternName] = DefineMPWithMaskPattern(); patterns[kMPAPatternName] = DefineMPWithMaskPattern(false); patterns[kMPAWithMaskPatternNamePA] = DefineMPWithMaskPatternPA(); + patterns[kMPAPatternNamePA] = DefineMPWithMaskPatternPA(false); patterns[kMPAWithMaskPatternNameT5] = DefineMPWithMaskPatternT5(); patterns[kMPAWithMaskPatternNameT5New] = DefineMPWithMaskPatternT5New(false); patterns[kMPAWithMaskPatternNameT5New2] = DefineMPWithMaskPatternT5New(true, true); @@ -625,7 +633,7 @@ AnfNodePtr MultiHeadAttentionFusion::Process(const std::string &pattern_name, co } return CreateMaskedMultiHeadAttentionNode(func_graph, equiv, node->fullname_with_scope(), true); } - if (pattern_name == kMPAPatternName || pattern_name == 
kMPAPatternNameSwin1 || pattern_name == kMPAPatternNameSwin2) + if (pattern_name == kMPAPatternName || pattern_name == kMPAPatternNameSwin1 || pattern_name == kMPAPatternNameSwin2 || pattern_name == kMPAPatternNamePA) return CreateMaskedMultiHeadAttentionNode(func_graph, equiv, node->fullname_with_scope(), false); return nullptr; } @@ -822,6 +830,7 @@ std::vector MultiHeadAttentionFusion::GetNewNodeInputs(const EquivPt auto input_q = utils::cast((*equiv)[input_q_]); auto input_k = utils::cast((*equiv)[input_k_]); auto input_v = utils::cast((*equiv)[input_v_]); + std::cout<<"mask: "<((*equiv)[mask_]) : nullptr; AnfNodePtr position_bias = (t5_x_) ? utils::cast((*equiv)[position_bias_]) : nullptr; diff --git a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.h b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.h index ffa0376b5a7..5500da9f1f4 100644 --- a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.h +++ b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.h @@ -48,7 +48,7 @@ class MultiHeadAttentionFusion : public MultiplePatternProcessPass { private: // define patterns VectorRef DefineMPWithMaskPattern(bool mask = true) const; - VectorRef DefineMPWithMaskPatternPA() const; + VectorRef DefineMPWithMaskPatternPA(bool mask = true) const; VectorRef DefineMPWithMaskPatternT5() const; VectorRef DefineMPWithMaskPatternT5New(bool transpose = true, bool no_div_flag = false) const; VectorRef DefineMPPatternSwin(bool flag = true) const; @@ -91,7 +91,7 @@ class MultiHeadAttentionFusion : public MultiplePatternProcessPass { const std::string kMPAWithMaskTransposePatternNameT5New = "MPAWithMaskTransposePatternT5New"; const std::string kMPAPatternNameSwin1 = "MPAPatternNameSwin1"; const std::string kMPAPatternNameSwin2 = "MPAPatternNameSwin2"; - + const std::string kMPAPatternNamePA = "kMPAPatternNamePA"; mutable VarPtr input_q_{nullptr}; mutable VarPtr input_k_{nullptr}; mutable VarPtr input_v_{nullptr}; diff --git a/trc/transformer/cfg_bert.config b/trc/transformer/cfg_bert.config index cc543ad3d77..0185dddfb26 100755 --- a/trc/transformer/cfg_bert.config +++ b/trc/transformer/cfg_bert.config @@ -1,2 +1,2 @@ [gpu_context] -input_shape=input_ids:[transformer_decoder_layer,128];token_type_ids:[transformer_decoder_layer,128];input_mask:[transformer_decoder_layer,128] +input_shape=input_ids:[vit,128];token_type_ids:[vit,128];input_mask:[vit,128] diff --git a/trc/transformer/ftBench.py b/trc/transformer/ftBench.py index 5037376a671..c8096af4da2 100755 --- a/trc/transformer/ftBench.py +++ b/trc/transformer/ftBench.py @@ -73,6 +73,7 @@ def find_output_name(ms_model, output_file): with open(output_file, 'w') as file: file.write(data) print(outpus_name) +numcount=3 for line_model_arg in models_arg: if line_model_arg[0] == '#' or line_model_arg == '\n': continue line_model_arg=line_model_arg[:-1] @@ -131,6 +132,10 @@ for line_model_arg in models_arg: os.system(f"rsync -v {base}/trc/transformer/*{model_name}* {server}:{base}/trc/transformer/") os.system(f"./deploy.sh convv_{model_name}_fwd.mindir") # os.system(f"ssh {server} 'cd {benchmark} && CUDA_VISIBLE_DEVICES={cuda_visible_dev} LD_LIBRARY_PATH={system}/runtime/lib:{system}/tools/converter/lib ./benchmark {benchmark_args}'" ) + os.system(f"mkdir {base}/trc/transformer/{model_name}{numcount}") + os.system(f"cp {base}/trc/transformer/{model_name}* {base}/trc/transformer/{model_name}{numcount}/") + numcount+=1 + elif app=='trc': #if loop count =1 app=be else app = runtime diff --git 
a/trc/transformer/models.txt b/trc/transformer/models.txt index 9fc98ac07d6..a9679d203df 100755 --- a/trc/transformer/models.txt +++ b/trc/transformer/models.txt @@ -12,15 +12,15 @@ #run the following tests before push --b 1 -l 66 -s 128 -H 12 -S 768 -p 0 -m mha_x1 --b 1 -l 66 -s 128 -t 256 -H 12 -S 768 -p 0 -m mha_cross --b 1 -l 66 -s 20 -t 20 -H 3 -S 15 -p 0 -m mha_cross --b 1 -l 66 -s 20 -H 4 -S 768 -p 0 -m mha_T5 --b 1 -l 66 -s 20 -t 40 -H 4 -S 768 -p 0 -m mha_T5_cross - --b 1 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer --b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer --b 8 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer +#-b 1 -l 66 -s 128 -H 12 -S 768 -p 0 -m mha_x1 +#-b 1 -l 66 -s 128 -t 256 -H 12 -S 768 -p 0 -m mha_cross +#-b 1 -l 66 -s 20 -t 20 -H 3 -S 15 -p 0 -m mha_cross +#-b 1 -l 66 -s 20 -H 4 -S 768 -p 0 -m mha_T5 +#-b 1 -l 66 -s 20 -t 40 -H 4 -S 768 -p 0 -m mha_T5_cross +# +#-b 1 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer +#-b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer +#-b 8 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer #-b 32 -l 12 -H 12 -S 768 -s 128 -P 0 -f 3072 -m bert #-b 1 -l 12 -H 12 -S 768 -s 128 -m T5 @@ -55,7 +55,7 @@ #-b 3 -l 66 -s 20 -H 3 -S 15 -p -m mha_x2 #-b 3 -l 66 -s 20 -t 40 -H 3 -S 15 -p 0 -m mha_x1 #-b 1 -l 66 -s 128 -H 4 -S 1024 -p 0 -m mha_x1 --b 1 -l 2 -s 12 -t 12 -H 2 -S 4 -m T5 +-b 1 -l 2 -s 12 -t 12 -H 2 -S 4 -m transformer #-b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -m transformer_encoder_layer #-b 1 -l 12 -H 4 -S 512 -s 128 -f 3072 -P 1 -m transformer_encoder_layer @@ -146,3 +146,11 @@ #-b 64 -l 12 -H 12 -S 768 -s 128 -m bert #-b 64 -l 24 -H 16 -S 1024 -s 128 -m bert #-b 64 -l 24 -H 16 -S 1024 -s 512 -m bert + + +# T5 tests +#-b 1 -l 6 -s 128 -t 128 -H 8 -S 512 -f 2048 -m T5 +#-b 1 -l 6 -s 512 -t 512 -H 8 -S 512 -f 2048 -m T5 +# +#-b 1 -l 6 -s 512 -t 512 -H 12 -S 768 -f 3072 -m T5 +#-b 1 -l 6 -s 128 -t 128 -H 12 -S 768 -f 3072 -m T5 diff --git a/trc/transformer/train_transformer_export.py b/trc/transformer/train_transformer_export.py index 6f7d8b6f900..551594ef619 100755 --- a/trc/transformer/train_transformer_export.py +++ b/trc/transformer/train_transformer_export.py @@ -919,6 +919,20 @@ def T5_create(): name = "T5" str=" " os.system(f"python {base}/../transformer_repo/pretrain_{name}.py {str.join(sys.argv[1:-4])} " ) +def vit_create(): + repo = git.Repo('.', search_parent_directories=True) + base = repo.working_tree_dir + name = "vit" + str=" " + os.system(f"python {base}/../transformer_repo/pretrain_{name}.py {str.join(sys.argv[1:-4])} " ) +def transformer_create(): + repo = git.Repo('.', search_parent_directories=True) + base = repo.working_tree_dir + name = "transformer" + str=" " + os.system(f"python {base}/../transformer_repo/pretrain_{name}.py {str.join(sys.argv[1:-4])} " ) + + def mha_T5_create(): # M.context.set_auto_parallel_context(parallel_mode=M.ParallelMode.SEMI_AUTO_PARALLEL) M.context.set_context(mode=M.context.PYNATIVE_MODE) -- Gitee From e5fcb87493f1d684e4e235d79b1091d1db8f9233 Mon Sep 17 00:00:00 2001 From: batya kroizer Date: Mon, 16 Jan 2023 10:55:14 +0200 Subject: [PATCH 23/39] check fusion decoder --- .../optimizer/fusion/decoder_layer_fusion.cc | 35 ++++++++++++------- .../optimizer/fusion/decoder_layer_fusion.h | 2 ++ trc/transformer/cfg_bert.config | 2 +- trc/transformer/models.txt | 2 +- 4 files changed, 26 insertions(+), 15 deletions(-) diff --git 
a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc index 84252f52359..56331b46609 100644 --- a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc @@ -152,14 +152,14 @@ VectorRef DecoderLayerFusion::DefinePatternDecoderLayer(bool post_layernorm = tr auto var1 = std::make_shared("var1-reshape"); MS_CHECK_TRUE_RET(var1 != nullptr, {}); auto reshape1 = VectorRef({is_reshape1, hidden_stats_, var1}); - VectorRef inputs, attention_cross, tuple2, tuple3, matmul2, tuple4, tuple5; + VectorRef inputs, input_cross, tuple2, tuple3, matmul2, tuple4, tuple5; if (is_position_bias) { - auto inputs = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), + inputs = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), getTuple(post_layernorm, layernorm_fusion, is_position_bias), getTuple(post_layernorm, layernorm_fusion, is_position_bias), weight_attn_qkv_, weight_attn_o_, position_bias_}); } else { - auto inputs = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), + inputs = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), getTuple(post_layernorm, layernorm_fusion, is_position_bias), getTuple(post_layernorm, layernorm_fusion, is_position_bias), weight_attn_qkv_, weight_attn_o_, bias_attn_qkv_, bias_attn_o_}); @@ -191,12 +191,14 @@ VectorRef DecoderLayerFusion::DefinePatternDecoderLayer(bool post_layernorm = tr MS_CHECK_TRUE_RET(var2 != nullptr, {}); auto reshape2 = VectorRef({is_reshape2, encoder_output_, var2}); if (is_position_bias) { - attention_cross = VectorRef({is_attention_cross_, tuple2, reshape2, reshape2, weight_attn_q_, weight_attn_kv_, - weight_attn_cross_o_, position_bias_cross_, cross_mask_}); + input_cross = VectorRef({is_attention_cross_, tuple2, reshape2, reshape2, weight_attn_q_, weight_attn_kv_, + weight_attn_cross_o_, position_bias_cross_}); } else { - attention_cross = VectorRef({is_attention_cross_, tuple2, reshape2, reshape2, weight_attn_q_, weight_attn_kv_, - weight_attn_cross_o_, bias_attn_cross_qkv_, bias_attn_cross_o_, cross_mask_}); + input_cross = VectorRef({is_attention_cross_, tuple2, reshape2, reshape2, weight_attn_q_, weight_attn_kv_, + weight_attn_cross_o_, bias_attn_cross_qkv_, bias_attn_cross_o_}); } + if(mask)inputs.push_back(cross_mask_); + auto attention_cross = VectorRef(inputs); if (is_position_bias) { tuple5 = attention_cross; } else { @@ -250,6 +252,8 @@ std::unordered_map DecoderLayerFusion::DefinePatterns() } patterns[kPatternDecoderLayerPre] = DefinePatternDecoderLayer(false, true, false); patterns[kPatternDecoderLayerPost] = DefinePatternDecoderLayer(true, true, false); + patterns[kPatternDecoderLayerNormPre] = DefinePatternDecoderLayer(false, false, false); + patterns[kPatternDecoderLayerNormPost] = DefinePatternDecoderLayer(true, false, false); patterns[kPatternDecoderT5Pre] = DefinePatternDecoderLayer(false, false, true); patterns[kPatternDecoderT5Post] = DefinePatternDecoderLayer(true, false, true); patterns[kPatternDecoderLayerWhitoutMaskPre] = DefinePatternDecoderLayer(false, true, false, false); @@ -262,15 +266,18 @@ AnfNodePtr DecoderLayerFusion::Process(const std::string &pattern_name, const mi if (func_graph == nullptr || node == nullptr || equiv == nullptr) { return nullptr; } + std::cout << "found pattern=" << pattern_name << std::endl; if (pattern_name 
== kPatternDecoderT5Pre || pattern_name == kPatternDecoderT5Post) { is_position_bias_ = true; } else if (pattern_name == kPatternDecoderLayerPre || pattern_name == kPatternDecoderLayerPost || pattern_name == kPatternDecoderLayerWhitoutMaskPre|| pattern_name == kPatternDecoderLayerWhitoutMaskPost) { is_layernorm_fusion_ = true; } - if (pattern_name == kPatternDecoderLayerPre || pattern_name == kPatternDecoderT5Pre) { + if (pattern_name == kPatternDecoderLayerPre || pattern_name == kPatternDecoderT5Pre || + pattern_name == kPatternDecoderLayerNormPre) { return CreateMaskedDecoderLayerFusionNode(func_graph, equiv, node, false, true); - } else if (pattern_name == kPatternDecoderLayerPost || pattern_name == kPatternDecoderT5Post) { + } else if (pattern_name == kPatternDecoderLayerPost || pattern_name == kPatternDecoderT5Post || + pattern_name == kPatternDecoderLayerNormPost) { return CreateMaskedDecoderLayerFusionNode(func_graph, equiv, node, true, true); } if (pattern_name == kPatternDecoderLayerWhitoutMaskPre) @@ -495,7 +502,9 @@ CNodePtr DecoderLayerFusion::CreateMaskedDecoderLayerFusionNode(const FuncGraphP new_node_inputs.insert(new_node_inputs.end(),{weight_qkv, input_mask}); if(mask)new_node_inputs.push_back(input_mask); new_node_inputs.insert(new_node_inputs.end(),{position_bias, weight_attn_o, - gamma2, encoder_output, weight_attn_q, weight_attn_kv, cross_mask, position_bias_cross, weight_attn_cross_o, + gamma2, encoder_output, weight_attn_q, weight_attn_kv}); + if(mask)new_node_inputs.push_back(cross_mask); + new_node_inputs.insert(new_node_inputs.end(),{position_bias_cross, weight_attn_cross_o, gamma3, weight_m, weight_p}); } else { new_node_inputs.insert(new_node_inputs.end(),{beta1, weight_qkv, bias_attn_qkv}); @@ -508,9 +517,9 @@ CNodePtr DecoderLayerFusion::CreateMaskedDecoderLayerFusionNode(const FuncGraphP encoder_output, weight_attn_q, weight_attn_kv, - bias_attn_cross_qkv, - cross_mask, - weight_attn_cross_o, + bias_attn_cross_qkv}); + if(mask)new_node_inputs.push_back(cross_mask); + new_node_inputs.insert(new_node_inputs.end(),{weight_attn_cross_o, bias_attn_cross_o, gamma3, beta3, diff --git a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h index 03cc90f2fbc..97669b68075 100644 --- a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h +++ b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h @@ -62,6 +62,8 @@ class DecoderLayerFusion : public MultiplePatternProcessPass { protected: const std::string kPatternDecoderLayerPre = "PatternDecoderLayerPre"; const std::string kPatternDecoderLayerPost = "PatternDecoderLayerPost"; + const std::string kPatternDecoderLayerNormPre = "kPatternDecoderLayerNormPre"; + const std::string kPatternDecoderLayerNormPost = "kPatternDecoderLayerNormPost"; const std::string kPatternDecoderT5Pre = "PatternDecoderT5Pre"; const std::string kPatternDecoderT5Post = "PatternDecoderT5Post"; const std::string kPatternDecoderLayerWhitoutMaskPre = "kPatternDecoderLayerWhitoutMaskPre"; diff --git a/trc/transformer/cfg_bert.config b/trc/transformer/cfg_bert.config index 0185dddfb26..cc543ad3d77 100755 --- a/trc/transformer/cfg_bert.config +++ b/trc/transformer/cfg_bert.config @@ -1,2 +1,2 @@ [gpu_context] -input_shape=input_ids:[vit,128];token_type_ids:[vit,128];input_mask:[vit,128] +input_shape=input_ids:[transformer_decoder_layer,128];token_type_ids:[transformer_decoder_layer,128];input_mask:[transformer_decoder_layer,128] diff --git a/trc/transformer/models.txt 
b/trc/transformer/models.txt index a9679d203df..de4ccb47930 100755 --- a/trc/transformer/models.txt +++ b/trc/transformer/models.txt @@ -55,7 +55,7 @@ #-b 3 -l 66 -s 20 -H 3 -S 15 -p -m mha_x2 #-b 3 -l 66 -s 20 -t 40 -H 3 -S 15 -p 0 -m mha_x1 #-b 1 -l 66 -s 128 -H 4 -S 1024 -p 0 -m mha_x1 --b 1 -l 2 -s 12 -t 12 -H 2 -S 4 -m transformer +#-b 1 -l 2 -s 12 -t 12 -H 2 -S 4 -m transformer #-b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -m transformer_encoder_layer #-b 1 -l 12 -H 4 -S 512 -s 128 -f 3072 -P 1 -m transformer_encoder_layer -- Gitee From 65d7f6632e2d95775df13c428e832f5b74854bee Mon Sep 17 00:00:00 2001 From: batya kroizer Date: Mon, 16 Jan 2023 14:26:12 +0200 Subject: [PATCH 24/39] for merge --- .../delegate/tensorrt/op/decoder_tensorrt.h | 2 +- .../delegate/tensorrt/op/encoder_tensorrt.cc | 2 + .../delegate/tensorrt/op/encoder_tensorrt.h | 2 +- .../optimizer/fusion/decoder_layer_fusion.cc | 21 ++++---- .../fusion/multi_head_attention_fusion.cc | 1 - trc/transformer/cfg_bert.config | 2 +- trc/transformer/convert_fp32.sh | 33 ++++++++++-- trc/transformer/deploy.sh | 8 +-- trc/transformer/models.txt | 13 +++-- trc/transformer/t.config | 2 +- trc/transformer/train_transformer_export.py | 54 +++++++++---------- 11 files changed, 87 insertions(+), 53 deletions(-) diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h index d79dd6f61c0..2bf6cc645fd 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h @@ -42,7 +42,7 @@ class DecoderTensorRT : public TensorRTOp { private: nvinfer1::ITensor *castTensor(TensorRTContext *ctx, const TensorInfo &ms_tensor, const std::string &op_name); - bool is_ffn_fp16_ = false; + bool is_ffn_fp16_ = true; }; constexpr auto DECODER_PLUGIN_NAME{"DecoderPlugin"}; diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc index 233b25d8ea7..2f03dce282a 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc @@ -228,6 +228,8 @@ void EncoderPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, num_of_inputs_ = nbInputs; num_of_outputs_ = nbOutputs; if(num_of_inputs_ == C13NUM) params_.attn.mask = false; + std::cout<<"params_.attn.mask"< DecoderLayerFusion::DefinePatterns() MS_LOG(ERROR) << "initial member failed."; return patterns; } - patterns[kPatternDecoderLayerPre] = DefinePatternDecoderLayer(false, true, false); - patterns[kPatternDecoderLayerPost] = DefinePatternDecoderLayer(true, true, false); - patterns[kPatternDecoderLayerNormPre] = DefinePatternDecoderLayer(false, false, false); - patterns[kPatternDecoderLayerNormPost] = DefinePatternDecoderLayer(true, false, false); - patterns[kPatternDecoderT5Pre] = DefinePatternDecoderLayer(false, false, true); - patterns[kPatternDecoderT5Post] = DefinePatternDecoderLayer(true, false, true); + patterns[kPatternDecoderLayerPre] = DefinePatternDecoderLayer(false, true, false, true); + patterns[kPatternDecoderLayerPost] = DefinePatternDecoderLayer(true, true, false, true); + patterns[kPatternDecoderLayerNormPre] = DefinePatternDecoderLayer(false, false, false, true); + patterns[kPatternDecoderLayerNormPost] = DefinePatternDecoderLayer(true, false, false, true); + 
patterns[kPatternDecoderT5Pre] = DefinePatternDecoderLayer(false, false, true, true); + patterns[kPatternDecoderT5Post] = DefinePatternDecoderLayer(true, false, true, true); patterns[kPatternDecoderLayerWhitoutMaskPre] = DefinePatternDecoderLayer(false, true, false, false); patterns[kPatternDecoderLayerWhitoutMaskPost] = DefinePatternDecoderLayer(true, true, false, false); return patterns; @@ -499,7 +500,7 @@ CNodePtr DecoderLayerFusion::CreateMaskedDecoderLayerFusionNode(const FuncGraphP MS_CHECK_TRUE_RET(value_node != nullptr, nullptr); std::vector new_node_inputs = {value_node, input, gamma1}; if (is_position_bias_) { - new_node_inputs.insert(new_node_inputs.end(),{weight_qkv, input_mask}); + new_node_inputs.insert(new_node_inputs.end(),{weight_qkv}); if(mask)new_node_inputs.push_back(input_mask); new_node_inputs.insert(new_node_inputs.end(),{position_bias, weight_attn_o, gamma2, encoder_output, weight_attn_q, weight_attn_kv}); @@ -509,7 +510,7 @@ CNodePtr DecoderLayerFusion::CreateMaskedDecoderLayerFusionNode(const FuncGraphP } else { new_node_inputs.insert(new_node_inputs.end(),{beta1, weight_qkv, bias_attn_qkv}); if(mask)new_node_inputs.push_back(input_mask); - new_node_inputs.insert(new_node_inputs.end(),{input_mask, + new_node_inputs.insert(new_node_inputs.end(),{ weight_attn_o, bias_attn_o, gamma2, diff --git a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc index 565b6102d8d..3d532a3523d 100644 --- a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc @@ -830,7 +830,6 @@ std::vector MultiHeadAttentionFusion::GetNewNodeInputs(const EquivPt auto input_q = utils::cast((*equiv)[input_q_]); auto input_k = utils::cast((*equiv)[input_k_]); auto input_v = utils::cast((*equiv)[input_v_]); - std::cout<<"mask: "<((*equiv)[mask_]) : nullptr; AnfNodePtr position_bias = (t5_x_) ? utils::cast((*equiv)[position_bias_]) : nullptr; diff --git a/trc/transformer/cfg_bert.config b/trc/transformer/cfg_bert.config index cc543ad3d77..b496e5915bc 100755 --- a/trc/transformer/cfg_bert.config +++ b/trc/transformer/cfg_bert.config @@ -1,2 +1,2 @@ [gpu_context] -input_shape=input_ids:[transformer_decoder_layer,128];token_type_ids:[transformer_decoder_layer,128];input_mask:[transformer_decoder_layer,128] +input_shape=input_ids:[mha_T5_cross,128];token_type_ids:[mha_T5_cross,128];input_mask:[mha_T5_cross,128] diff --git a/trc/transformer/convert_fp32.sh b/trc/transformer/convert_fp32.sh index 81e9f5d98cb..0b988990762 100755 --- a/trc/transformer/convert_fp32.sh +++ b/trc/transformer/convert_fp32.sh @@ -1,9 +1,34 @@ +#!/bin/bash + base=`git rev-parse --show-toplevel` version=$(cat ${base}/version.txt) +fusion=true +while getopts "n" opt ; do + echo "fusion" + case "${opt}" in + n) + fusion=false ;; + *) + echo "Unknown option ${opt}!" ;; + esac + done +# case $1 in +# n) +# fusion=false ;shift;; +# *) +# echo "Unknown option ${opt}!" 
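# A usage sketch for the -n flag handled by the getopts block above (illustrative only;
# the script name and flag come from this patch, the model path is a placeholder):
#   ./convert_fp32.sh my_model.mindir        # default: converts with --optimizeTransformer=true
#   ./convert_fp32.sh -n my_model.mindir     # converts with transformer fusion disabled
# Note the flag must precede the positional model argument, since getopts stops at the first non-option.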
;; +# esac +# if [ "${fusion}" = "true" ]; then +# optimize="--optimizeTransformer=true" +# fi +shift $(($OPTIND - 1)) file_name=$(basename $1) file_name="${file_name%.*}" -#dbg="gdb --args " - +echo "${file_name%.*}" +# dbg="gdb --args " +if [ "${fusion}" = "true" ]; then + optimize="--optimizeTransformer=true" +fi #GLOG_v=0 \ lib_base=${base}/trc/system_test/release/ubuntu_x86/mindspore-lite-${version}-linux-x64 LD_LIBRARY_PATH=${lib_base}/tools/converter/lib:${lib_base}/tools/converter/third_party/glog/lib \ @@ -14,7 +39,7 @@ ${dbg} ${base}/trc/system_test/release/ubuntu_x86/mindspore-lite-${version}-linu --outputFile=${base}/trc/transformer/convv_${file_name} \ --configFile=${base}/trc/transformer/t.config \ --encryption=false \ - --optimizeTransformer=true \ + ${optimize} \ --exportMindIR=MINDIR if [ "${dbg}" = "" ] @@ -27,5 +52,5 @@ ${base}/trc/system_test/release/ubuntu_x86/mindspore-lite-${version}-linux-x64/t --outputFile=${base}/trc/transformer/convv_${file_name} \ --configFile=${base}/trc/transformer/t.config \ --encryption=false \ - --optimizeTransformer=true + ${optimize} fi diff --git a/trc/transformer/deploy.sh b/trc/transformer/deploy.sh index 728b521ac2d..2dcf4c5c5fd 100755 --- a/trc/transformer/deploy.sh +++ b/trc/transformer/deploy.sh @@ -35,7 +35,7 @@ rsync -v ${benchmark} ${server}:${benchmark} rsync -vl ${system}/runtime/lib/* ${server}:${system}/runtime/lib/ rsync -vl ${system}/tools/converter/lib/* ${server}:${system}/tools/converter/lib/ echo -e "[gpu_context]\ninput_shape=input_ids:[${batch_size},128];token_type_ids:[${batch_size},128];input_mask:[${batch_size},128]" > ./cfg_bert.config -rsync -v cfg_${model_name}.config ${server}:$(realpath "cfg_${model_name}.config") +# rsync -v cfg_${model_name}.config ${server}:$(realpath "cfg_${model_name}.config") # this should be more general ! # output_files=$(find . 
-maxdepth 1 -name ${model}_compress_output"*.txt*" | sort -n) @@ -55,9 +55,9 @@ then command+=" --benchmarkDataFile=\"${output_files}\" " fi -if [ -f cfg_${model_name}.config ]; then - command+="--configFile=cfg_${model_name}.config " -fi +# if [ -f cfg_${model_name}.config ]; then +# command+="--configFile=cfg_${model_name}.config " +# fi command+="--device=GPU " #command+="--enableFp16=true" echo command=${command} diff --git a/trc/transformer/models.txt b/trc/transformer/models.txt index de4ccb47930..688e44cc80f 100755 --- a/trc/transformer/models.txt +++ b/trc/transformer/models.txt @@ -1,7 +1,7 @@ #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer_t5 -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_encoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer_t5 # -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_encoder_layer_t5 @@ -23,7 +23,7 @@ #-b 8 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer #-b 32 -l 12 -H 12 -S 768 -s 128 -P 0 -f 3072 -m bert -#-b 1 -l 12 -H 12 -S 768 -s 128 -m T5 +-b 1 -l 3 -H 12 -S 768 -s 128 -m T5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -f 3072 -x 0 -m transformer_encoder_layer_t5 @@ -154,3 +154,10 @@ # #-b 1 -l 6 -s 512 -t 512 -H 12 -S 768 -f 3072 -m T5 #-b 1 -l 6 -s 128 -t 128 -H 12 -S 768 -f 3072 -m T5 + +# transformer tests +#-b 1 -l 6 -s 128 -t 128 -H 8 -S 512 -f 2048 -m transformer +#-b 1 -l 6 -s 512 -t 512 -H 8 -S 512 -f 2048 -m transformer +# +#-b 1 -l 6 -s 512 -t 512 -H 12 -S 768 -f 3072 -m transformer +#-b 1 -l 6 -s 128 -t 128 -H 12 -S 768 -f 3072 -m transformer \ No newline at end of file diff --git a/trc/transformer/t.config b/trc/transformer/t.config index 5cfcbc353ed..1f045260cc0 100755 --- a/trc/transformer/t.config +++ b/trc/transformer/t.config @@ -1,4 +1,4 @@ [registry] #fusion_blacklists="MultiHeadAttentionFusion" #fusion_blacklists="EncoderLayerFusion","DecoderLayerFusion" -#fusion_blacklists="DecoderLayerFusion" +fusion_blacklists="EncoderLayerFusion" diff --git a/trc/transformer/train_transformer_export.py b/trc/transformer/train_transformer_export.py index 551594ef619..ce592014e4e 100755 --- a/trc/transformer/train_transformer_export.py +++ b/trc/transformer/train_transformer_export.py @@ -345,14 +345,14 @@ def transformer_encoder_layer_t5_create(): name = "transformer_encoder_layer_t5" if (post_layernorm): print("post_layernorm") - model = T.TransformerEncoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, seq_length=seq, - num_heads=head_num, post_layernorm_residual=True, has_bias=True, hidden_act=None,attention_dropout_rate=0.0,hidden_dropout_rate=0.0) + model = T5_TF.TransformerEncoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, seq_length=seq, + num_heads=head_num, post_layernorm_residual=True, has_bias=True, hidden_act='relu') else: - model = T.TransformerEncoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, seq_length=seq, - num_heads=head_num, has_bias=True, hidden_act=None,attention_dropout_rate=0.0,hidden_dropout_rate=0.0) + model = T5_TF.TransformerEncoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, 
seq_length=seq, + num_heads=head_num, has_bias=True, hidden_act='relu') encoder_input_value = M.Tensor(np.random.normal(0., 0.5, (batch, seq, hid_size)), M.float32) encoder_input_mask = M.Tensor(np.random.normal(0., 0.5, (batch, seq, seq)), M.float32) - # pos = M.Tensor(np.random.normal(0., 0.5, (batch, head_num, seq, tgt_seq_len)), M.float32) + pos = M.Tensor(np.random.normal(0., 0.5, (batch, head_num, seq, tgt_seq_len)), M.float32) # encoder_input_value = M.Tensor(np.zeros((batch, seq, hid_size)), M.float32) # encoder_input_mask = M.Tensor(np.zeros((batch, seq, seq)), M.float32) q = model.attention.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size @@ -365,30 +365,30 @@ def transformer_encoder_layer_t5_create(): wp = model.attention.projection.weight omw = model.output.mapping.weight opw = model.output.projection.weight - gl1 = model.layernorm1.gamma - gl2 = model.layernorm2.gamma + gl1 = model.layernorm1.weight + gl2 = model.layernorm2.weight suffix = str(compute_type) suffix = suffix[-2:] saveT(encoder_input_value, name + "_input1.fp" + suffix) saveT(encoder_input_mask, name + "_input2.fp" + suffix) - # saveT(pos, name + "_input3.fp" + suffix) - # saveT(gl1, name + "_weight1.fp" + suffix) - # saveT(wt, name + "_weight2.fp" + suffix) - # saveT(wp, name + "_weight3.fp" + suffix) - # saveT(gl2, name + "_weight4.fp" + suffix) - # if ffn_fp16 == True: - # saveTensorToHalf(omw, name + "_weight5.fp" + "16") - # saveTensorToHalf(opw, name + "_weight6.fp" + "16") - # else: - # saveT(omw, name + "_weight5.fp" + suffix) - # saveT(opw, name + "_weight6.fp" + suffix) + saveT(pos, name + "_input3.fp" + suffix) + saveT(gl1, name + "_weight1.fp" + suffix) + saveT(wt, name + "_weight2.fp" + suffix) + saveT(wp, name + "_weight3.fp" + suffix) + saveT(gl2, name + "_weight4.fp" + suffix) + if ffn_fp16 == True: + saveTensorToHalf(omw, name + "_weight5.fp" + "16") + saveTensorToHalf(opw, name + "_weight6.fp" + "16") + else: + saveT(omw, name + "_weight5.fp" + suffix) + saveT(opw, name + "_weight6.fp" + suffix) _cell_graph_executor.compile(model, encoder_input_value, - encoder_input_mask) - y = model(encoder_input_value, encoder_input_mask) + encoder_input_mask,pos) + y = model(encoder_input_value, encoder_input_mask,pos) print('name=',name) - export(model, encoder_input_value, encoder_input_mask, file_name= name + "_fwd", file_format='MINDIR') + export(model, encoder_input_value, encoder_input_mask,pos, file_name= name + "_fwd", file_format='MINDIR') # if app=="ch": f_y=open(f'./{name}_output.txt','w') out_name='output1' @@ -405,11 +405,11 @@ def transformer_decoder_layer_t5_create(): name = "transformer_decoder_layer_t5" if (post_layernorm): print("post_layernorm true") - model = T.TransformerDecoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, + model = T5_TF.TransformerDecoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, tgt_seq_length=tgt_seq_len,num_heads=head_num, post_layernorm_residual=True, use_past=False, has_bias=False, hidden_act="relu") else: print("post_layernorm false") - model = T.TransformerDecoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, + model = T5_TF.TransformerDecoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, tgt_seq_length=tgt_seq_len,num_heads=head_num,use_past=False, has_bias=False, hidden_act="relu") hidden_stats = M.Tensor(np.random.normal(0., 0.5, (batch, 
tgt_seq_len, hid_size)), M.float32) decoder_mask = M.Tensor(np.random.normal(0., 0.5, (batch, seq, seq)), M.float32) @@ -458,9 +458,9 @@ def transformer_decoder_layer_t5_create(): print('omw.asnumpy().shape',omw.asnumpy().shape) opw = model.output.projection.weight - gl1 = model.layernorm1.gamma - gl2 = model.layernorm2.gamma - gl3 = model.cross_attention_layernorm.gamma + gl1 = model.layernorm1.weight + gl2 = model.layernorm2.weight + gl3 = model.cross_attention_layernorm.weight suffix = str(compute_type) suffix = suffix[-2:] @@ -485,7 +485,7 @@ def transformer_decoder_layer_t5_create(): saveT(pos, name + "_input5.fp" + suffix) saveT(encoder_pos, name + "_input6.fp" + suffix) _cell_graph_executor.compile(model, hidden_stats, decoder_mask, encoder_output, memory_mask, pos, encoder_pos) - y = model(hidden_stats, decoder_mask, encoder_output, memory_mask , self_bias=pos, encoder_attention_bias = encoder_pos) + y = model(hidden_stats, decoder_mask, encoder_output, memory_mask , pos, encoder_pos) export(model, hidden_stats, decoder_mask, encoder_output, memory_mask, pos, encoder_pos, file_name= name + "_fwd", file_format='MINDIR') if compress: y_num = y.asnumpy() -- Gitee From 9f19b7b44dd0794e23adc00f791725275bb75af8 Mon Sep 17 00:00:00 2001 From: shira zaloshinki Date: Mon, 16 Jan 2023 14:28:36 +0200 Subject: [PATCH 25/39] merge the branch --- graphengine | 2 +- mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/graphengine b/graphengine index 23600180612..b56450bde6d 160000 --- a/graphengine +++ b/graphengine @@ -1 +1 @@ -Subproject commit 236001806129e36c0f48b240c4f61b2e1d92c470 +Subproject commit b56450bde6d5afa1c557437ebf154487afe355f0 diff --git a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc index 56331b46609..c5dddc70e13 100644 --- a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc @@ -254,10 +254,6 @@ std::unordered_map DecoderLayerFusion::DefinePatterns() patterns[kPatternDecoderLayerPost] = DefinePatternDecoderLayer(true, true, false); patterns[kPatternDecoderLayerNormPre] = DefinePatternDecoderLayer(false, false, false); patterns[kPatternDecoderLayerNormPost] = DefinePatternDecoderLayer(true, false, false); - patterns[kPatternDecoderT5Pre] = DefinePatternDecoderLayer(false, false, true); - patterns[kPatternDecoderT5Post] = DefinePatternDecoderLayer(true, false, true); - patterns[kPatternDecoderLayerWhitoutMaskPre] = DefinePatternDecoderLayer(false, true, false, false); - patterns[kPatternDecoderLayerWhitoutMaskPost] = DefinePatternDecoderLayer(true, true, false, false); return patterns; } -- Gitee From bf2983f6503f64d22864c9ea81b50129231caa9c Mon Sep 17 00:00:00 2001 From: batya kroizer Date: Wed, 18 Jan 2023 10:18:57 +0200 Subject: [PATCH 26/39] code review --- .../kernel/nnacl/decoder_layer_parameter.h | 2 +- .../kernel/nnacl/infer/decoder_layer_infer.h | 9 +- .../core/load_mindir/anf_model_parser.cc | 21 ++-- mindspore/core/ops/attention.cc | 4 +- mindspore/core/ops/decoder_layer.cc | 39 +++---- mindspore/core/ops/decoder_layer.h | 14 +-- mindspore/core/ops/encoder_layer.cc | 8 +- .../ops/populate/decoder_layer_populate.cc | 1 - .../ops/populate/encoder_layer_populate.cc | 1 - .../delegate/tensorrt/op/decoder_tensorrt.cc | 4 +- .../delegate/tensorrt/op/encoder_tensorrt.cc | 11 +- .../delegate/tensorrt/op/mha_tensorrt.cc | 5 +- 
.../lite/tools/converter/anf_transform.cc | 2 +- .../optimizer/fusion/decoder_layer_fusion.cc | 103 ++++++++---------- .../optimizer/fusion/decoder_layer_fusion.h | 7 +- .../optimizer/fusion/encoder_layer_fusion.cc | 85 +++++++-------- .../optimizer/fusion/encoder_layer_fusion.h | 8 +- .../fusion/multi_head_attention_fusion.cc | 7 +- trc/transformer/cfg_bert.config | 2 +- trc/transformer/deploy.sh | 2 +- trc/transformer/ftBench.py | 10 +- trc/transformer/models.txt | 10 +- trc/transformer/t.config | 2 +- trc/transformer/train_transformer_export.py | 74 ++++++------- 24 files changed, 198 insertions(+), 233 deletions(-) mode change 100755 => 100644 mindspore/core/load_mindir/anf_model_parser.cc diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/decoder_layer_parameter.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/decoder_layer_parameter.h index 3535e5f55e2..fd0fb8c7e6b 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/decoder_layer_parameter.h +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/decoder_layer_parameter.h @@ -32,7 +32,7 @@ typedef struct DecoderLayerParameter { bool position_bias2_; bool scalar1; bool scalar2; - char* act; + char *act; } DecoderLayerParameter; #endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_NNACL_DECODER_LAYER_PARAMETER_H_ diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.h index fb93172b030..facdcc50662 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.h +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.h @@ -13,8 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef MINDSPORE_NNACL_DECODERLAYER_INFER_H -#define MINDSPORE_NNACL_DECODERLAYER_INFER_H +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_NNACL_INFER_DECODER_LAYER_INFER_H_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_NNACL_INFER_DECODER_LAYER_INFER_H_ #include "nnacl/infer/common_infer.h" @@ -23,10 +23,9 @@ extern "C" { #endif int DecoderLayerInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size, - OpParameter *parameter); + OpParameter *parameter); #ifdef __cplusplus } #endif -#endif // MINDSPORE_NNACL_DECODERLAYER_INFER_H - +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_NNACL_INFER_DECODER_LAYER_INFER_H_ diff --git a/mindspore/core/load_mindir/anf_model_parser.cc b/mindspore/core/load_mindir/anf_model_parser.cc old mode 100755 new mode 100644 index a1822d121ac..13873e302d0 --- a/mindspore/core/load_mindir/anf_model_parser.cc +++ b/mindspore/core/load_mindir/anf_model_parser.cc @@ -27,11 +27,11 @@ #include #ifdef __has_include #if __has_include() -#include -namespace fs = std :: filesystem ; +#include +namespace fs = std ::filesystem; #else -#include -namespace fs = std :: experimental :: filesystem ; +#include +namespace fs = std ::experimental ::filesystem; #endif #endif #include "ir/tensor.h" @@ -686,18 +686,17 @@ bool MSANFModelParser::GetTensorDataFromExternal(const mind_ir::TensorProto &ten if (it != tenor_data_.end()) { data = it->second.get(); } else { - fs :: path path { mindir_path_ }; + fs ::path path{mindir_path_}; std::string convv = "convv"; std::string file; std::cout << "mindir_path_=" << mindir_path_ << std::endl; if (mindir_path_.find(convv) != std::string::npos) { - file = path.root_directory ( ) . string ( ) + path.stem ( ). 
string ( ) + "_variables" + "/" + - tensor_proto . external_data (). location (); - std::cout << "file=" << file << std::endl; + file = path.root_directory().string() + path.stem().string() + "_variables" + "/" + + tensor_proto.external_data().location(); + std::cout << "file=" << file << std::endl; } else { - - file = mindir_path_ + "/" + tensor_proto.external_data().location(); - std::cout << "file=" << file << std::endl; + file = mindir_path_ + "/" + tensor_proto.external_data().location(); + std::cout << "file=" << file << std::endl; } if (mindir_dec_key_ != nullptr) { size_t plain_len; diff --git a/mindspore/core/ops/attention.cc b/mindspore/core/ops/attention.cc index 464ad1b4abc..cf03ca26f3b 100644 --- a/mindspore/core/ops/attention.cc +++ b/mindspore/core/ops/attention.cc @@ -34,9 +34,7 @@ void Attention::set_cross(bool cross) { (void)this->AddAttr(kCross, api::MakeVal void Attention::set_position_bias(bool position_bias) { (void)this->AddAttr(kPositionBias, api::MakeValue(position_bias)); } -void Attention::set_scalar(bool scalar) { - (void)this->AddAttr(kScalar, api::MakeValue(scalar)); -} +void Attention::set_scalar(bool scalar) { (void)this->AddAttr(kScalar, api::MakeValue(scalar)); } int64_t Attention::get_head_num() const { auto value_ptr = this->GetAttr(kAttentionNumHeads); return GetValue(value_ptr); diff --git a/mindspore/core/ops/decoder_layer.cc b/mindspore/core/ops/decoder_layer.cc index ea64ed3710f..771d087c410 100644 --- a/mindspore/core/ops/decoder_layer.cc +++ b/mindspore/core/ops/decoder_layer.cc @@ -23,7 +23,9 @@ namespace mindspore::ops { MIND_API_OPERATOR_IMPL(DecoderLayer, BaseOperator); -void DecoderLayer::set_head_num(int64_t head_num) { (void)this->AddAttr(kDecoderLayerNumHeads, api::MakeValue(head_num)); } +void DecoderLayer::set_head_num(int64_t head_num) { + (void)this->AddAttr(kDecoderLayerNumHeads, api::MakeValue(head_num)); +} void DecoderLayer::set_head_size(int64_t head_size) { (void)this->AddAttr(kDecoderLayerSizePerHead, api::MakeValue(head_size)); @@ -32,29 +34,28 @@ void DecoderLayer::set_head_size(int64_t head_size) { void DecoderLayer::set_post_layernorm(bool post_layernorm) { (void)this->AddAttr(kDecoderLayerPostLayernorm, api::MakeValue(post_layernorm)); } - void DecoderLayer::set_eps_layernorm1(float eps_layernorm1) { +void DecoderLayer::set_eps_layernorm1(float eps_layernorm1) { (void)this->AddAttr(kDecoderLayerEpsLayerNorm1, api::MakeValue(eps_layernorm1)); } - void DecoderLayer::set_eps_layernorm2(float eps_layernorm2) { - (void)this->AddAttr(kDecoderLayerEpsLayerNorm2, api::MakeValue(eps_layernorm2)); - +void DecoderLayer::set_eps_layernorm2(float eps_layernorm2) { + (void)this->AddAttr(kDecoderLayerEpsLayerNorm2, api::MakeValue(eps_layernorm2)); } void DecoderLayer::set_eps_layernorm3(float eps_layernorm3) { - (void)this->AddAttr(kDecoderLayerEpsLayerNorm3, api::MakeValue(eps_layernorm3)); - + (void)this->AddAttr(kDecoderLayerEpsLayerNorm3, api::MakeValue(eps_layernorm3)); } - void DecoderLayer::set_ffn_hidden_size(int64_t ffn_hidden_size){ +void DecoderLayer::set_ffn_hidden_size(int64_t ffn_hidden_size) { (void)this->AddAttr(kDecoderLayerFfnHiddenSize, api::MakeValue(ffn_hidden_size)); } -void DecoderLayer::set_position_bias1(bool position_bias1) { (void)this->AddAttr(kDecoderLayerPositionBias1, api::MakeValue(position_bias1)); } -void DecoderLayer::set_position_bias2(bool position_bias2) { (void)this->AddAttr(kDecoderLayerPositionBias2, api::MakeValue(position_bias2)); } -void DecoderLayer::set_scalar1(bool scalar1) { - 
(void)this->AddAttr(kDecoderLayerScalar1, api::MakeValue(scalar1)); +void DecoderLayer::set_position_bias1(bool position_bias1) { + (void)this->AddAttr(kDecoderLayerPositionBias1, api::MakeValue(position_bias1)); } -void DecoderLayer::set_scalar2(bool scalar2) { - (void)this->AddAttr(kDecoderLayerScalar2, api::MakeValue(scalar2)); +void DecoderLayer::set_position_bias2(bool position_bias2) { + (void)this->AddAttr(kDecoderLayerPositionBias2, api::MakeValue(position_bias2)); } +void DecoderLayer::set_scalar1(bool scalar1) { (void)this->AddAttr(kDecoderLayerScalar1, api::MakeValue(scalar1)); } +void DecoderLayer::set_scalar2(bool scalar2) { (void)this->AddAttr(kDecoderLayerScalar2, api::MakeValue(scalar2)); } void DecoderLayer::set_act(std::string act) { + MS_ASSERT(act == 'relu' || act == 'gelu' || act == 'no_act'); (void)this->AddAttr(kActivation, api::MakeValue(act)); } int64_t DecoderLayer::get_head_num() const { @@ -67,7 +68,6 @@ int64_t DecoderLayer::get_head_size() const { return GetValue(value_ptr); } - bool DecoderLayer::get_post_layernorm() const { auto value_ptr = this->GetAttr(kDecoderLayerPostLayernorm); return GetValue(value_ptr); @@ -76,11 +76,11 @@ float DecoderLayer::get_eps_layernorm1() const { auto value_ptr = this->GetAttr(kDecoderLayerEpsLayerNorm1); return GetValue(value_ptr); } -float DecoderLayer::get_eps_layernorm2() const{ +float DecoderLayer::get_eps_layernorm2() const { auto value_ptr = this->GetAttr(kDecoderLayerEpsLayerNorm2); return GetValue(value_ptr); } -float DecoderLayer::get_eps_layernorm3() const{ +float DecoderLayer::get_eps_layernorm3() const { auto value_ptr = this->GetAttr(kDecoderLayerEpsLayerNorm3); return GetValue(value_ptr); } @@ -109,8 +109,9 @@ std::string DecoderLayer::get_act() const { return GetValue(value_ptr); } -void DecoderLayer::Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, float eps_layernorm3, int64_t ffn_hidden_size, - bool position_bias1, bool position_bias2, bool post_layernorm = false, bool scalar1, bool scalar2, std::string act) { +void DecoderLayer::Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, + float eps_layernorm3, int64_t ffn_hidden_size, bool position_bias1, bool position_bias2, + bool post_layernorm = false, bool scalar1, bool scalar2, std::string act) { this->set_head_num(head_num); this->set_head_size(head_size); this->set_post_layernorm(post_layernorm); diff --git a/mindspore/core/ops/decoder_layer.h b/mindspore/core/ops/decoder_layer.h index 493f24965ad..82de065e3e6 100644 --- a/mindspore/core/ops/decoder_layer.h +++ b/mindspore/core/ops/decoder_layer.h @@ -13,8 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef LITE_MINDSPORE_LITE_TOOLS_CONVERTER_OPS_DECODERLAYER_H_ -#define LITE_MINDSPORE_LITE_TOOLS_CONVERTER_OPS_DECODERLAYER_H_ +#ifndef MINDSPORE_CORE_OPS_DECODER_LAYER_H_ +#define MINDSPORE_CORE_OPS_DECODER_LAYER_H_ #include #include #include @@ -66,13 +66,13 @@ class MIND_API DecoderLayer : public BaseOperator { /// \param[in] ffn_hidden_size Define ffn hidden size. /// \param[in] position_bias1 Define position_bias1. /// \param[in] position_bias2 Define position_bias2. - /// \param[in] scalar1 Define scalar1. + /// \param[in] scalar1 Define scalar1. /// \param[in] scalar2 Define scalar2. 
/// \param[in] act Define act void Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, float eps_layernorm3, - int64_t ffn_hidden_size, bool position_bias1, bool position_bias2, - bool post_layernorm, bool scalar1 = true, bool scalar2 = true, std::string act = "gelu"); - void set_head_num(int64_t head_num); + int64_t ffn_hidden_size, bool position_bias1, bool position_bias2, bool post_layernorm, bool scalar1 = true, + bool scalar2 = true, std::string act = "gelu"); + void set_head_num(int64_t head_num); void set_head_size(int64_t head_size); void set_post_layernorm(bool post_layernorm); void set_eps_layernorm1(float eps_layernorm1); @@ -99,4 +99,4 @@ class MIND_API DecoderLayer : public BaseOperator { }; } // namespace ops } // namespace mindspore -#endif // LITE_MINDSPORE_LITE_TOOLS_CONVERTER_OPS_ATTENTION_H_ +#endif // MINDSPORE_CORE_OPS_DECODER_LAYER_H_ diff --git a/mindspore/core/ops/encoder_layer.cc b/mindspore/core/ops/encoder_layer.cc index 3416213f8bf..f82f6e73ea0 100644 --- a/mindspore/core/ops/encoder_layer.cc +++ b/mindspore/core/ops/encoder_layer.cc @@ -46,10 +46,9 @@ void EncoderLayer::set_ffn_hidden_size(int64_t ffn_hidden_size) { void EncoderLayer::set_position_bias(bool position_bias) { (void)this->AddAttr(kPositionBias, api::MakeValue(position_bias)); } -void EncoderLayer::set_scalar(bool scalar) { - (void)this->AddAttr(kScalar, api::MakeValue(scalar)); -} +void EncoderLayer::set_scalar(bool scalar) { (void)this->AddAttr(kScalar, api::MakeValue(scalar)); } void EncoderLayer::set_act(std::string act) { + MS_ASSERT(act == 'relu' || act == 'gelu' || act == 'no_act'); (void)this->AddAttr(kActivation, api::MakeValue(act)); } int64_t EncoderLayer::get_head_num() const { @@ -91,7 +90,8 @@ std::string EncoderLayer::get_act() const { return GetValue(value_ptr); } void EncoderLayer::Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, - int64_t ffn_hidden_size, bool position_bias, bool post_layernorm = false, bool scalar = true, std::string act = "gelu") { + int64_t ffn_hidden_size, bool position_bias, bool post_layernorm = false, bool scalar = true, + std::string act = "gelu") { this->set_head_num(head_num); this->set_head_size(head_size); this->set_post_layernorm(post_layernorm); diff --git a/mindspore/lite/src/common/ops/populate/decoder_layer_populate.cc b/mindspore/lite/src/common/ops/populate/decoder_layer_populate.cc index bbafa50d8b6..fae724a3b02 100644 --- a/mindspore/lite/src/common/ops/populate/decoder_layer_populate.cc +++ b/mindspore/lite/src/common/ops/populate/decoder_layer_populate.cc @@ -49,4 +49,3 @@ OpParameter *PopulateDecoderLayerParameter(const void *prim) { REG_POPULATE(PrimitiveType_DecoderLayer, PopulateDecoderLayerParameter, SCHEMA_CUR) } // namespace lite } // namespace mindspore - diff --git a/mindspore/lite/src/common/ops/populate/encoder_layer_populate.cc b/mindspore/lite/src/common/ops/populate/encoder_layer_populate.cc index 0aedd56fdc8..ab025a50f88 100644 --- a/mindspore/lite/src/common/ops/populate/encoder_layer_populate.cc +++ b/mindspore/lite/src/common/ops/populate/encoder_layer_populate.cc @@ -47,4 +47,3 @@ OpParameter *PopulateEncoderLayerParameter(const void *prim) { REG_POPULATE(PrimitiveType_EncoderLayer, PopulateEncoderLayerParameter, SCHEMA_CUR) } // namespace lite } // namespace mindspore - diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc index 91d21774a2b..e00cc13cd4f 
100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc @@ -128,7 +128,7 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.attn2.projection_bias = !params.attn2.position_bias; params.attn2.is_cross = true; params.attn2.cublas_handle = GetCublasHandle(); - params.attn2.scalar = decoder_op->get_scalar2(); + params.attn2.scalar = decoder_op->get_scalar2(); params.attn2.mask = true; params.is_act = decoder_op->get_act().c_str(); params.has_beta = !params.attn1.position_bias; @@ -264,7 +264,7 @@ nvinfer1::DimsExprs DecoderPlugin::getOutputDimensions(int32_t index, const nvin } nvinfer1::IPluginV2DynamicExt *DecoderPlugin::clone() const noexcept { - auto *plugin = new DecoderPlugin(*this); // TODO(haim) CopyConstructor + auto *plugin = new DecoderPlugin(*this); if (plugin == nullptr) { MS_LOG(ERROR) << "plugin is null"; return nullptr; diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc index 2f03dce282a..c869cc0315f 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc @@ -111,7 +111,6 @@ int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.ffn_fp16 = is_ffn_fp16_; params.cublas_handle = GetCublasHandle(); params.hidden_size = params.head_num * params.head_size; - params.attn.head_num = encoder_op->get_head_num(); params.attn.head_size = encoder_op->get_head_size(); params.attn.cublas_handle = GetCublasHandle(); @@ -124,8 +123,8 @@ int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.has_bias = !params.attn.position_bias; params.ffn_bias = !params.attn.position_bias; params.attn.mask = true; - params.is_act = encoder_op->get_act().c_str(); - params.attn.scalar = encoder_op->get_scalar(); + params.is_act = encoder_op->get_act().c_str(); + params.attn.scalar = encoder_op->get_scalar(); auto compute_type = runtime_->GetRuntimePrecisionMode(); if (is_ffn_fp16_) { size_t start_fp16 = (params.layernorm_post) ? 
C7NUM : C9NUM; @@ -227,9 +226,7 @@ void EncoderPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, params_.attn.tgt_seq_len = request_tgt_seq_len; num_of_inputs_ = nbInputs; num_of_outputs_ = nbOutputs; - if(num_of_inputs_ == C13NUM) params_.attn.mask = false; - std::cout<<"params_.attn.mask"<setName((op_name_ + "plugin_attention").c_str()); nvinfer1::ITensor *attn_tensor = mha_layer->getOutput(0); #ifndef TEST_ @@ -266,7 +263,7 @@ nvinfer1::DimsExprs MhaPlugin::getOutputDimensions(int32_t index, const nvinfer1 } nvinfer1::IPluginV2DynamicExt *MhaPlugin::clone() const noexcept { - auto *plugin = new MhaPlugin(*this); // TODO(haim) CopyConstructor + auto *plugin = new MhaPlugin(*this); if (plugin == nullptr) { MS_LOG(ERROR) << "plugin is null"; return nullptr; diff --git a/mindspore/lite/tools/converter/anf_transform.cc b/mindspore/lite/tools/converter/anf_transform.cc index 692429d841c..bb2203bd365 100644 --- a/mindspore/lite/tools/converter/anf_transform.cc +++ b/mindspore/lite/tools/converter/anf_transform.cc @@ -334,7 +334,7 @@ int AnfTransform::RunFusionPass(const FuncGraphPtr &old_graph, const std::shared MS_CHECK_TRUE_RET(pass_ptr != nullptr, RET_ERROR); auto pass_name = pass_ptr->name(); if (param->fusion_blacklists.find(pass_name) != param->fusion_blacklists.end()) { - std::cout<< "Disable fusion: " << pass_name; + std::cout << "Disable fusion: " << pass_name; MS_LOG(INFO) << "Disable fusion: " << pass_name; continue; diff --git a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc index c8c8f304cf6..c5ad6a75225 100644 --- a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc @@ -146,7 +146,7 @@ VectorRef DecoderLayerFusion::DefineLayerNorm(VectorRef input, VarPtr gamma, Var } VectorRef DecoderLayerFusion::DefinePatternDecoderLayer(bool post_layernorm = true, bool layernorm_fusion = false, - bool is_position_bias = false, bool mask =true) const { + bool is_position_bias = false, bool mask = true) const { auto is_reshape1 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-decoder"); MS_CHECK_TRUE_RET(is_reshape1 != nullptr, {}); auto var1 = std::make_shared("var1-reshape"); @@ -155,16 +155,16 @@ VectorRef DecoderLayerFusion::DefinePatternDecoderLayer(bool post_layernorm = tr VectorRef inputs, input_cross, tuple2, tuple3, matmul2, tuple4, tuple5; if (is_position_bias) { inputs = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), - getTuple(post_layernorm, layernorm_fusion, is_position_bias), - getTuple(post_layernorm, layernorm_fusion, is_position_bias), weight_attn_qkv_, - weight_attn_o_, position_bias_}); + getTuple(post_layernorm, layernorm_fusion, is_position_bias), + getTuple(post_layernorm, layernorm_fusion, is_position_bias), weight_attn_qkv_, weight_attn_o_, + position_bias_}); } else { inputs = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), - getTuple(post_layernorm, layernorm_fusion, is_position_bias), - getTuple(post_layernorm, layernorm_fusion, is_position_bias), weight_attn_qkv_, - weight_attn_o_, bias_attn_qkv_, bias_attn_o_}); + getTuple(post_layernorm, layernorm_fusion, is_position_bias), + getTuple(post_layernorm, layernorm_fusion, is_position_bias), weight_attn_qkv_, weight_attn_o_, + bias_attn_qkv_, bias_attn_o_}); } - if(mask)inputs.push_back(mask_); + if (mask) inputs.push_back(mask_); auto 
attention = VectorRef(inputs); // return attention; if (is_position_bias) { @@ -193,12 +193,12 @@ VectorRef DecoderLayerFusion::DefinePatternDecoderLayer(bool post_layernorm = tr auto reshape2 = VectorRef({is_reshape2, encoder_output_, var2}); if (is_position_bias) { input_cross = VectorRef({is_attention_cross_, tuple2, reshape2, reshape2, weight_attn_q_, weight_attn_kv_, - weight_attn_cross_o_, position_bias_cross_}); + weight_attn_cross_o_, position_bias_cross_}); } else { input_cross = VectorRef({is_attention_cross_, tuple2, reshape2, reshape2, weight_attn_q_, weight_attn_kv_, - weight_attn_cross_o_, bias_attn_cross_qkv_, bias_attn_cross_o_}); + weight_attn_cross_o_, bias_attn_cross_qkv_, bias_attn_cross_o_}); } - if(mask)input_cross.push_back(cross_mask_); + if (mask) input_cross.push_back(cross_mask_); auto attention_cross = VectorRef(input_cross); if (is_position_bias) { tuple5 = attention_cross; @@ -267,26 +267,23 @@ AnfNodePtr DecoderLayerFusion::Process(const std::string &pattern_name, const mi if (func_graph == nullptr || node == nullptr || equiv == nullptr) { return nullptr; } - std::cout << "found pattern=" << pattern_name << std::endl; if (pattern_name == kPatternDecoderT5Pre || pattern_name == kPatternDecoderT5Post) { is_position_bias_ = true; - } else if (pattern_name == kPatternDecoderLayerPre || pattern_name == kPatternDecoderLayerPost || - pattern_name == kPatternDecoderLayerWhitoutMaskPre|| pattern_name == kPatternDecoderLayerWhitoutMaskPost) { + } + if (pattern_name == kPatternDecoderLayerPre || pattern_name == kPatternDecoderLayerPost || + pattern_name == kPatternDecoderLayerWhitoutMaskPre || pattern_name == kPatternDecoderLayerWhitoutMaskPost) { is_layernorm_fusion_ = true; } - if (pattern_name == kPatternDecoderLayerPre || pattern_name == kPatternDecoderT5Pre || - pattern_name == kPatternDecoderLayerNormPre) { - return CreateMaskedDecoderLayerFusionNode(func_graph, equiv, node, false, true); - } else if (pattern_name == kPatternDecoderLayerPost || pattern_name == kPatternDecoderT5Post || - pattern_name == kPatternDecoderLayerNormPost) { - return CreateMaskedDecoderLayerFusionNode(func_graph, equiv, node, true, true); + bool mask = true; + bool post_layernorm = false; + if (pattern_name == kPatternDecoderLayerPost || pattern_name == kPatternDecoderT5Post || + pattern_name == kPatternDecoderLayerNormPost || pattern_name == kPatternDecoderLayerWhitoutMaskPost) { + post_layernorm = true; } - if (pattern_name == kPatternDecoderLayerWhitoutMaskPre) - return CreateMaskedDecoderLayerFusionNode(func_graph, equiv, node, false, false); - if (pattern_name == kPatternDecoderLayerWhitoutMaskPost) - return CreateMaskedDecoderLayerFusionNode(func_graph, equiv, node, true, false); - return nullptr; -} + if (pattern_name == kPatternDecoderLayerWhitoutMaskPre || pattern_name == kPatternDecoderLayerWhitoutMaskPost) + mask = false; + return CreateMaskedDecoderLayerFusionNode(func_graph, equiv, node, post_layernorm, mask); +} // namespace mindspore::opt bool DecoderLayerFusion::IsActGELU(const FuncGraphPtr &func_graph, const EquivPtr &equiv) const { auto act_input = GetAttribute(func_graph, equiv, is_act_); @@ -377,7 +374,7 @@ STATUS DecoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const Eq if (attn_cross_prim->GetAttr(ops::kPositionBias) != nullptr) { *is_position_bias2 = attn_cross_prim->get_position_bias(); } - if (attn_cross_prim->GetAttr(ops::kScalar) != nullptr) { + if (attn_cross_prim->GetAttr(ops::kScalar) != nullptr) { *scalar2 = attn_cross_prim->get_scalar(); } 
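// Reading aid for the Process() dispatch above (derived directly from the branches shown there):
//   kPatternDecoderT5Pre / kPatternDecoderT5Post                 -> is_position_bias_ = true
//   kPatternDecoderLayerPre / Post and the WhitoutMask variants  -> is_layernorm_fusion_ = true
//   any *Post pattern                                            -> post_layernorm = true
//   kPatternDecoderLayerWhitoutMaskPre / Post                    -> mask = false
// So, for example, the no-mask post-layernorm pattern resolves to:
//   CreateMaskedDecoderLayerFusionNode(func_graph, equiv, node, /*post_layernorm=*/true, /*mask=*/false);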
if (is_layernorm_fusion_) { @@ -414,10 +411,9 @@ STATUS DecoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const Eq if (!IsActGELU(func_graph, equiv)) { return RET_ERROR; } - act_ = "gelu"; - } - else{ - act_= "relu"; + act_ = "gelu"; + } else { + act_ = "relu"; } return RET_OK; } @@ -448,8 +444,8 @@ std::shared_ptr DecoderLayerFusion::CreatePrim(const FuncGrap } CNodePtr DecoderLayerFusion::CreateMaskedDecoderLayerFusionNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv, - const AnfNodePtr &node, - bool post_layernorm = true, bool mask = true) const { + const AnfNodePtr &node, bool post_layernorm = true, + bool mask = true) const { MS_ASSERT(func_graph != nullptr); MS_ASSERT(equiv != nullptr); MS_ASSERT(node != nullptr); @@ -500,34 +496,21 @@ CNodePtr DecoderLayerFusion::CreateMaskedDecoderLayerFusionNode(const FuncGraphP MS_CHECK_TRUE_RET(value_node != nullptr, nullptr); std::vector new_node_inputs = {value_node, input, gamma1}; if (is_position_bias_) { - new_node_inputs.insert(new_node_inputs.end(),{weight_qkv}); - if(mask)new_node_inputs.push_back(input_mask); - new_node_inputs.insert(new_node_inputs.end(),{position_bias, weight_attn_o, - gamma2, encoder_output, weight_attn_q, weight_attn_kv}); - if(mask)new_node_inputs.push_back(cross_mask); - new_node_inputs.insert(new_node_inputs.end(),{position_bias_cross, weight_attn_cross_o, - gamma3, weight_m, weight_p}); + new_node_inputs.insert(new_node_inputs.end(), {weight_qkv}); + if (mask) new_node_inputs.push_back(input_mask); + new_node_inputs.insert(new_node_inputs.end(), + {position_bias, weight_attn_o, gamma2, encoder_output, weight_attn_q, weight_attn_kv}); + if (mask) new_node_inputs.push_back(cross_mask); + new_node_inputs.insert(new_node_inputs.end(), + {position_bias_cross, weight_attn_cross_o, gamma3, weight_m, weight_p}); } else { - new_node_inputs.insert(new_node_inputs.end(),{beta1, weight_qkv, bias_attn_qkv}); - if(mask)new_node_inputs.push_back(input_mask); - new_node_inputs.insert(new_node_inputs.end(),{ - weight_attn_o, - bias_attn_o, - gamma2, - beta2, - encoder_output, - weight_attn_q, - weight_attn_kv, - bias_attn_cross_qkv}); - if(mask)new_node_inputs.push_back(cross_mask); - new_node_inputs.insert(new_node_inputs.end(),{weight_attn_cross_o, - bias_attn_cross_o, - gamma3, - beta3, - weight_m, - bias_m, - weight_p, - bias_p}); + new_node_inputs.insert(new_node_inputs.end(), {beta1, weight_qkv, bias_attn_qkv}); + if (mask) new_node_inputs.push_back(input_mask); + new_node_inputs.insert(new_node_inputs.end(), {weight_attn_o, bias_attn_o, gamma2, beta2, encoder_output, + weight_attn_q, weight_attn_kv, bias_attn_cross_qkv}); + if (mask) new_node_inputs.push_back(cross_mask); + new_node_inputs.insert(new_node_inputs.end(), + {weight_attn_cross_o, bias_attn_cross_o, gamma3, beta3, weight_m, bias_m, weight_p, bias_p}); } auto new_node = func_graph->NewCNode(new_node_inputs); MS_CHECK_TRUE_RET(new_node != nullptr, nullptr); diff --git a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h index 97669b68075..c9b7a2273b0 100644 --- a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h +++ b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h @@ -45,7 +45,8 @@ class DecoderLayerFusion : public MultiplePatternProcessPass { virtual bool Init() const; private: - VectorRef DefinePatternDecoderLayer(bool post_layernorm, bool layernorm_fusion, bool is_position_bias, bool mask) const; + VectorRef DefinePatternDecoderLayer(bool 
post_layernorm, bool layernorm_fusion, bool is_position_bias, + bool mask) const; VectorRef getTuple(bool post_layernorm, bool layernorm_fusion, bool is_position_bias) const; VectorRef DefineLayerNorm(VectorRef input, VarPtr gamma, VarPtr beta, VarPtr eps) const; CNodePtr CreateMaskedDecoderLayerFusionNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv, @@ -53,8 +54,8 @@ class DecoderLayerFusion : public MultiplePatternProcessPass { std::shared_ptr CreatePrim(const FuncGraphPtr &func_graph, const EquivPtr &equiv, bool post_layernorm, int64_t ffn_hidden_size) const; lite::STATUS CheckPattern(const FuncGraphPtr &func_graph, const EquivPtr &equiv, int *head_num, int *head_size, - float *eps1, float *eps2, float *eps3, bool *is_position_bias1, - bool *is_position_bias2, bool *scalar1, bool *scalar2) const; + float *eps1, float *eps2, float *eps3, bool *is_position_bias1, bool *is_position_bias2, + bool *scalar1, bool *scalar2) const; AnfNodePtr GetAttribute(const FuncGraphPtr &func_graph, const EquivPtr &equiv, VarPtr node_name) const; bool IsActGELU(const FuncGraphPtr &func_graph, const EquivPtr &equiv) const; lite::STATUS GetEps(const EquivPtr &equiv, VarPtr node_name, float *eps) const; diff --git a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc index f722513b7f8..b1331b2ca68 100644 --- a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc @@ -139,8 +139,8 @@ VectorRef EncoderLayerFusion::DefineLayerNorm(bool is_position_bias, VectorRef i } VectorRef EncoderLayerFusion::DefinePatternEncoderLayer(bool post_layernorm = true, bool layernorm_fusion = false, - bool is_position_bias = false, bool mask =true) const { - VectorRef tuple, tuple2, tuple3, reshape2, matmul1,inputs; + bool is_position_bias = false, bool mask = true) const { + VectorRef tuple, tuple2, tuple3, reshape2, matmul1, inputs; auto is_reshape1 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-encoder"); MS_CHECK_TRUE_RET(is_reshape1 != nullptr, {}); auto var1 = std::make_shared("var1"); @@ -148,16 +148,16 @@ VectorRef EncoderLayerFusion::DefinePatternEncoderLayer(bool post_layernorm = tr auto reshape1 = VectorRef({is_reshape1, input_, var1}); if (!is_position_bias) { inputs = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), - getTuple(post_layernorm, layernorm_fusion, is_position_bias), - getTuple(post_layernorm, layernorm_fusion, is_position_bias), weight_attn_qkv_, - weight_attn_o_, bias_attn_qkv_, bias_attn_o_}); + getTuple(post_layernorm, layernorm_fusion, is_position_bias), + getTuple(post_layernorm, layernorm_fusion, is_position_bias), weight_attn_qkv_, weight_attn_o_, + bias_attn_qkv_, bias_attn_o_}); } else { inputs = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), - getTuple(post_layernorm, layernorm_fusion, is_position_bias), - getTuple(post_layernorm, layernorm_fusion, is_position_bias), weight_attn_qkv_, - weight_attn_o_, position_bias_}); + getTuple(post_layernorm, layernorm_fusion, is_position_bias), + getTuple(post_layernorm, layernorm_fusion, is_position_bias), weight_attn_qkv_, weight_attn_o_, + position_bias_}); } - if(mask)inputs.push_back(mask_); + if (mask) inputs.push_back(mask_); auto attention = VectorRef(inputs); // return attention; if (!is_position_bias) { @@ -247,8 +247,6 @@ std::unordered_map EncoderLayerFusion::DefinePatterns() 
patterns[kPatternTEncoderLayerPreNorm] = DefinePatternEncoderLayer(false, true); patterns[kPatternEncoderLayerT5Pre] = DefinePatternEncoderLayer(false, false, true); patterns[kPatternEncoderLayerT5Post] = DefinePatternEncoderLayer(true, false, true); - patterns[kPatternTEncoderLayerWhitoutMaskPre] = DefinePatternEncoderLayer(false, false, false, false); - patterns[kPatternTEncoderLayerWhitoutMaskPost] = DefinePatternEncoderLayer(true, false, false, false); patterns[kPatternTEncoderLayerWhitoutMaskPostNorm] = DefinePatternEncoderLayer(true, true, false, false); patterns[kPatternTEncoderLayerWhitoutMaskPreNorm] = DefinePatternEncoderLayer(false, true, false, false); return patterns; @@ -259,25 +257,20 @@ AnfNodePtr EncoderLayerFusion::Process(const std::string &pattern_name, const mi if (func_graph == nullptr || node == nullptr || equiv == nullptr) { return nullptr; } - std::cout << "found pattern=" << pattern_name << std::endl; - if (pattern_name == kPatternTEncoderLayerPostNorm || pattern_name == kPatternTEncoderLayerPreNorm) - is_layernorm_fusion_ = true; - if (pattern_name == kPatternTEncoderLayerWhitoutMaskPostNorm || pattern_name == kPatternTEncoderLayerWhitoutMaskPreNorm) + if (pattern_name == kPatternTEncoderLayerPostNorm || pattern_name == kPatternTEncoderLayerPreNorm || + pattern_name == kPatternTEncoderLayerWhitoutMaskPostNorm || + pattern_name == kPatternTEncoderLayerWhitoutMaskPreNorm) is_layernorm_fusion_ = true; if (pattern_name == kPatternEncoderLayerT5Pre || pattern_name == kPatternEncoderLayerT5Post) is_position_bias_ = true; + bool mask = true; + bool post_layernorm = false; if (pattern_name == kPatternTEncoderLayerPost || pattern_name == kPatternTEncoderLayerPostNorm || - pattern_name == kPatternEncoderLayerT5Post) - return CreateMaskedEncoderLayerFusionNode(func_graph, equiv, node,true, true); - if (pattern_name == kPatternTEncoderLayerWhitoutMaskPostNorm || pattern_name == kPatternTEncoderLayerWhitoutMaskPost) - { - return CreateMaskedEncoderLayerFusionNode(func_graph, equiv, node, true, false); - } - if( pattern_name == kPatternTEncoderLayerWhitoutMaskPre || pattern_name == kPatternTEncoderLayerWhitoutMaskPreNorm) - return CreateMaskedEncoderLayerFusionNode(func_graph, equiv, node, false, false); - else if (pattern_name == kPatternTEncoderLayerPre || pattern_name == kPatternTEncoderLayerPreNorm || - pattern_name == kPatternEncoderLayerT5Pre) - return CreateMaskedEncoderLayerFusionNode(func_graph, equiv, node,false, true); - return nullptr; + pattern_name == kPatternEncoderLayerT5Post || pattern_name == kPatternTEncoderLayerWhitoutMaskPostNorm) + post_layernorm = true; + if (pattern_name == kPatternTEncoderLayerWhitoutMaskPostNorm || + pattern_name == kPatternTEncoderLayerWhitoutMaskPreNorm) + mask = false; + return CreateMaskedEncoderLayerFusionNode(func_graph, equiv, node, post_layernorm, mask); } bool EncoderLayerFusion::IsActGELU(const FuncGraphPtr &func_graph, const EquivPtr &equiv, @@ -356,7 +349,7 @@ STATUS EncoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const Eq if (attn_prim->GetAttr(ops::kPositionBias) != nullptr) { is_position_bias_ = attn_prim->get_position_bias(); } - if (attn_prim->GetAttr(ops::kScalar) != nullptr) { + if (attn_prim->GetAttr(ops::kScalar) != nullptr) { *scalar = attn_prim->get_scalar(); } if (is_layernorm_fusion_) { @@ -385,10 +378,9 @@ STATUS EncoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const Eq if (!IsActGELU(func_graph, equiv, is_act_)) { return RET_ERROR; } - act_= "gelu"; - } - else{ - act_= 
"relu"; + act_ = "gelu"; + } else { + act_ = "relu"; } return RET_OK; } @@ -408,13 +400,14 @@ std::shared_ptr EncoderLayerFusion::CreatePrim(const FuncGrap if (CheckPattern(func_graph, equiv, &head_num, &head_size, &eps1, &eps2, &scalar)) { return nullptr; } - encoder_layer_prim->Init(head_num, head_size, eps1, eps2, ffn_hidden_size, is_position_bias_, post_layernorm, scalar ,act_); + encoder_layer_prim->Init(head_num, head_size, eps1, eps2, ffn_hidden_size, is_position_bias_, post_layernorm, scalar, + act_); return encoder_layer_prim; } CNodePtr EncoderLayerFusion::CreateMaskedEncoderLayerFusionNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv, - const AnfNodePtr &node, - bool post_layernorm, bool mask) const { + const AnfNodePtr &node, bool post_layernorm, + bool mask) const { MS_ASSERT(func_graph != nullptr); MS_ASSERT(equiv != nullptr); MS_ASSERT(node != nullptr); @@ -451,20 +444,20 @@ CNodePtr EncoderLayerFusion::CreateMaskedEncoderLayerFusionNode(const FuncGraphP std::vector new_node_inputs = {value_node, input}; if (is_position_bias_) { position_bias = utils::cast((*equiv)[position_bias_]); - new_node_inputs.insert(new_node_inputs.end(),{gamma1, weight_qkv}); - if(mask)new_node_inputs.push_back(input_mask); - new_node_inputs.insert(new_node_inputs.end(),{position_bias, weight_attn_o, gamma2, weight_m, weight_p}); + new_node_inputs.insert(new_node_inputs.end(), {gamma1, weight_qkv}); + if (mask) new_node_inputs.push_back(input_mask); + new_node_inputs.insert(new_node_inputs.end(), {position_bias, weight_attn_o, gamma2, weight_m, weight_p}); } else { if (!post_layernorm) { - new_node_inputs.insert(new_node_inputs.end(),{gamma1, beta1,weight_qkv, bias_attn_qkv}); - if(mask)new_node_inputs.push_back(input_mask); - new_node_inputs.insert(new_node_inputs.end(),{weight_attn_o, - bias_attn_o, gamma2, beta2, weight_m, bias_m,weight_p, bias_p}); + new_node_inputs.insert(new_node_inputs.end(), {gamma1, beta1, weight_qkv, bias_attn_qkv}); + if (mask) new_node_inputs.push_back(input_mask); + new_node_inputs.insert(new_node_inputs.end(), + {weight_attn_o, bias_attn_o, gamma2, beta2, weight_m, bias_m, weight_p, bias_p}); } else { - new_node_inputs.insert(new_node_inputs.end(),{weight_qkv, bias_attn_qkv}); - if(mask)new_node_inputs.push_back(input_mask); - new_node_inputs.insert(new_node_inputs.end(),{weight_attn_o, bias_attn_o, gamma1,beta1, weight_m, - bias_m, weight_p, bias_p, gamma2, beta2}); + new_node_inputs.insert(new_node_inputs.end(), {weight_qkv, bias_attn_qkv}); + if (mask) new_node_inputs.push_back(input_mask); + new_node_inputs.insert(new_node_inputs.end(), {weight_attn_o, bias_attn_o, gamma1, beta1, weight_m, bias_m, + weight_p, bias_p, gamma2, beta2}); } } auto new_node = func_graph->NewCNode(new_node_inputs); diff --git a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h index 4cdd4d65640..f388f16f35b 100644 --- a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h +++ b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h @@ -51,16 +51,16 @@ class EncoderLayerFusion : public MultiplePatternProcessPass { const std::string kPatternTEncoderLayerPreNorm = "PatternTEncoderLayerPreNorm"; const std::string kPatternEncoderLayerT5Pre = "PatternEncoderLayerT5Pre"; const std::string kPatternEncoderLayerT5Post = "PatternEncoderLayerT5Post"; - const std::string kPatternTEncoderLayerWhitoutMaskPre = "kPatternTEncoderLayerWhitoutMaskPre"; - const std::string kPatternTEncoderLayerWhitoutMaskPost = 
"kPatternTEncoderLayerWhitoutMaskPost"; const std::string kPatternTEncoderLayerWhitoutMaskPostNorm = "kPatternTEncoderLayerWhitoutMaskPostNorm"; const std::string kPatternTEncoderLayerWhitoutMaskPreNorm = "kPatternTEncoderLayerWhitoutMaskPreNorm"; - VectorRef DefinePatternEncoderLayer(bool post_layernorm, bool layernorm_fusion, bool is_position_bias_, bool mask) const; + VectorRef DefinePatternEncoderLayer(bool post_layernorm, bool layernorm_fusion, bool is_position_bias_, + bool mask) const; VectorRef getTuple(bool post_layernorm, bool layernorm_fusion, bool is_position_bias) const; VectorRef DefineLayerNorm(bool is_position_bias, VectorRef input, VarPtr gamma, VarPtr beta, VarPtr eps) const; CNodePtr CreateMaskedEncoderLayerFusionNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv, - const AnfNodePtr &node, bool post_layernorm = true, bool mask = true) const; + const AnfNodePtr &node, bool post_layernorm = true, + bool mask = true) const; AnfNodePtr GetAttribute(const FuncGraphPtr &func_graph, const EquivPtr &equiv, VarPtr node_name) const; bool IsActGELU(const FuncGraphPtr &func_graph, const EquivPtr &equiv, const VarPtr &input_prim) const; lite::STATUS GetEps(const EquivPtr &equiv, VarPtr node_name, float *eps) const; diff --git a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc index 3d532a3523d..afb4873b47e 100644 --- a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc @@ -399,7 +399,7 @@ VectorRef MultiHeadAttentionFusion::DefineMPWithMaskPatternPA(bool mask) const { MS_CHECK_TRUE_RET(is_matmul1 != nullptr, {}); auto matmul1 = VectorRef({is_matmul1, q_embedding, k_embedding}); VectorRef softmax; - if (mask) { + if (mask) { auto is_add = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion)); MS_CHECK_TRUE_RET(is_add != nullptr, {}); auto mask = DefineMask(mask_); @@ -621,8 +621,6 @@ AnfNodePtr MultiHeadAttentionFusion::Process(const std::string &pattern_name, co if (func_graph == nullptr || node == nullptr || equiv == nullptr) { return nullptr; } - std::cout << "found pattern=" << pattern_name << std::endl; - if ((pattern_name == kMPAWithMaskPatternName) || (pattern_name == kMPAWithMaskPatternNamePA) || (pattern_name == kMPAWithMaskPatternNameT5) || (pattern_name == kMPAWithMaskPatternNameT5New) || (pattern_name == kMPAWithMaskTransposePatternNameT5New) || (pattern_name == kMPAWithMaskPatternNameT5New2)) { @@ -633,7 +631,8 @@ AnfNodePtr MultiHeadAttentionFusion::Process(const std::string &pattern_name, co } return CreateMaskedMultiHeadAttentionNode(func_graph, equiv, node->fullname_with_scope(), true); } - if (pattern_name == kMPAPatternName || pattern_name == kMPAPatternNameSwin1 || pattern_name == kMPAPatternNameSwin2 || pattern_name == kMPAPatternNamePA) + if (pattern_name == kMPAPatternName || pattern_name == kMPAPatternNameSwin1 || pattern_name == kMPAPatternNameSwin2 || + pattern_name == kMPAPatternNamePA) return CreateMaskedMultiHeadAttentionNode(func_graph, equiv, node->fullname_with_scope(), false); return nullptr; } diff --git a/trc/transformer/cfg_bert.config b/trc/transformer/cfg_bert.config index b496e5915bc..099dd20effc 100755 --- a/trc/transformer/cfg_bert.config +++ b/trc/transformer/cfg_bert.config @@ -1,2 +1,2 @@ [gpu_context] -input_shape=input_ids:[mha_T5_cross,128];token_type_ids:[mha_T5_cross,128];input_mask:[mha_T5_cross,128] 
+input_shape=input_ids:[T5,128];token_type_ids:[T5,128];input_mask:[T5,128] diff --git a/trc/transformer/deploy.sh b/trc/transformer/deploy.sh index 2dcf4c5c5fd..059bbbd0192 100755 --- a/trc/transformer/deploy.sh +++ b/trc/transformer/deploy.sh @@ -59,7 +59,7 @@ fi # command+="--configFile=cfg_${model_name}.config " # fi command+="--device=GPU " -#command+="--enableFp16=true" +# command+="--enableFp16=true" echo command=${command} echo ${command} > execute.sh rsync -v execute.sh ${server}:${PWD} diff --git a/trc/transformer/ftBench.py b/trc/transformer/ftBench.py index c8096af4da2..cc0cfcebbb1 100755 --- a/trc/transformer/ftBench.py +++ b/trc/transformer/ftBench.py @@ -14,13 +14,12 @@ benchmark = f'{system}/tools/benchmark' work_dir=f'{base}/trc/transformer' image = "private_transformer:0.1" server = "caspi" -server = "caspi" enable_fp16 = "false" suffix="fp32" usage='enter the correct parameters: app=ch\\trc, act=runtime\\be, loop count=int>=0, server=local\\num of server\nif app=trc and act=be loop count must be 1' app='ch' act='be' -cuda_visible_dev=5 +cuda_visible_dev=6 loop_count=1 if len(sys.argv)>2 or len(sys.argv)==1: parameters=sys.argv[1:] @@ -73,7 +72,7 @@ def find_output_name(ms_model, output_file): with open(output_file, 'w') as file: file.write(data) print(outpus_name) -numcount=3 +numcount=0 for line_model_arg in models_arg: if line_model_arg[0] == '#' or line_model_arg == '\n': continue line_model_arg=line_model_arg[:-1] @@ -131,9 +130,10 @@ for line_model_arg in models_arg: os.system(f"ssh {server} 'cd {system}/.. && tar -xzf {system}/../mindspore-lite-{version}-linux-x64.tar.gz'") os.system(f"rsync -v {base}/trc/transformer/*{model_name}* {server}:{base}/trc/transformer/") os.system(f"./deploy.sh convv_{model_name}_fwd.mindir") + os.system(f"./deploy.sh convv_{model_name}_fwd.mindir run") # os.system(f"ssh {server} 'cd {benchmark} && CUDA_VISIBLE_DEVICES={cuda_visible_dev} LD_LIBRARY_PATH={system}/runtime/lib:{system}/tools/converter/lib ./benchmark {benchmark_args}'" ) - os.system(f"mkdir {base}/trc/transformer/{model_name}{numcount}") - os.system(f"cp {base}/trc/transformer/{model_name}* {base}/trc/transformer/{model_name}{numcount}/") + # os.system(f"mkdir {base}/trc/transformer/{model_name}{numcount}") + # os.system(f"cp {base}/trc/transformer/{model_name}* {base}/trc/transformer/{model_name}{numcount}/") numcount+=1 diff --git a/trc/transformer/models.txt b/trc/transformer/models.txt index 688e44cc80f..f83557877bd 100755 --- a/trc/transformer/models.txt +++ b/trc/transformer/models.txt @@ -1,7 +1,7 @@ #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer_t5 # -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_encoder_layer_t5 @@ -23,7 +23,7 @@ #-b 8 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer #-b 32 -l 12 -H 12 -S 768 -s 128 -P 0 -f 3072 -m bert --b 1 -l 3 -H 12 -S 768 -s 128 -m T5 +#-b 1 -l 3 -H 12 -S 768 -s 128 -m T5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -f 3072 -x 0 -m transformer_encoder_layer_t5 @@ -91,7 +91,7 @@ #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -m transformer_decoder_layer #-b 1 -l 2 -H 2 -S 8 -s 20 -f 1024 -P True -m bert -#-b 1 -l 12 
-H 2 -S 8 -s 20 -m T5 +#-b 1 -l 2 -H 2 -S 2 -s 128 -m T5 #-b 1 -l 2 -H 2 -S 8 -s 20 -f 1024 -P True -m bert #-b 1 -l 12 -H 12 -S 768 -s 128 -m bert @@ -149,11 +149,11 @@ # T5 tests -#-b 1 -l 6 -s 128 -t 128 -H 8 -S 512 -f 2048 -m T5 +-b 1 -l 6 -s 128 -t 128 -H 8 -S 512 -f 2048 -m T5 #-b 1 -l 6 -s 512 -t 512 -H 8 -S 512 -f 2048 -m T5 # -#-b 1 -l 6 -s 512 -t 512 -H 12 -S 768 -f 3072 -m T5 #-b 1 -l 6 -s 128 -t 128 -H 12 -S 768 -f 3072 -m T5 +#-b 1 -l 6 -s 512 -t 512 -H 12 -S 768 -f 3072 -m T5 # transformer tests #-b 1 -l 6 -s 128 -t 128 -H 8 -S 512 -f 2048 -m transformer diff --git a/trc/transformer/t.config b/trc/transformer/t.config index 1f045260cc0..948d874e198 100755 --- a/trc/transformer/t.config +++ b/trc/transformer/t.config @@ -1,4 +1,4 @@ [registry] #fusion_blacklists="MultiHeadAttentionFusion" #fusion_blacklists="EncoderLayerFusion","DecoderLayerFusion" -fusion_blacklists="EncoderLayerFusion" +#fusion_blacklists="EncoderLayerFusion" diff --git a/trc/transformer/train_transformer_export.py b/trc/transformer/train_transformer_export.py index ce592014e4e..0f6b4b243f5 100755 --- a/trc/transformer/train_transformer_export.py +++ b/trc/transformer/train_transformer_export.py @@ -405,11 +405,11 @@ def transformer_decoder_layer_t5_create(): name = "transformer_decoder_layer_t5" if (post_layernorm): print("post_layernorm true") - model = T5_TF.TransformerDecoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, + model = T.TransformerDecoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, tgt_seq_length=tgt_seq_len,num_heads=head_num, post_layernorm_residual=True, use_past=False, has_bias=False, hidden_act="relu") else: print("post_layernorm false") - model = T5_TF.TransformerDecoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, + model = T.TransformerDecoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, tgt_seq_length=tgt_seq_len,num_heads=head_num,use_past=False, has_bias=False, hidden_act="relu") hidden_stats = M.Tensor(np.random.normal(0., 0.5, (batch, tgt_seq_len, hid_size)), M.float32) decoder_mask = M.Tensor(np.random.normal(0., 0.5, (batch, seq, seq)), M.float32) @@ -437,47 +437,47 @@ def transformer_decoder_layer_t5_create(): encoder_pos_value = encoder_pos.asnumpy() encoder_pos_value[:,:,actual_seq:] = 0 encoder_pos = M.Tensor.from_numpy(encoder_pos_value) - q = model.attention.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size - k = model.attention.dense2.weight.asnumpy()#.transpose() - v = model.attention.dense3.weight.asnumpy()#.transpose() + # q = model.attention.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size + # k = model.attention.dense2.weight.asnumpy()#.transpose() + # v = model.attention.dense3.weight.asnumpy()#.transpose() - w = np.concatenate((q, k, v)) # 3xhid_size x hid_size - w = w.transpose() # hid_size x 3xhid_size - wt = M.Tensor(w, w_compute_type) - wp = model.attention.projection.weight + # w = np.concatenate((q, k, v)) # 3xhid_size x hid_size + # w = w.transpose() # hid_size x 3xhid_size + # wt = M.Tensor(w, w_compute_type) + # wp = model.attention.projection.weight - qt2 = model.cross_attention.dense1.weight#.transpose() # hid_size x hid_size - k2 = model.cross_attention.dense2.weight.asnumpy()#.transpose() - v2 = model.cross_attention.dense3.weight.asnumpy()#.transpose() + # qt2 = model.cross_attention.dense1.weight#.transpose() # hid_size x 
hid_size + # k2 = model.cross_attention.dense2.weight.asnumpy()#.transpose() + # v2 = model.cross_attention.dense3.weight.asnumpy()#.transpose() - w2 = np.concatenate((k2, v2)) # 2xhid_size x hid_size - w2 = w2.transpose() # hid_size x 2xhid_size - wt2 = M.Tensor(w2, w_compute_type) - wp2 = model.cross_attention.projection.weight - omw = model.output.mapping.weight - print('omw.asnumpy().shape',omw.asnumpy().shape) - opw = model.output.projection.weight - - gl1 = model.layernorm1.weight - gl2 = model.layernorm2.weight - gl3 = model.cross_attention_layernorm.weight + # w2 = np.concatenate((k2, v2)) # 2xhid_size x hid_size + # w2 = w2.transpose() # hid_size x 2xhid_size + # wt2 = M.Tensor(w2, w_compute_type) + # wp2 = model.cross_attention.projection.weight + # omw = model.output.mapping.weight + # print('omw.asnumpy().shape',omw.asnumpy().shape) + # opw = model.output.projection.weight + + # gl1 = model.layernorm1.weight + # gl2 = model.layernorm2.weight + # gl3 = model.cross_attention_layernorm.weight suffix = str(compute_type) suffix = suffix[-2:] - saveT(gl1, name + "_weight1.fp" + suffix) - saveT(wt, name + "_weight2.fp" + suffix) - saveT(wp, name + "_weight3.fp" + suffix) - saveT(gl2, name + "_weight4.fp" + suffix) - saveT(qt2, name + "_weight5.fp" + suffix) - saveT(wt2, name + "_weight6.fp" + suffix) - saveT(wp2, name + "_weight7.fp" + suffix) - saveT(gl3, name + "_weight8.fp" + suffix) - if(ffn_fp16): - saveTensorToHalf(omw, name + "_weight9.fp" + "16") - saveTensorToHalf(opw, name + "_weight10.fp" + "16") - else: - saveT(omw, name + "_weight9.fp" + suffix) - saveT(opw, name + "_weight10.fp" + suffix) + # saveT(gl1, name + "_weight1.fp" + suffix) + # saveT(wt, name + "_weight2.fp" + suffix) + # saveT(wp, name + "_weight3.fp" + suffix) + # saveT(gl2, name + "_weight4.fp" + suffix) + # saveT(qt2, name + "_weight5.fp" + suffix) + # saveT(wt2, name + "_weight6.fp" + suffix) + # saveT(wp2, name + "_weight7.fp" + suffix) + # saveT(gl3, name + "_weight8.fp" + suffix) + # if(ffn_fp16): + # saveTensorToHalf(omw, name + "_weight9.fp" + "16") + # saveTensorToHalf(opw, name + "_weight10.fp" + "16") + # else: + # saveT(omw, name + "_weight9.fp" + suffix) + # saveT(opw, name + "_weight10.fp" + suffix) saveT(hidden_stats, name + "_input1.fp" + suffix) saveT(decoder_mask, name + "_input2.fp" + suffix) saveT(encoder_output, name + "_input3.fp" + suffix) -- Gitee From 8034d5d822058a1dc2d0d66bd1a895add66fa948 Mon Sep 17 00:00:00 2001 From: batya kroizer Date: Wed, 18 Jan 2023 11:22:31 +0200 Subject: [PATCH 27/39] fix merge --- .../cpu/kernel/nnacl/infer/decoder_layer_infer.c | 2 +- mindspore/core/ops/core_ops.h | 1 + mindspore/lite/schema/ops.fbs | 2 ++ mindspore/lite/src/common/ops/ops_def.cc | 2 -- .../lite/src/common/ops/populate/custom_populate.cc | 10 ++++++++++ 5 files changed, 14 insertions(+), 3 deletions(-) diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.c index 54e5448f54a..401acdea7ab 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.c +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.c @@ -34,4 +34,4 @@ int DecoderLayerInferShape(const TensorC *const *inputs, size_t inputs_size, Ten return NNACL_OK; } -REG_INFER(DecoderLayer, PrimType_DecoderLayer, DecoderLayerInferShape) +REG_INFER(DecoderLayer, PrimType_Inner_DecoderLayer, DecoderLayerInferShape) diff --git a/mindspore/core/ops/core_ops.h 
b/mindspore/core/ops/core_ops.h index d03560659e9..55400240dd6 100644 --- a/mindspore/core/ops/core_ops.h +++ b/mindspore/core/ops/core_ops.h @@ -1123,6 +1123,7 @@ GVAR_DEF(PrimitivePtr, kPrimSparseApplyProximalGradientDescent, GVAR_DEF(PrimitivePtr, kPrimAdaptiveMaxPool3DGrad, std::make_shared("AdaptiveMaxPool3DGrad")); GVAR_DEF(PrimitivePtr, kPrimChannelShuffle, std::make_shared(kChannelShuffle)); GVAR_DEF(PrimitivePtr, kPrimEncoderLayer, std::make_shared("EncoderLayer")); +GVAR_DEF(PrimitivePtr, kPrimDecoderLayer, std::make_shared("DecoderLayer")); // Comm ops GVAR_DEF(PrimitivePtr, kPrimMirror, std::make_shared("_MirrorOperator")); diff --git a/mindspore/lite/schema/ops.fbs b/mindspore/lite/schema/ops.fbs index 581990ab784..bd6a2cf2eb3 100644 --- a/mindspore/lite/schema/ops.fbs +++ b/mindspore/lite/schema/ops.fbs @@ -232,6 +232,8 @@ union PrimitiveType { Log1p, TensorScatterAdd, ScatterElements, + EncoderLayer, + DecoderLayer, } table Abs { diff --git a/mindspore/lite/src/common/ops/ops_def.cc b/mindspore/lite/src/common/ops/ops_def.cc index b567f7aa16a..9557ffbf7df 100644 --- a/mindspore/lite/src/common/ops/ops_def.cc +++ b/mindspore/lite/src/common/ops/ops_def.cc @@ -232,8 +232,6 @@ OP_TYPE(GroupNormFusion) OP_TYPE(Log1p) OP_TYPE(TensorScatterAdd) OP_TYPE(ScatterElements) -OP_TYPE(EncoderLayer) -OP_TYPE(DecoderLayer) OP_TYPE_DEF_END(PrimitiveType) OP_SCHEMA_DEF(Abs) diff --git a/mindspore/lite/src/common/ops/populate/custom_populate.cc b/mindspore/lite/src/common/ops/populate/custom_populate.cc index 4f855ce999a..e35357ee673 100644 --- a/mindspore/lite/src/common/ops/populate/custom_populate.cc +++ b/mindspore/lite/src/common/ops/populate/custom_populate.cc @@ -107,6 +107,16 @@ OpParameter *PopulateCustomParameter(const void *prim) { memset(param, 0, sizeof(OpParameter)); param->type_ = PrimType_Inner_EncoderLayer; return reinterpret_cast(param); + } else if (type == "DecoderLayer") { + std::cout << "DecoderLayer populate" << std::endl; + auto *param = reinterpret_cast(malloc(sizeof(OpParameter))); + if (param == nullptr) { + MS_LOG(ERROR) << "malloc DecoderLayer failed."; + return nullptr; + } + memset(param, 0, sizeof(OpParameter)); + param->type_ = PrimType_Inner_DecoderLayer; + return reinterpret_cast(param); } else { MS_LOG(ERROR) << "Unsupported custom type: " << type; } -- Gitee From 106ecdeb95972d095762deecf4885bdbda231360 Mon Sep 17 00:00:00 2001 From: batya kroizer Date: Thu, 19 Jan 2023 15:17:04 +0200 Subject: [PATCH 28/39] fix issue --- .../cpu/kernel/nnacl/attention_parameter.h | 2 +- .../kernel/nnacl/decoder_layer_parameter.h | 6 +- .../kernel/nnacl/encoder_layer_parameter.h | 4 +- .../plugin/device/cpu/kernel/nnacl/op_base.h | 2 +- .../core/load_mindir/anf_model_parser.cc | 22 +- mindspore/core/ops/attention.cc | 12 +- mindspore/core/ops/attention.h | 6 +- mindspore/core/ops/decoder_layer.cc | 38 +-- mindspore/core/ops/decoder_layer.h | 23 +- mindspore/core/ops/encoder_layer.cc | 30 +-- mindspore/core/ops/encoder_layer.h | 17 +- mindspore/core/ops/op_name.h | 4 +- mindspore/lite/schema/ops.fbs | 4 +- mindspore/lite/src/common/ops/ops_def.cc | 2 +- .../lite/src/common/ops/ops_func_declare.h | 3 - mindspore/lite/src/common/ops/ops_utils.cc | 2 - .../ops/populate/decoder_layer_populate.cc | 51 ---- .../ops/populate/encoder_layer_populate.cc | 47 ---- .../delegate/tensorrt/op/decoder_tensorrt.cc | 28 +- .../delegate/tensorrt/op/encoder_tensorrt.cc | 24 +- .../delegate/tensorrt/op/mha_tensorrt.cc | 41 +-- .../delegate/tensorrt/tensorrt_utils.h | 2 - 
.../delegate/tensorrt/tensorrt_utils.cc | 41 --- .../litert/delegate/tensorrt/tensorrt_utils.h | 40 --- .../lite/tools/optimizer/common/gllo_utils.cc | 5 +- .../optimizer/fusion/decoder_layer_fusion.cc | 22 +- .../optimizer/fusion/decoder_layer_fusion.h | 4 +- .../optimizer/fusion/encoder_layer_fusion.cc | 18 +- .../optimizer/fusion/encoder_layer_fusion.h | 4 +- .../fusion/multi_head_attention_fusion.cc | 5 +- .../fusion/multi_head_attention_fusion.h | 2 +- trc/transformer/convert_fp32.sh | 10 +- trc/transformer/deploy.sh | 4 +- trc/transformer/models.txt | 34 +-- trc/transformer/train_transformer_export.py | 254 +++++++++++++++++- 35 files changed, 397 insertions(+), 416 deletions(-) delete mode 100644 mindspore/lite/src/common/ops/populate/decoder_layer_populate.cc delete mode 100644 mindspore/lite/src/common/ops/populate/encoder_layer_populate.cc diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/attention_parameter.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/attention_parameter.h index bcfb0ab38a7..aabd3121c43 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/attention_parameter.h +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/attention_parameter.h @@ -23,7 +23,7 @@ typedef struct AttentionParameter { int head_num_; int head_size_; bool cross_; - bool scalar_; + float sclae_; } AttentionParameter; typedef struct RelativePositionAttentionParameter { diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/decoder_layer_parameter.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/decoder_layer_parameter.h index fd0fb8c7e6b..4c3254fc2f5 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/decoder_layer_parameter.h +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/decoder_layer_parameter.h @@ -30,9 +30,9 @@ typedef struct DecoderLayerParameter { int ffn_hidden_size_; bool position_bias1_; bool position_bias2_; - bool scalar1; - bool scalar2; - char *act; + float scale1_; + float scale2_; + ActType act_type_; } DecoderLayerParameter; #endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_NNACL_DECODER_LAYER_PARAMETER_H_ diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/encoder_layer_parameter.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/encoder_layer_parameter.h index df6c97d132f..4f09f5e32d4 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/encoder_layer_parameter.h +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/encoder_layer_parameter.h @@ -28,8 +28,8 @@ typedef struct EncoderLayerParameter { float eps_layernorm2_; int ffn_hidden_size_; bool position_bias_; - bool scalar; - const char *act; + float scale_; + ActType act_type_; } EncoderLayerParameter; #endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_NNACL_ENCODER_LAYER_PARAMETER_H_ diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h index f2483e9c4c5..2ed8cb8cdbd 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h @@ -661,7 +661,7 @@ typedef struct QuantMulArg { } QuantMulArg; typedef enum ReductionType { Reduction_Sum, Reduction_Mean, Reduction_None } ReductionType; -typedef enum ActType { ActType_No, ActType_Relu, ActType_Sigmod, ActType_Relu6, ActType_Prelu } ActType; +typedef enum ActType { ActType_No, ActType_Relu, ActType_Sigmod, ActType_Relu6, ActType_Prelu, ActType_Gelu } ActType; typedef enum PadMode { Pad_pad, Pad_same, Pad_valid } PadMode; typedef enum RoundingMode { Rounding_No, 
Rounding_Away_from_zero, Rounding_Up } RoundingMode; typedef enum CalFixedMultiplierMode { diff --git a/mindspore/core/load_mindir/anf_model_parser.cc b/mindspore/core/load_mindir/anf_model_parser.cc index 13873e302d0..997dc89d2cb 100644 --- a/mindspore/core/load_mindir/anf_model_parser.cc +++ b/mindspore/core/load_mindir/anf_model_parser.cc @@ -25,15 +25,6 @@ #include #include #include -#ifdef __has_include -#if __has_include() -#include -namespace fs = std ::filesystem; -#else -#include -namespace fs = std ::experimental ::filesystem; -#endif -#endif #include "ir/tensor.h" #include "ir/param_info.h" #include "ir/map_tensor.h" @@ -686,18 +677,7 @@ bool MSANFModelParser::GetTensorDataFromExternal(const mind_ir::TensorProto &ten if (it != tenor_data_.end()) { data = it->second.get(); } else { - fs ::path path{mindir_path_}; - std::string convv = "convv"; - std::string file; - std::cout << "mindir_path_=" << mindir_path_ << std::endl; - if (mindir_path_.find(convv) != std::string::npos) { - file = path.root_directory().string() + path.stem().string() + "_variables" + "/" + - tensor_proto.external_data().location(); - std::cout << "file=" << file << std::endl; - } else { - file = mindir_path_ + "/" + tensor_proto.external_data().location(); - std::cout << "file=" << file << std::endl; - } + std::string file = mindir_path_ + "/" + tensor_proto.external_data().location(); if (mindir_dec_key_ != nullptr) { size_t plain_len; auto plain_data = Decrypt(&plain_len, file, mindir_dec_key_, mindir_key_size_, mindir_dec_mode_); diff --git a/mindspore/core/ops/attention.cc b/mindspore/core/ops/attention.cc index cf03ca26f3b..715bb985e28 100644 --- a/mindspore/core/ops/attention.cc +++ b/mindspore/core/ops/attention.cc @@ -34,7 +34,7 @@ void Attention::set_cross(bool cross) { (void)this->AddAttr(kCross, api::MakeVal void Attention::set_position_bias(bool position_bias) { (void)this->AddAttr(kPositionBias, api::MakeValue(position_bias)); } -void Attention::set_scalar(bool scalar) { (void)this->AddAttr(kScalar, api::MakeValue(scalar)); } +void Attention::set_scale(float scale) { (void)this->AddAttr(kScale, api::MakeValue(scale)); } int64_t Attention::get_head_num() const { auto value_ptr = this->GetAttr(kAttentionNumHeads); return GetValue(value_ptr); @@ -54,16 +54,16 @@ bool Attention::get_position_bias() const { auto value_ptr = this->GetAttr(kPositionBias); return GetValue(value_ptr); } -bool Attention::get_scalar() const { - auto value_ptr = this->GetAttr(kScalar); - return GetValue(value_ptr); +float Attention::get_scale() const { + auto value_ptr = this->GetAttr(kScale); + return GetValue(value_ptr); } -void Attention::Init(int64_t head_num, int64_t head_size, bool position_bias, bool cross, bool scalar) { +void Attention::Init(int64_t head_num, int64_t head_size, bool position_bias, bool cross, float scale) { this->set_head_num(head_num); this->set_head_size(head_size); this->set_cross(cross); this->set_position_bias(position_bias); - this->set_scalar(scalar); + this->set_scale(scale); } REGISTER_PRIMITIVE_C(kNameAttention, Attention); } // namespace mindspore::ops diff --git a/mindspore/core/ops/attention.h b/mindspore/core/ops/attention.h index 0d0cccc5d81..838c04a3817 100644 --- a/mindspore/core/ops/attention.h +++ b/mindspore/core/ops/attention.h @@ -41,17 +41,17 @@ class MIND_API Attention : public BaseOperator { /// \param[in] head_size Define size per head. /// \param[in] cross Define is cross attention. Default false. /// \param[in] position_bias Define is position bias attention. 
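// Usage sketch for the reworked Attention interface: the old bool `scalar` flag is now a
// float softmax scale. The argument values below are illustrative only; 1/sqrt(head_size)
// is the conventional scaled-dot-product factor, while the op itself defaults to 1.0f.
//   ops::Attention attn;
//   attn.Init(/*head_num=*/12, /*head_size=*/64, /*position_bias=*/false,
//             /*cross=*/false, /*scale=*/1.0f / sqrtf(64.0f));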
- void Init(int64_t head_num, int64_t head_size, bool position_bias, bool cross = false, bool scalar = true); + void Init(int64_t head_num, int64_t head_size, bool position_bias, bool cross = false, float scale = 1.0f); void set_head_num(int64_t head_num); void set_head_size(int64_t head_size); void set_cross(bool cross); void set_position_bias(bool position_bias); - void set_scalar(bool scalar); + void set_scale(float scale); int64_t get_head_num() const; int64_t get_head_size() const; bool get_cross() const; bool get_position_bias() const; - bool get_scalar() const; + float get_scale() const; }; } // namespace ops } // namespace mindspore diff --git a/mindspore/core/ops/decoder_layer.cc b/mindspore/core/ops/decoder_layer.cc index 771d087c410..dbe996b8a4a 100644 --- a/mindspore/core/ops/decoder_layer.cc +++ b/mindspore/core/ops/decoder_layer.cc @@ -52,12 +52,9 @@ void DecoderLayer::set_position_bias1(bool position_bias1) { void DecoderLayer::set_position_bias2(bool position_bias2) { (void)this->AddAttr(kDecoderLayerPositionBias2, api::MakeValue(position_bias2)); } -void DecoderLayer::set_scalar1(bool scalar1) { (void)this->AddAttr(kDecoderLayerScalar1, api::MakeValue(scalar1)); } -void DecoderLayer::set_scalar2(bool scalar2) { (void)this->AddAttr(kDecoderLayerScalar2, api::MakeValue(scalar2)); } -void DecoderLayer::set_act(std::string act) { - MS_ASSERT(act == 'relu' || act == 'gelu' || act == 'no_act'); - (void)this->AddAttr(kActivation, api::MakeValue(act)); -} +void DecoderLayer::set_scale1(float scale1) { (void)this->AddAttr(kDecoderLayerScale1, api::MakeValue(scale1)); } +void DecoderLayer::set_scale2(float scale2) { (void)this->AddAttr(kDecoderLayerScale2, api::MakeValue(scale2)); } +void DecoderLayer::set_act_type(ActType act_type) { (void)this->AddAttr(kActivationType, api::MakeValue(act_type)); } int64_t DecoderLayer::get_head_num() const { auto value_ptr = this->GetAttr(kDecoderLayerNumHeads); return GetValue(value_ptr); @@ -96,22 +93,25 @@ bool DecoderLayer::get_position_bias2() const { auto value_ptr = this->GetAttr(kDecoderLayerPositionBias2); return GetValue(value_ptr); } -bool DecoderLayer::get_scalar1() const { - auto value_ptr = this->GetAttr(kDecoderLayerScalar1); - return GetValue(value_ptr); +float DecoderLayer::get_scale1() const { + auto value_ptr = this->GetAttr(kDecoderLayerScale1); + return GetValue(value_ptr); } -bool DecoderLayer::get_scalar2() const { - auto value_ptr = this->GetAttr(kDecoderLayerScalar2); - return GetValue(value_ptr); +float DecoderLayer::get_scale2() const { + auto value_ptr = this->GetAttr(kDecoderLayerScale2); + return GetValue(value_ptr); } -std::string DecoderLayer::get_act() const { - auto value_ptr = this->GetAttr(kActivation); - return GetValue(value_ptr); +ActType DecoderLayer::get_act_type() const { + auto value_ptr = GetAttr(ActType); + if (value_ptr == nullptr) { + return ActType::ActType_No; + } + return ActivationType(GetValue(value_ptr)); } void DecoderLayer::Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, float eps_layernorm3, int64_t ffn_hidden_size, bool position_bias1, bool position_bias2, - bool post_layernorm = false, bool scalar1, bool scalar2, std::string act) { + bool post_layernorm, float scale1, float scale2, ActType act_type) { this->set_head_num(head_num); this->set_head_size(head_size); this->set_post_layernorm(post_layernorm); @@ -121,9 +121,9 @@ void DecoderLayer::Init(int64_t head_num, int64_t head_size, float eps_layernorm this->set_ffn_hidden_size(ffn_hidden_size); 
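// Reference sketch for the act_type round-trip, assuming the attribute is stored as an
// integer under kActivationType (which is what set_act_type above writes):
//   auto value_ptr = this->GetAttr(kActivationType);
//   ActType act = (value_ptr == nullptr)
//                   ? ActType::ActType_No
//                   : static_cast<ActType>(GetValue<int64_t>(value_ptr));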
this->set_position_bias1(position_bias1); this->set_position_bias2(position_bias2); - this->set_act(act); - this->set_scalar1(scalar1); - this->set_scalar2(scalar2); + this->set_act_type(act_type); + this->set_scale1(scale1); + this->set_scale2(scale2); } REGISTER_PRIMITIVE_C(kNameDecoderLayer, DecoderLayer); } // namespace mindspore::ops diff --git a/mindspore/core/ops/decoder_layer.h b/mindspore/core/ops/decoder_layer.h index 82de065e3e6..2be3a12074e 100644 --- a/mindspore/core/ops/decoder_layer.h +++ b/mindspore/core/ops/decoder_layer.h @@ -19,6 +19,7 @@ #include #include #include +#include "nnacl/op_base.h" #include "ops/base_operator.h" #include "mindapi/base/types.h" @@ -66,12 +67,12 @@ class MIND_API DecoderLayer : public BaseOperator { /// \param[in] ffn_hidden_size Define ffn hidden size. /// \param[in] position_bias1 Define position_bias1. /// \param[in] position_bias2 Define position_bias2. - /// \param[in] scalar1 Define scalar1. - /// \param[in] scalar2 Define scalar2. - /// \param[in] act Define act + /// \param[in] scale1 Define scalar1. + /// \param[in] scale2 Define scalar2. + /// \param[in] act_type Define act_type. void Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, float eps_layernorm3, - int64_t ffn_hidden_size, bool position_bias1, bool position_bias2, bool post_layernorm, bool scalar1 = true, - bool scalar2 = true, std::string act = "gelu"); + int64_t ffn_hidden_size, bool position_bias1, bool position_bias2, bool post_layernorm, float scale1 = 1.0f, + float scale2 = 1.0f, ActType act_type = ActType::ActType_Gelu); void set_head_num(int64_t head_num); void set_head_size(int64_t head_size); void set_post_layernorm(bool post_layernorm); @@ -81,9 +82,9 @@ class MIND_API DecoderLayer : public BaseOperator { void set_ffn_hidden_size(int64_t ffn_hidden_size); void set_position_bias1(bool position_bias1); void set_position_bias2(bool position_bias2); - void set_scalar1(bool scalar1); - void set_scalar2(bool scalar2); - void set_act(std::string act); + void set_scale1(float scale1); + void set_scale2(float scale2); + void set_act_type(ActType act_type); int64_t get_head_num() const; int64_t get_head_size() const; bool get_post_layernorm() const; @@ -93,9 +94,9 @@ class MIND_API DecoderLayer : public BaseOperator { int64_t get_ffn_hidden_size() const; bool get_position_bias1() const; bool get_position_bias2() const; - bool get_scalar1() const; - bool get_scalar2() const; - std::string get_act() const; + float get_scale1() const; + float get_scale2() const; + ActType get_act_type() const; }; } // namespace ops } // namespace mindspore diff --git a/mindspore/core/ops/encoder_layer.cc b/mindspore/core/ops/encoder_layer.cc index f82f6e73ea0..6faec011525 100644 --- a/mindspore/core/ops/encoder_layer.cc +++ b/mindspore/core/ops/encoder_layer.cc @@ -46,11 +46,8 @@ void EncoderLayer::set_ffn_hidden_size(int64_t ffn_hidden_size) { void EncoderLayer::set_position_bias(bool position_bias) { (void)this->AddAttr(kPositionBias, api::MakeValue(position_bias)); } -void EncoderLayer::set_scalar(bool scalar) { (void)this->AddAttr(kScalar, api::MakeValue(scalar)); } -void EncoderLayer::set_act(std::string act) { - MS_ASSERT(act == 'relu' || act == 'gelu' || act == 'no_act'); - (void)this->AddAttr(kActivation, api::MakeValue(act)); -} +void EncoderLayer::set_scale(float scale) { (void)this->AddAttr(kScale, api::MakeValue(scale)); } +void EncoderLayer::set_act_type(ActType act_type) { (void)this->AddAttr(kActivationType, api::MakeValue(act_type));} int64_t 
EncoderLayer::get_head_num() const { auto value_ptr = this->GetAttr(kEncoderLayerNumHeads); return GetValue(value_ptr); @@ -81,17 +78,20 @@ bool EncoderLayer::get_position_bias() const { auto value_ptr = this->GetAttr(kPositionBias); return GetValue(value_ptr); } -bool EncoderLayer::get_scalar() const { - auto value_ptr = this->GetAttr(kScalar); - return GetValue(value_ptr); +float EncoderLayer::get_scale() const { + auto value_ptr = this->GetAttr(kScale); + return GetValue(value_ptr); } -std::string EncoderLayer::get_act() const { - auto value_ptr = this->GetAttr(kActivation); - return GetValue(value_ptr); +ActType EncoderLayer::get_act_type() const { + auto value_ptr = GetAttr(ActType); + if (value_ptr == nullptr) { + return ActType::ActType_No; + } + return ActivationType(GetValue(value_ptr)); } void EncoderLayer::Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, - int64_t ffn_hidden_size, bool position_bias, bool post_layernorm = false, bool scalar = true, - std::string act = "gelu") { + int64_t ffn_hidden_size, bool position_bias, bool post_layernorm, float scale, + ActType act_type) { this->set_head_num(head_num); this->set_head_size(head_size); this->set_post_layernorm(post_layernorm); @@ -99,8 +99,8 @@ void EncoderLayer::Init(int64_t head_num, int64_t head_size, float eps_layernorm this->set_eps_layernorm2(eps_layernorm2); this->set_ffn_hidden_size(ffn_hidden_size); this->set_position_bias(position_bias); - this->set_act(act); - this->set_scalar(scalar); + this->set_act_type(act_type); + this->set_scale(scale); } REGISTER_PRIMITIVE_C(kNameEncoderLayer, EncoderLayer); } // namespace mindspore::ops diff --git a/mindspore/core/ops/encoder_layer.h b/mindspore/core/ops/encoder_layer.h index afb73440fc5..b0466be467a 100644 --- a/mindspore/core/ops/encoder_layer.h +++ b/mindspore/core/ops/encoder_layer.h @@ -22,6 +22,7 @@ #include "ops/base_operator.h" #include "mindapi/base/types.h" +#include "nnacl/op_base.h" namespace mindspore { namespace ops { @@ -42,11 +43,11 @@ class MIND_API EncoderLayer : public BaseOperator { /// \param[in] eps_layernorm1 Define eps layernorm1. /// \param[in] eps_layernorm2 Define eps layernorm2. /// \param[in] ffn_hidden_size Define ffn hidden size. - /// \param[in] position_bias Define ffn. - /// \param[in] scalar Define scalar. - /// \param[in] act Define act. + /// \param[in] position_bias Define position_bias. + /// \param[in] scale Define scale. + /// \param[in] act_type Define act_type. 
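// Illustrative call with the reworked EncoderLayer signature (argument values are examples
// only; scale defaults to 1.0f and act_type to ActType_Gelu when omitted):
//   ops::EncoderLayer encoder;
//   encoder.Init(/*head_num=*/8, /*head_size=*/64, /*eps_layernorm1=*/1e-5f,
//                /*eps_layernorm2=*/1e-5f, /*ffn_hidden_size=*/2048,
//                /*position_bias=*/false, /*post_layernorm=*/false,
//                /*scale=*/1.0f / sqrtf(64.0f), /*act_type=*/ActType::ActType_Gelu);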
void Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, int64_t ffn_hidden_size, - bool position_bias, bool post_layernorm, bool scalar, std::string act); + bool position_bias, bool post_layernorm, float scale = 1.0f, ActType act_type = ActType::ActType_Gelu); void set_head_num(int64_t head_num); void set_head_size(int64_t head_size); void set_post_layernorm(bool post_layernorm); @@ -54,8 +55,8 @@ class MIND_API EncoderLayer : public BaseOperator { void set_eps_layernorm2(float eps_layernorm2); void set_ffn_hidden_size(int64_t ffn_hidden_size); void set_position_bias(bool position_bias); - void set_scalar(bool scalar); - void set_act(std::string act); + void set_scale(float scale); + void set_act_type(ActType act_type); int64_t get_head_num() const; int64_t get_head_size() const; bool get_post_layernorm() const; @@ -63,8 +64,8 @@ class MIND_API EncoderLayer : public BaseOperator { float get_eps_layernorm2() const; int64_t get_ffn_hidden_size() const; bool get_position_bias() const; - bool get_scalar() const; - std::string get_act() const; + float get_scale() const; + ActType get_act_type() const; }; } // namespace ops } // namespace mindspore diff --git a/mindspore/core/ops/op_name.h b/mindspore/core/ops/op_name.h index 1f8d61e0e4e..a76770f8ae9 100644 --- a/mindspore/core/ops/op_name.h +++ b/mindspore/core/ops/op_name.h @@ -395,8 +395,8 @@ constexpr auto kDecoderLayerEpsLayerNorm2 = "eps_layernorm2"; constexpr auto kDecoderLayerEpsLayerNorm3 = "eps_layernorm3"; constexpr auto kDecoderLayerPositionBias1 = "position_bias1"; constexpr auto kDecoderLayerPositionBias2 = "position_bias2"; -constexpr auto kDecoderLayerScalar1 = "scalar"; -constexpr auto kDecoderLayerScalar2 = "scalar"; +constexpr auto kDecoderLayerScale1 = "scalar"; +constexpr auto kDecoderLayerScale2 = "scalar"; constexpr auto kPositionBias = "position_bias"; constexpr auto KExclusive = "exclusive"; constexpr auto KReverse = "reverse"; diff --git a/mindspore/lite/schema/ops.fbs b/mindspore/lite/schema/ops.fbs index bd6a2cf2eb3..3a24ac6fc44 100644 --- a/mindspore/lite/schema/ops.fbs +++ b/mindspore/lite/schema/ops.fbs @@ -232,8 +232,6 @@ union PrimitiveType { Log1p, TensorScatterAdd, ScatterElements, - EncoderLayer, - DecoderLayer, } table Abs { @@ -397,7 +395,7 @@ table Attention { head_num: long; head_size: long; cross: bool; - scalar: bool; + scale: float; } table Conv2DBackpropFilterFusion { diff --git a/mindspore/lite/src/common/ops/ops_def.cc b/mindspore/lite/src/common/ops/ops_def.cc index 9557ffbf7df..2de89604156 100644 --- a/mindspore/lite/src/common/ops/ops_def.cc +++ b/mindspore/lite/src/common/ops/ops_def.cc @@ -395,7 +395,7 @@ OP_SCHEMA_DEF(Attention) OP_ATTR(head_num, long) OP_ATTR(head_size, long); OP_ATTR(cross, bool) -OP_ATTR(scalar, bool) +OP_ATTR(scale, float) OP_SCHEMA_DEF_END(Attention) OP_SCHEMA_DEF(Conv2DBackpropFilterFusion) diff --git a/mindspore/lite/src/common/ops/ops_func_declare.h b/mindspore/lite/src/common/ops/ops_func_declare.h index 2f9aac7456d..3b151b36caa 100644 --- a/mindspore/lite/src/common/ops/ops_func_declare.h +++ b/mindspore/lite/src/common/ops/ops_func_declare.h @@ -492,9 +492,6 @@ FUNC_MSOP2SCHEMAOP_DECLARE(GroupNormFusion) FUNC_MSOP2SCHEMAOP_DECLARE(Log1p) FUNC_MSOP2SCHEMAOP_DECLARE(TensorScatterAdd) FUNC_MSOP2SCHEMAOP_DECLARE(ScatterElements) -FUNC_MSOP2SCHEMAOP_DECLARE(EncoderLayer) -FUNC_MSOP2SCHEMAOP_DECLARE(DecoderLayer) - #endif } // namespace mindspore::lite::ops #else diff --git a/mindspore/lite/src/common/ops/ops_utils.cc 
b/mindspore/lite/src/common/ops/ops_utils.cc index 393bc8ef6ef..37d5f672a16 100644 --- a/mindspore/lite/src/common/ops/ops_utils.cc +++ b/mindspore/lite/src/common/ops/ops_utils.cc @@ -273,8 +273,6 @@ REG_MINDSPORE_OPERATOR(GroupNormFusion) REG_MINDSPORE_OPERATOR(Log1p) REG_MINDSPORE_OPERATOR(TensorScatterAdd) REG_MINDSPORE_OPERATOR(ScatterElements) -REG_MINDSPORE_OPERATOR(EncoderLayer) -REG_MINDSPORE_OPERATOR(DecoderLayer) } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/common/ops/populate/decoder_layer_populate.cc b/mindspore/lite/src/common/ops/populate/decoder_layer_populate.cc deleted file mode 100644 index 13c580c7971..00000000000 --- a/mindspore/lite/src/common/ops/populate/decoder_layer_populate.cc +++ /dev/null @@ -1,51 +0,0 @@ -/** - * Copyright 2022 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -// #include "src/common/ops/populate/populate_register.h" -// #include "nnacl/decoder_layer_parameter.h" - -// using mindspore::schema::PrimitiveType_DecoderLayer; - -// namespace mindspore { -// namespace lite { -// OpParameter *PopulateDecoderLayerParameter(const void *prim) { -// auto primitive = static_cast(prim); -// MS_CHECK_TRUE_RET(primitive != nullptr, nullptr); -// auto value = primitive->value_as_DecoderLayer(); -// MS_CHECK_TRUE_MSG(value != nullptr, nullptr, "value is nullptr."); -// auto *param = reinterpret_cast(malloc(sizeof(DecoderLayerParameter))); -// if (param == nullptr) { -// MS_LOG(ERROR) << "malloc DecoderLayerParameter failed."; -// return nullptr; -// } -// memset(param, 0, sizeof(DecoderLayerParameter)); -// param->op_parameter_.type_ = primitive->value_type(); -// param->head_num_ = value->head_num(); -// param->head_size_ = value->head_size(); -// param->post_layernorm_ = value->post_layernorm(); -// param->eps_layernorm1_ = value->eps_layernorm1(); -// param->eps_layernorm2_ = value->eps_layernorm2(); -// param->eps_layernorm3_ = value->eps_layernorm3(); -// param->position_bias1_ = value->position_bias1(); -// param->position_bias2_ = value->position_bias2(); -// param->scalar1 = value->scalar1(); -// param->scalar2 = value->scalar2(); -// // param->act = value->act()->c_str(); -// return reinterpret_cast(param); -// } - -// REG_POPULATE(PrimitiveType_DecoderLayer, PopulateDecoderLayerParameter, SCHEMA_CUR) -// } // namespace lite -// } // namespace mindspore diff --git a/mindspore/lite/src/common/ops/populate/encoder_layer_populate.cc b/mindspore/lite/src/common/ops/populate/encoder_layer_populate.cc deleted file mode 100644 index 79b350bda8c..00000000000 --- a/mindspore/lite/src/common/ops/populate/encoder_layer_populate.cc +++ /dev/null @@ -1,47 +0,0 @@ -// /** -// * Copyright 2022 Huawei Technologies Co., Ltd -// * -// * Licensed under the Apache License, Version 2.0 (the "License"); -// * you may not use this file except in compliance with the License. 
-// * You may obtain a copy of the License at -// * -// * http://www.apache.org/licenses/LICENSE-2.0 -// * -// * Unless required by applicable law or agreed to in writing, software -// * distributed under the License is distributed on an "AS IS" BASIS, -// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// * See the License for the specific language governing permissions and -// * limitations under the License. -// */ -// #include "src/common/ops/populate/populate_register.h" -// #include "nnacl/encoder_layer_parameter.h" - -// using mindspore::schema::PrimitiveType_EncoderLayer; - -// namespace mindspore { -// namespace lite { -// OpParameter *PopulateEncoderLayerParameter(const void *prim) { -// auto primitive = static_cast(prim); -// MS_CHECK_TRUE_RET(primitive != nullptr, nullptr); -// auto value = primitive->value_as_EncoderLayer(); -// MS_CHECK_TRUE_MSG(value != nullptr, nullptr, "value is nullptr."); -// auto *param = reinterpret_cast(malloc(sizeof(EncoderLayerParameter))); -// if (param == nullptr) { -// MS_LOG(ERROR) << "malloc EncoderLayerParameter failed."; -// return nullptr; -// } -// memset(param, 0, sizeof(EncoderLayerParameter)); -// param->op_parameter_.type_ = primitive->value_type(); -// param->head_num_ = value->head_num(); -// param->head_size_ = value->head_size(); -// param->layernorm_post_ = value->post_layernorm(); -// param->eps_layernorm1_ = value->eps_layernorm1(); -// param->eps_layernorm2_ = value->eps_layernorm2(); -// param->ffn_hidden_size_ = value->ffn_hidden_size(); -// param->position_bias_ = value->position_bias(); -// return reinterpret_cast(param); -// } - -// REG_POPULATE(PrimitiveType_EncoderLayer, PopulateEncoderLayerParameter, SCHEMA_CUR) -// } // namespace lite -// } // namespace mindspore diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc index e00cc13cd4f..42ef50c5d25 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc @@ -108,8 +108,6 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.eps3 = decoder_op->get_eps_layernorm3(); params.ffn_hidden_size = decoder_op->get_ffn_hidden_size(); params.ffn_fp16 = is_ffn_fp16_; - params.cublas_handle = GetCublasHandle(); - params.attn1.head_num = params.head_num; params.attn1.head_size = params.head_size; params.attn1.hidden_size = params.hidden_size; @@ -117,8 +115,7 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.attn1.qkv_bias = !params.attn1.position_bias; params.attn1.projection_bias = !params.attn1.position_bias; params.attn1.is_cross = false; - params.attn1.cublas_handle = GetCublasHandle(); - params.attn1.scalar = decoder_op->get_scalar1(); + params.attn1.scale = decoder_op->get_scale1(); params.attn1.mask = true; params.attn2.head_num = params.head_num; params.attn2.head_size = params.head_size; @@ -127,10 +124,9 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.attn2.qkv_bias = !params.attn2.position_bias; params.attn2.projection_bias = !params.attn2.position_bias; params.attn2.is_cross = true; - params.attn2.cublas_handle = GetCublasHandle(); - params.attn2.scalar = decoder_op->get_scalar2(); + params.attn2.scale = decoder_op->get_scale2(); params.attn2.mask = true; - params.is_act = decoder_op->get_act().c_str(); + params.act_type = decoder_op->get_act_type(); params.has_beta = !params.attn1.position_bias; 
params.has_bias = !params.attn1.position_bias; params.ffn_bias = !params.attn1.position_bias; @@ -193,11 +189,8 @@ template int DecoderPlugin::RunCudaDecoder(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream, cublasGemmAlgo_t algoId) { - params_.stream = stream; params_.algo = algoId; - params_.attn1.stream = stream; params_.attn1.algo = algoId; - params_.attn2.stream = stream; params_.attn2.algo = algoId; void *inputs_forward[num_of_inputs_]; for (int i = 0; i < num_of_inputs_; i++) { @@ -205,7 +198,7 @@ int DecoderPlugin::RunCudaDecoder(const nvinfer1::PluginTensorDesc *inputDesc, } void *outputs_forward[] = {outputs[0]}; fastertransformer::forwardDecoder(inputs_forward, num_of_inputs_, outputs_forward, num_of_outputs_, ¶ms_, - workspace); + workspace, GetCublasHandle(), stream); return RET_OK; } @@ -239,9 +232,9 @@ void DecoderPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, size_t DecoderPlugin::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const noexcept { if (compute_type_ == RuntimePrecisionMode_FP16) { - return fastertransformer::GetDecoderLayerWorkspaceSizeByOptAllocator(¶ms_); + return fastertransformer::GetDecoderLayerWorkspaceSize(¶ms_); } else { - return fastertransformer::GetDecoderLayerWorkspaceSizeByOptAllocator(¶ms_); + return fastertransformer::GetDecoderLayerWorkspaceSize(¶ms_); } } @@ -251,13 +244,8 @@ nvinfer1::DimsExprs DecoderPlugin::getOutputDimensions(int32_t index, const nvin if (index == 0) { int num_dims = inputs[0].nbDims; dims.nbDims = num_dims; - if (num_dims == INPUT_SIZE2) { - dims.d[0] = exprBuilder.constant(inputs[0].d[0]->getConstantValue()); - dims.d[1] = exprBuilder.constant(inputs[0].d[1]->getConstantValue()); - } else if (num_dims == INPUT_SIZE3) { - dims.d[0] = exprBuilder.constant(inputs[0].d[0]->getConstantValue()); - dims.d[1] = exprBuilder.constant(inputs[0].d[1]->getConstantValue()); - dims.d[2] = exprBuilder.constant(inputs[0].d[2]->getConstantValue()); + for(int i = 0; i < num_dims; i++ ) { + dims.d[i] = exprBuilder.constant(inputs[i].d[i]->getConstantValue()); } } return dims; diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc index c869cc0315f..a46848734ce 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc @@ -109,11 +109,9 @@ int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.eps2 = encoder_op->get_eps_layernorm2(); params.ffn_hidden_size = encoder_op->get_ffn_hidden_size(); params.ffn_fp16 = is_ffn_fp16_; - params.cublas_handle = GetCublasHandle(); params.hidden_size = params.head_num * params.head_size; params.attn.head_num = encoder_op->get_head_num(); params.attn.head_size = encoder_op->get_head_size(); - params.attn.cublas_handle = GetCublasHandle(); params.attn.hidden_size = params.head_num * params.head_size; params.attn.is_cross = false; params.attn.position_bias = encoder_op->get_position_bias(); @@ -123,8 +121,8 @@ int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.has_bias = !params.attn.position_bias; params.ffn_bias = !params.attn.position_bias; params.attn.mask = true; - params.is_act = encoder_op->get_act().c_str(); - params.attn.scalar = encoder_op->get_scalar(); + 
params.act_type = encoder_op->get_act_type(); + params.attn.scale = encoder_op->get_scale(); auto compute_type = runtime_->GetRuntimePrecisionMode(); if (is_ffn_fp16_) { size_t start_fp16 = (params.layernorm_post) ? C7NUM : C9NUM; @@ -189,9 +187,7 @@ template int EncoderPlugin::RunCudaEncoder(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream, cublasGemmAlgo_t algoId) { - params_.stream = stream; params_.algo = algoId; - params_.attn.stream = stream; params_.attn.algo = algoId; void *inputs_forward[num_of_inputs_]; for (int i = 0; i < num_of_inputs_; i++) { @@ -199,7 +195,7 @@ int EncoderPlugin::RunCudaEncoder(const nvinfer1::PluginTensorDesc *inputDesc, } void *outputs_forward[] = {outputs[0]}; fastertransformer::forwardEncoder(inputs_forward, num_of_inputs_, outputs_forward, num_of_outputs_, ¶ms_, - workspace); + workspace, GetCublasHandle(), stream); return RET_OK; } @@ -226,14 +222,13 @@ void EncoderPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, params_.attn.tgt_seq_len = request_tgt_seq_len; num_of_inputs_ = nbInputs; num_of_outputs_ = nbOutputs; - if (num_of_inputs_ == C13NUM) params_.attn.mask = false; } size_t EncoderPlugin::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const noexcept { if (compute_type_ == RuntimePrecisionMode_FP16) { - return fastertransformer::GetEncoderLayerWorkspaceSizeByOptAllocator(¶ms_); + return fastertransformer::GetEncoderLayerWorkspaceSize(¶ms_); } else { - return fastertransformer::GetEncoderLayerWorkspaceSizeByOptAllocator(¶ms_); + return fastertransformer::GetEncoderLayerWorkspaceSize(¶ms_); } } @@ -243,13 +238,8 @@ nvinfer1::DimsExprs EncoderPlugin::getOutputDimensions(int32_t index, const nvin if (index == 0) { int num_dims = inputs[0].nbDims; dims.nbDims = num_dims; - if (num_dims == INPUT_SIZE2) { - dims.d[0] = exprBuilder.constant(inputs[0].d[0]->getConstantValue()); - dims.d[1] = exprBuilder.constant(inputs[0].d[1]->getConstantValue()); - } else if (num_dims == INPUT_SIZE3) { - dims.d[0] = exprBuilder.constant(inputs[0].d[0]->getConstantValue()); - dims.d[1] = exprBuilder.constant(inputs[0].d[1]->getConstantValue()); - dims.d[kTwo] = exprBuilder.constant(inputs[0].d[kTwo]->getConstantValue()); + for(int i = 0; i < num_dims; i++ ) { + dims.d[i] = exprBuilder.constant(inputs[i].d[i]->getConstantValue()); } } return dims; diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc index 6379bdcfff3..a0fa6627247 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc @@ -76,7 +76,6 @@ int MhaTensorRT::AddInnerOp(TensorRTContext *ctx) { params.head_num = head_number; params.head_size = head_size; params.hidden_size = head_number * head_size; - params.cublas_handle = GetCublasHandle(); params.qkv_bias = !is_position_bias; params.projection_bias = !is_position_bias; params.is_cross = is_cross; @@ -99,37 +98,7 @@ int MhaTensorRT::AddInnerOp(TensorRTContext *ctx) { nvinfer1::ITensor *attn_tensor = mha_layer->getOutput(0); #ifndef TEST_ ctx->RegisterTensor(ITensorHelper{attn_tensor, Format::NCHW, true}, out_tensors_[0].Name()); -#else /* TEST_ */ - ctx->RegisterTensor(ITensorHelper{attn_tensor, Format::NCHW, true}, out_tensors_[0].Name() + "attn"); 
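For reference, a minimal standalone sketch of the collapsed shape-inference pattern the encoder/decoder plugins adopt above: output 0 simply mirrors the shape of the first input, so only inputs[0] needs to be indexed, whatever the rank. This is an illustration against the TensorRT dynamic-shape plugin API, not the actual method from the patch:

#include <NvInfer.h>

// Illustrative only: output 0 copies the shape of input 0 for any rank.
// getConstantValue() assumes build-time constant dimensions; dynamic dims
// would have to be forwarded as expressions instead.
static nvinfer1::DimsExprs CopyInput0Dims(const nvinfer1::DimsExprs *inputs,
                                          nvinfer1::IExprBuilder &exprBuilder) {
  nvinfer1::DimsExprs dims;
  dims.nbDims = inputs[0].nbDims;
  for (int i = 0; i < dims.nbDims; i++) {
    dims.d[i] = exprBuilder.constant(inputs[0].d[i]->getConstantValue());
  }
  return dims;
}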
-#endif /* TEST_ */ this->layer_ = mha_layer; -#ifdef TEST_ - auto weight_projection = input(ctx, 4).trt_tensor_; - auto bias_projection = input(ctx, 6).trt_tensor_; -#endif /* TEST_ */ - -#ifdef TEST_ - auto matmul_layer = ctx->network()->addMatrixMultiply(*attn_tensor, nvinfer1::MatrixOperation::kNONE, - *weight_projection, nvinfer1::MatrixOperation::kNONE); - if (matmul_layer == nullptr) { - MS_LOG(ERROR) << "failed to add matmul layer"; - return RET_ERROR; - } - matmul_layer->setName((op_name_ + "_matmul").c_str()); - auto matmul_tensor = matmul_layer->getOutput(0); - auto shuffle_layer = ctx->network()->addShuffle(*bias_projection); - const auto size = bias_projection->getDimensions().d[0]; - shuffle_layer->setReshapeDimensions(nvinfer1::Dims{2, {1, size}}); - auto shuffle_tensor = shuffle_layer->getOutput(0); - auto addbias = ctx->network()->addElementWise(*matmul_tensor, *shuffle_tensor, nvinfer1::ElementWiseOperation::kSUM); - if (addbias == nullptr) { - MS_LOG(ERROR) << "failed to add bias layer"; - return RET_ERROR; - } - addbias->setName((op_name_ + "_bias").c_str()); - auto bias_out = addbias->getOutput(0); - ctx->RegisterTensor(ITensorHelper{bias_out, Format::NCHW, true}, out_tensors_[0].Name()); -#endif /* TEST_ */ return RET_OK; } @@ -161,7 +130,6 @@ int MhaPlugin::RunCudaMha(const nvinfer1::PluginTensorDesc *inputDesc, const nvi const int bias_qkv_tensor_idx = 5 + cross_tensor_offset; const int weight_qkv_tensor_idx = 3; const int position_bias_tensor_idx = 6 + cross_tensor_offset; - params_.stream = stream; params_.algo = algoId; void *inputs_attn[num_of_inputs_]; int index = 0; @@ -188,7 +156,7 @@ int MhaPlugin::RunCudaMha(const nvinfer1::PluginTensorDesc *inputDesc, const nvi } void *outputs_attn[] = {outputs[0]}; fastertransformer::forward_attn(reinterpret_cast(inputs_attn), num_of_inputs_, - reinterpret_cast(outputs_attn), num_of_outputs_, ¶ms_, workspace); + reinterpret_cast(outputs_attn), num_of_outputs_, ¶ms_, workspace, GetCublasHandle(), stream); return RET_OK; } @@ -232,7 +200,6 @@ nvinfer1::DimsExprs MhaPlugin::getOutputDimensions(int32_t index, const nvinfer1 nvinfer1::IExprBuilder &exprBuilder) noexcept { nvinfer1::DimsExprs dims; if (index == 0) { -#ifndef TEST_ int num_dims = inputs[0].nbDims; dims.nbDims = num_dims; if (num_dims == INPUT_SIZE2) { @@ -253,12 +220,6 @@ nvinfer1::DimsExprs MhaPlugin::getOutputDimensions(int32_t index, const nvinfer1 dims.d[kTwo] = inputs[nbInputDims - 1].d[(inputs[nbInputDims - 1].nbDims) - 1]; dims.d[kThree] = exprBuilder.constant(params_.head_size); } -#else - dims.nbDims = C2NUM; - dims.d[0] = inputs[nbInputDims - 1].d[(inputs[nbInputDims - 1].nbDims) - 1]; - auto hidden_size = exprBuilder.constant(head_size_ * head_number_); - dims.d[1] = hidden_size; -#endif return dims; } diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/tensorrt_utils.h b/mindspore/lite/src/extendrt/delegate/tensorrt/tensorrt_utils.h index 5454bf70984..d112051b9f9 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/tensorrt_utils.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/tensorrt_utils.h @@ -217,7 +217,5 @@ void Data2Vector(std::vector *dst, const void *src) { } } -// nvinfer1::ITensor *castTensorFp32ToFp16(TensorRTContext *ctx, const TensorInfo &ms_tensor, -// const std::string &op_name); } // namespace mindspore::lite #endif // MINDSPORE_LITE_SRC_EXTENDRT_DELEGATE_TENSORRT_TENSORRT_UTILS_H_ diff --git a/mindspore/lite/src/litert/delegate/tensorrt/tensorrt_utils.cc 
b/mindspore/lite/src/litert/delegate/tensorrt/tensorrt_utils.cc index 64cca0083ee..ae431f5d36d 100644 --- a/mindspore/lite/src/litert/delegate/tensorrt/tensorrt_utils.cc +++ b/mindspore/lite/src/litert/delegate/tensorrt/tensorrt_utils.cc @@ -826,47 +826,6 @@ void DebugDims(const std::string &key, const nvinfer1::Dims &dims) { } } -// nvinfer1::ITensor *castTensorFp32ToFp16(TensorRTContext *ctx, const TensorInfo &ms_tensor, -// const std::string &op_name) { -// if (ctx == nullptr || ctx->network() == nullptr) { -// MS_LOG(ERROR) << "context or network is null for ConvertConstantTensor"; -// return nullptr; -// } -// nvinfer1::Dims dims = ConvertCudaDims(ms_tensor.Shape()); -// if (dims.nbDims == -1) { -// MS_LOG(INFO) << ms_tensor.Name() << " ConvertCudaDims failed, convert as scalar."; -// dims.nbDims = 1; -// dims.d[0] = 1; -// } -// nvinfer1::DataType data_type = ConvertDataType(ms_tensor.DataType()); -// if (!ms_tensor.IsConst()) { -// MS_LOG(ERROR) << "ConvertConstantTensor from a MSTensor with nullptr data: " << ms_tensor.Name(); -// return nullptr; -// } -// nvinfer1::Weights weights{data_type, ms_tensor.Data(), ms_tensor.ElementNum()}; -// if (data_type == nvinfer1::DataType::kFLOAT && is_ffn_fp16_) { -// void *data_float16 = malloc(ms_tensor.ElementNum() * sizeof(float)); -// if (data_float16 == nullptr) { -// MS_LOG(ERROR) << "Malloc buffer failed."; -// return nullptr; -// } -// auto src = static_cast(ms_tensor.Data()); -// auto dst = static_cast(data_float16); -// for (int i = 0; i < ms_tensor.ElementNum(); i++) { -// dst[i] = static_cast(src[i]); -// } -// weights.values = data_float16; -// } -// nvinfer1::IConstantLayer *constant_tensor = ctx->network()->addConstant(dims, weights); -// if (constant_tensor == nullptr) { -// MS_LOG(ERROR) << "create constant_tensor failed."; -// return nullptr; -// } -// ctx->RegisterLayer(constant_tensor, ms_tensor.Name() + "_" + op_name); -// auto tensor_ptr = constant_tensor->getOutput(0); -// return tensor_ptr; -// } - template <> nvinfer1::DataType GetNvinferDataType() { return nvinfer1::DataType::kFLOAT; diff --git a/mindspore/lite/src/litert/delegate/tensorrt/tensorrt_utils.h b/mindspore/lite/src/litert/delegate/tensorrt/tensorrt_utils.h index d40bfd73da3..efaed1c5d54 100644 --- a/mindspore/lite/src/litert/delegate/tensorrt/tensorrt_utils.h +++ b/mindspore/lite/src/litert/delegate/tensorrt/tensorrt_utils.h @@ -187,45 +187,5 @@ void Data2Vector(std::vector *dst, const void *src) { dst->at(i) = static_cast(src_ptr[i]); } } -// nvinfer1::ITensor *castTensorFp32ToFp16(TensorRTContext *ctx, const TensorInfo &ms_tensor, -// const std::string &op_name) { -// if (ctx == nullptr || ctx->network() == nullptr) { -// MS_LOG(ERROR) << "context or network is null for ConvertConstantTensor"; -// return nullptr; -// } -// nvinfer1::Dims dims = ConvertCudaDims(ms_tensor.Shape()); -// if (dims.nbDims == -1) { -// MS_LOG(INFO) << ms_tensor.Name() << " ConvertCudaDims failed, convert as scalar."; -// dims.nbDims = 1; -// dims.d[0] = 1; -// } -// nvinfer1::DataType data_type = ConvertDataType(ms_tensor.DataType()); -// if (!ms_tensor.IsConst()) { -// MS_LOG(ERROR) << "ConvertConstantTensor from a MSTensor with nullptr data: " << ms_tensor.Name(); -// return nullptr; -// } -// nvinfer1::Weights weights{data_type, ms_tensor.Data(), ms_tensor.ElementNum()}; -// if (data_type == nvinfer1::DataType::kFLOAT && is_ffn_fp16_) { -// void *data_float16 = malloc(ms_tensor.ElementNum() * sizeof(float)); -// if (data_float16 == nullptr) { -// MS_LOG(ERROR) << "Malloc 
buffer failed."; -// return nullptr; -// } -// auto src = static_cast(ms_tensor.Data()); -// auto dst = static_cast(data_float16); -// for (int i = 0; i < ms_tensor.ElementNum(); i++) { -// dst[i] = static_cast(src[i]); -// } -// weights.values = data_float16; -// } -// nvinfer1::IConstantLayer *constant_tensor = ctx->network()->addConstant(dims, weights); -// if (constant_tensor == nullptr) { -// MS_LOG(ERROR) << "create constant_tensor failed."; -// return nullptr; -// } -// ctx->RegisterLayer(constant_tensor, ms_tensor.Name() + "_" + op_name); -// auto tensor_ptr = constant_tensor->getOutput(0); -// return tensor_ptr; -// } } // namespace mindspore::lite #endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_UTILS_H_ diff --git a/mindspore/lite/tools/optimizer/common/gllo_utils.cc b/mindspore/lite/tools/optimizer/common/gllo_utils.cc index 6beb1e41f0e..78662e859ce 100644 --- a/mindspore/lite/tools/optimizer/common/gllo_utils.cc +++ b/mindspore/lite/tools/optimizer/common/gllo_utils.cc @@ -354,9 +354,8 @@ std::vector CastToFloat(const ValuePtr &value) { if (data_type == kNumberTypeFloat || data_type == kNumberTypeFloat32) { cur_value.push_back(GetValue(value)); } else { - // MS_LOG(ERROR) << "the function only process float data."; - // return {}; - // } + MS_LOG(ERROR) << "the function only process float data."; + return {}; } } return cur_value; diff --git a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc index c5ad6a75225..3774eb080de 100644 --- a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc @@ -348,7 +348,7 @@ STATUS DecoderLayerFusion::GetEps(const EquivPtr &equiv, VarPtr node_name, float STATUS DecoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const EquivPtr &equiv, int *head_num, int *head_size, float *eps1, float *eps2, float *eps3, bool *is_position_bias1, - bool *is_position_bias2, bool *scalar1, bool *scalar2) const { + bool *is_position_bias2, float *scale1, float *scale2) const { auto attn_input = GetAttribute(func_graph, equiv, is_attention_); MS_ASSERT(attn_input != nullptr); auto attn_prim = ops::GetOperator(attn_input); @@ -361,8 +361,8 @@ STATUS DecoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const Eq if (attn_prim->GetAttr(ops::kPositionBias) != nullptr) { *is_position_bias1 = attn_prim->get_position_bias(); } - if (attn_prim->GetAttr(ops::kScalar) != nullptr) { - *scalar1 = attn_prim->get_scalar(); + if (attn_prim->GetAttr(ops::kScale) != nullptr) { + *scale1 = attn_prim->get_scale(); } if ((*equiv)[is_attention_] == nullptr || !utils::isa((*equiv)[is_attention_])) { MS_LOG(ERROR) << "is_attention_ is not AnfNodePtr"; @@ -374,8 +374,8 @@ STATUS DecoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const Eq if (attn_cross_prim->GetAttr(ops::kPositionBias) != nullptr) { *is_position_bias2 = attn_cross_prim->get_position_bias(); } - if (attn_cross_prim->GetAttr(ops::kScalar) != nullptr) { - *scalar2 = attn_cross_prim->get_scalar(); + if (attn_cross_prim->GetAttr(ops::kScale) != nullptr) { + *scale2 = attn_cross_prim->get_scale(); } if (is_layernorm_fusion_) { auto layrn1_input = GetAttribute(func_graph, equiv, is_layernorm1_); @@ -411,9 +411,9 @@ STATUS DecoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const Eq if (!IsActGELU(func_graph, equiv)) { return RET_ERROR; } - act_ = "gelu"; + act_type_ = ActType::ActType_Gelu; } else { - act_ = "relu"; + 
act_type_ = ActType::ActType_Relu; } return RET_OK; } @@ -432,14 +432,14 @@ std::shared_ptr DecoderLayerFusion::CreatePrim(const FuncGrap float eps3 = 1e-6; bool is_position_bias1 = false; bool is_position_bias2 = false; - bool scalar1 = true; - bool scalar2 = true; + float scale1 = 1.0f; + float scale2 = 1.0f; if (CheckPattern(func_graph, equiv, &head_num, &head_size, &eps1, &eps2, &eps3, &is_position_bias1, - &is_position_bias2, &scalar1, &scalar2)) { + &is_position_bias2, &scale1, &scale2)) { return nullptr; } decoder_layer_prim->Init(head_num, head_size, eps1, eps2, eps3, ffn_hidden_size, is_position_bias1, is_position_bias2, - post_layernorm, scalar1, scalar2, act_); + post_layernorm, scale1, scale2, act_type_); return decoder_layer_prim; } diff --git a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h index c9b7a2273b0..7f2d481649a 100644 --- a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h +++ b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h @@ -55,7 +55,7 @@ class DecoderLayerFusion : public MultiplePatternProcessPass { bool post_layernorm, int64_t ffn_hidden_size) const; lite::STATUS CheckPattern(const FuncGraphPtr &func_graph, const EquivPtr &equiv, int *head_num, int *head_size, float *eps1, float *eps2, float *eps3, bool *is_position_bias1, bool *is_position_bias2, - bool *scalar1, bool *scalar2) const; + float *scale1, float *scale2) const; AnfNodePtr GetAttribute(const FuncGraphPtr &func_graph, const EquivPtr &equiv, VarPtr node_name) const; bool IsActGELU(const FuncGraphPtr &func_graph, const EquivPtr &equiv) const; lite::STATUS GetEps(const EquivPtr &equiv, VarPtr node_name, float *eps) const; @@ -108,7 +108,7 @@ class DecoderLayerFusion : public MultiplePatternProcessPass { mutable VarPtr eps3_{nullptr}; mutable bool is_position_bias_{false}; mutable bool is_layernorm_fusion_{false}; - mutable std::string act_{"gelu"}; + mutable ActType act_type_{ActType::ActType_No}; }; } // namespace opt } // namespace mindspore diff --git a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc index b1331b2ca68..4ac343a3fc6 100644 --- a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc @@ -336,7 +336,7 @@ AnfNodePtr EncoderLayerFusion::GetAttribute(const FuncGraphPtr &func_graph, cons } STATUS EncoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const EquivPtr &equiv, int *head_num, - int *head_size, float *eps1, float *eps2, bool *scalar) const { + int *head_size, float *eps1, float *eps2, float *scale) const { auto attn_input = GetAttribute(func_graph, equiv, is_attention_); MS_ASSERT(attn_input != nullptr); auto attn_prim = ops::GetOperator(attn_input); @@ -349,8 +349,8 @@ STATUS EncoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const Eq if (attn_prim->GetAttr(ops::kPositionBias) != nullptr) { is_position_bias_ = attn_prim->get_position_bias(); } - if (attn_prim->GetAttr(ops::kScalar) != nullptr) { - *scalar = attn_prim->get_scalar(); + if (attn_prim->GetAttr(ops::kScale) != nullptr) { + *scale = attn_prim->get_scale(); } if (is_layernorm_fusion_) { auto layrn1_input = GetAttribute(func_graph, equiv, is_layernorm1_); @@ -378,9 +378,9 @@ STATUS EncoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const Eq if (!IsActGELU(func_graph, equiv, is_act_)) { return RET_ERROR; } - act_ = "gelu"; + 
act_type_ = ActType::ActType_Gelu; } else { - act_ = "relu"; + act_type_ = ActType::ActType_Relu; } return RET_OK; } @@ -396,12 +396,12 @@ std::shared_ptr EncoderLayerFusion::CreatePrim(const FuncGrap int head_size = 0; float eps1 = 1e-5; float eps2 = 1e-5; - bool scalar = true; - if (CheckPattern(func_graph, equiv, &head_num, &head_size, &eps1, &eps2, &scalar)) { + float scale = 1.0f; + if (CheckPattern(func_graph, equiv, &head_num, &head_size, &eps1, &eps2, &scale)) { return nullptr; } - encoder_layer_prim->Init(head_num, head_size, eps1, eps2, ffn_hidden_size, is_position_bias_, post_layernorm, scalar, - act_); + encoder_layer_prim->Init(head_num, head_size, eps1, eps2, ffn_hidden_size, is_position_bias_, post_layernorm, scale, + act_type_); return encoder_layer_prim; } diff --git a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h index f388f16f35b..776151bf6e2 100644 --- a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h +++ b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h @@ -65,7 +65,7 @@ class EncoderLayerFusion : public MultiplePatternProcessPass { bool IsActGELU(const FuncGraphPtr &func_graph, const EquivPtr &equiv, const VarPtr &input_prim) const; lite::STATUS GetEps(const EquivPtr &equiv, VarPtr node_name, float *eps) const; lite::STATUS CheckPattern(const FuncGraphPtr &func_graph, const EquivPtr &equiv, int *head_num, int *head_size, - float *eps1, float *eps2, bool *scalar) const; + float *eps1, float *eps2, float *scale) const; std::shared_ptr CreatePrim(const FuncGraphPtr &func_graph, const EquivPtr &equiv, bool post_layernorm, int64_t ffn_hidden_size) const; @@ -91,7 +91,7 @@ class EncoderLayerFusion : public MultiplePatternProcessPass { mutable VarPtr is_layernorm2_{nullptr}; mutable bool is_position_bias_{false}; mutable bool is_layernorm_fusion_{false}; - mutable std::string act_{"gelu"}; + mutable ActType act_type_{ActType::ActType_No}; mutable VarPtr is_act_{nullptr}; mutable VarPtr eps1_{nullptr}; mutable VarPtr eps2_{nullptr}; diff --git a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc index 4eaacb23cc1..d9b2ed45a1c 100644 --- a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc @@ -613,6 +613,7 @@ bool MultiHeadAttentionFusion::CheckPattern(const EquivPtr &equiv, int *head_num } *head_num = out.at(0); *head_size = out.at(1); + scale_ = 1.0f / sqrtf(*head_size * 1.0f); return true; } @@ -628,7 +629,7 @@ AnfNodePtr MultiHeadAttentionFusion::Process(const std::string &pattern_name, co if (pattern_name == kMPAWithMaskPatternNameT5New || pattern_name == kMPAWithMaskTransposePatternNameT5New || pattern_name == kMPAWithMaskPatternNameT5New2) { t5_x_ = true; - scalar_ = (pattern_name == kMPAWithMaskPatternNameT5New2) ? false : true; + scale_ = (pattern_name == kMPAWithMaskPatternNameT5New2) ?
1.0f : scale_; } return CreateMaskedMultiHeadAttentionNode(func_graph, equiv, node->fullname_with_scope(), true); } @@ -768,7 +769,7 @@ std::shared_ptr MultiHeadAttentionFusion::CreatePrim(const Equiv if (!CheckPattern(equiv, &head_num, &head_size)) { return nullptr; } - attention_prim->Init(head_num, head_size, t5_x_, cross, scalar_); + attention_prim->Init(head_num, head_size, t5_x_, cross, scale_); return attention_prim; } diff --git a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.h b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.h index 5500da9f1f4..345616ed4ae 100644 --- a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.h +++ b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.h @@ -120,7 +120,7 @@ class MultiHeadAttentionFusion : public MultiplePatternProcessPass { mutable VarPtr k_transpose_{nullptr}; mutable bool t5_x_{false}; - mutable bool scalar_{true}; + mutable float scale_{true}; }; } // namespace opt } // namespace mindspore diff --git a/trc/transformer/convert_fp32.sh b/trc/transformer/convert_fp32.sh index 15f6399f4f1..254e491d2bd 100755 --- a/trc/transformer/convert_fp32.sh +++ b/trc/transformer/convert_fp32.sh @@ -1,5 +1,4 @@ #!/bin/bash - base=`git rev-parse --show-toplevel` version=$(cat ${base}/version.txt) fusion=true @@ -11,15 +10,16 @@ while getopts "n" opt ; do echo "Unknown option ${opt}!" ;; esac done -if [ "${fusion}" = "true" ]; then - optimize="--optimizeTransformer=true" +if [ "${fusion}" = "false" ]; then + optimize="--optimizeTransformer=false" fi shift $(($OPTIND - 1)) file_name=$(basename $1) file_name="${file_name%.*}" -echo "${file_name%.*}" # dbg="gdb --args " - +if [ "${fusion}" = "true" ]; then + optimize="--optimizeTransformer=true" +fi #GLOG_v=0 \ lib_base=${base}/trc/system_test/release/ubuntu_x86/mindspore-lite-${version}-linux-x64 LD_LIBRARY_PATH=${lib_base}/tools/converter/lib:${lib_base}/tools/converter/third_party/glog/lib \ diff --git a/trc/transformer/deploy.sh b/trc/transformer/deploy.sh index ade8a96268d..108f13a7bb8 100755 --- a/trc/transformer/deploy.sh +++ b/trc/transformer/deploy.sh @@ -6,10 +6,12 @@ system=${base}/trc/system_test/release/ubuntu_x86/mindspore-lite-${version}-linu benchmark=${system}/tools/benchmark/benchmark server=caspi gpu_id=5 -while getopts "c" opt ; do +while getopts "ct" opt ; do case "${opt}" in c) compress="_compress" ;; + t) + time=true ;; *) echo "Unknown option ${opt}!" 
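As a reference point for the scalar-to-scale change above: the fused attention now carries an explicit softmax scaling factor rather than an on/off flag. A minimal sketch of the convention, assuming the usual 1/sqrt(head_size) scaling and T5-style position-bias attention running unscaled (scale = 1.0f); the helper name below is illustrative and not part of the patch:

#include <cmath>
#include <cstdio>

// Illustrative helper: the default attention scale is 1/sqrt(head_size);
// T5-style attention with relative position bias skips scaling (scale = 1.0f).
static float AttentionScale(int head_size, bool t5_position_bias) {
  return t5_position_bias ? 1.0f : 1.0f / sqrtf(static_cast<float>(head_size));
}

int main() {
  std::printf("head_size=64 -> scale=%.3f\n", AttentionScale(64, false));  // 0.125
  std::printf("T5 pos-bias  -> scale=%.3f\n", AttentionScale(64, true));   // 1.000
  return 0;
}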
;; esac diff --git a/trc/transformer/models.txt b/trc/transformer/models.txt index 2bfff7fdac9..bd9849fa686 100755 --- a/trc/transformer/models.txt +++ b/trc/transformer/models.txt @@ -1,22 +1,22 @@ -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer_t5 -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer_t5 +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer_t5 +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer_t5 # #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer_t5 -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_encoder_layer_t5 -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer_t5 +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_encoder_layer_t5 +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer_t5 # #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5 #run the following tests before push -#-b 1 -l 66 -s 128 -H 12 -S 768 -p 0 -m mha_x1 -#-b 1 -l 66 -s 128 -t 256 -H 12 -S 768 -p 0 -m mha_cross -#-b 1 -l 66 -s 20 -t 20 -H 3 -S 15 -p 0 -m mha_cross -#-b 1 -l 66 -s 20 -H 4 -S 768 -p 0 -m mha_T5 -#-b 1 -l 66 -s 20 -t 40 -H 4 -S 768 -p 0 -m mha_T5_cross +-b 1 -l 66 -s 128 -H 12 -S 768 -p 0 -m mha_x1 +-b 1 -l 66 -s 128 -t 256 -H 12 -S 768 -p 0 -m mha_cross +-b 1 -l 66 -s 20 -t 20 -H 3 -S 15 -p 0 -m mha_cross +-b 1 -l 66 -s 20 -H 4 -S 768 -p 0 -m mha_T5 +-b 1 -l 66 -s 20 -t 40 -H 4 -S 768 -p 0 -m mha_T5_cross # #-b 1 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer #-b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer @@ -44,13 +44,13 @@ #-b 16 -l 24 -H 16 -S 1024 -s 128 -P 0 -m bert #-b 32 -l 24 -H 16 -S 1024 -s 128 -P 1 -m bert -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer # -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_encoder_layer -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_encoder_layer +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_encoder_layer +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_encoder_layer #-b 1 -l 66 -s 1 -H 8 -S 512 -p 0 -m mha_x1 #-b 3 -l 66 -s 20 -H 3 -S 15 -p -m mha_x2 #-b 3 -l 66 -s 20 -t 40 -H 3 -S 15 -p 0 -m mha_x1 @@ -86,7 +86,7 @@ # # only ch app: #-b 1 -l 12 -H 12 -S 768 -s 128 -m bert --b 8 -l 12 -H 4 -S 512 -s 64 -m bert +#-b 8 -l 12 -H 4 -S 512 -s 64 -m bert # ----------------------------------------------------------------------------- #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -m transformer_decoder_layer @@ -149,7 +149,7 @@ # T5 tests --b 1 -l 6 -s 128 -t 128 -H 8 -S 512 -f 2048 -m T5 +#-b 1 -l 6 -s 512 -t 512 -H 8 -S 768 -f 2048 -p 0 -x 1 -m transformer_decoder_layer_t5 #-b 1 -l 6 -s 512 -t 512 -H 8 -S 512 -f 2048 -m T5 # #-b 1 -l 6 -s 128 -t 128 -H 12 -S 768 -f 3072 -m T5 diff --git 
a/trc/transformer/train_transformer_export.py b/trc/transformer/train_transformer_export.py index 10f7a97ad6a..3879072231f 100755 --- a/trc/transformer/train_transformer_export.py +++ b/trc/transformer/train_transformer_export.py @@ -355,6 +355,254 @@ def transformer_encoder_layer_create(): elif app=="trc": saveT(y, name + "_output1.fp" + suffix) +def transformer_encoder_layer_t5_create(): + name = "transformer_encoder_layer_t5" + if (post_layernorm): + print("post_layernorm") + model = T5_TF.TransformerEncoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, seq_length=seq, + num_heads=head_num, post_layernorm_residual=True, has_bias=True, hidden_act='relu') + else: + model = T5_TF.TransformerEncoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, seq_length=seq, + num_heads=head_num, has_bias=True, hidden_act='relu') + encoder_input_value = M.Tensor(np.random.normal(0., 0.5, (batch, seq, hid_size)), M.float32) + encoder_input_mask = M.Tensor(np.random.normal(0., 0.5, (batch, seq, seq)), M.float32) + pos = M.Tensor(np.random.normal(0., 0.5, (batch, head_num, seq, tgt_seq_len)), M.float32) + # encoder_input_value = M.Tensor(np.zeros((batch, seq, hid_size)), M.float32) + # encoder_input_mask = M.Tensor(np.zeros((batch, seq, seq)), M.float32) + q = model.attention.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size + k = model.attention.dense2.weight.asnumpy()#.transpose() + v = model.attention.dense3.weight.asnumpy()#.transpose() + + w = np.concatenate((q, k, v)) # 3xhid_size x hid_size + w = w.transpose() # hid_size x 3xhid_size + wt = M.Tensor(w, w_compute_type) + wp = model.attention.projection.weight + omw = model.output.mapping.weight + opw = model.output.projection.weight + gl1 = model.layernorm1.weight + gl2 = model.layernorm2.weight + + suffix = str(compute_type) + suffix = suffix[-2:] + saveT(encoder_input_value, name + "_input1.fp" + suffix) + saveT(encoder_input_mask, name + "_input2.fp" + suffix) + saveT(pos, name + "_input3.fp" + suffix) + saveT(gl1, name + "_weight1.fp" + suffix) + saveT(wt, name + "_weight2.fp" + suffix) + saveT(wp, name + "_weight3.fp" + suffix) + saveT(gl2, name + "_weight4.fp" + suffix) + if ffn_fp16 == True: + saveTensorToHalf(omw, name + "_weight5.fp" + "16") + saveTensorToHalf(opw, name + "_weight6.fp" + "16") + else: + saveT(omw, name + "_weight5.fp" + suffix) + saveT(opw, name + "_weight6.fp" + suffix) + _cell_graph_executor.compile(model, + encoder_input_value, + encoder_input_mask,pos) + y = model(encoder_input_value, encoder_input_mask,pos) + print('name=',name) + export(model, encoder_input_value, encoder_input_mask,pos, file_name= name + "_fwd", file_format='MINDIR') + # if app=="ch": + f_y=open(f'./{name}_output.txt','w') + out_name='output1' + print("name output:",out_name) + saveCalib(out_name, np.array(y), f_y) + print("y.shape",np.array(y).shape) + # saveCalib('Default/Add-op267', np.array(y), f_y)#2 dims + f_y.close() + # elif app=="trc": + saveT(y, name + "_output1.fp" + suffix) + + +def transformer_decoder_layer_t5_create(): + name = "transformer_decoder_layer_t5" + if (post_layernorm): + print("post_layernorm true") + model = T5_TF.TransformerDecoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, + tgt_seq_length=tgt_seq_len,num_heads=head_num, post_layernorm_residual=True, use_past=False, has_bias=False, hidden_act="relu") + else: + print("post_layernorm false") + model = T5_TF.TransformerDecoderLayer(batch_size=batch, 
hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, + tgt_seq_length=tgt_seq_len,num_heads=head_num,use_past=False, has_bias=False, hidden_act="relu") + hidden_stats = M.Tensor(np.random.normal(0., 0.5, (batch, tgt_seq_len, hid_size)), M.float32) + decoder_mask = M.Tensor(np.random.normal(0., 0.5, (batch, seq, seq)), M.float32) + encoder_output = M.Tensor(np.random.normal(0., 0.5, (batch, seq, hid_size)), M.float32) + memory_mask = M.Tensor(np.random.normal(0., 0.5, (batch, tgt_seq_len,seq)), M.float32) + pos = M.Tensor(np.random.normal(0., 0.5, (batch, head_num, seq, tgt_seq_len)), M.float32) + encoder_pos = M.Tensor(np.random.normal(0., 0.5, (batch, head_num, seq, tgt_seq_len)), M.float32) + actual_seq = seq // 2 + if compress: + input_value = hidden_stats.asnumpy() + input_value[:,actual_seq:,:] = 0 + hidden_stats = M.Tensor.from_numpy(input_value) + decoder_input_mask_value = decoder_mask.asnumpy() + decoder_input_mask_value[:,:,actual_seq:] = 0 + decoder_mask = M.Tensor.from_numpy(decoder_input_mask_value) + encoder_output_value = encoder_output.asnumpy() + encoder_output_value[:,:,actual_seq:] = 0 + encoder_output = M.Tensor.from_numpy(encoder_output_value) + memory_mask_value = memory_mask.asnumpy() + memory_mask_value[:,:,actual_seq:] = 0 + memory_mask = M.Tensor.from_numpy(memory_mask_value) + pos_value = pos.asnumpy() + pos_value[:,:,actual_seq:] = 0 + pos = M.Tensor.from_numpy(pos_value) + encoder_pos_value = encoder_pos.asnumpy() + encoder_pos_value[:,:,actual_seq:] = 0 + encoder_pos = M.Tensor.from_numpy(encoder_pos_value) + q = model.attention.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size + k = model.attention.dense2.weight.asnumpy()#.transpose() + v = model.attention.dense3.weight.asnumpy()#.transpose() + + w = np.concatenate((q, k, v)) # 3xhid_size x hid_size + w = w.transpose() # hid_size x 3xhid_size + wt = M.Tensor(w, w_compute_type) + wp = model.attention.projection.weight + + qt2 = model.cross_attention.dense1.weight#.transpose() # hid_size x hid_size + k2 = model.cross_attention.dense2.weight.asnumpy()#.transpose() + v2 = model.cross_attention.dense3.weight.asnumpy()#.transpose() + + w2 = np.concatenate((k2, v2)) # 2xhid_size x hid_size + w2 = w2.transpose() # hid_size x 2xhid_size + wt2 = M.Tensor(w2, w_compute_type) + wp2 = model.cross_attention.projection.weight + omw = model.output.mapping.weight + print('omw.asnumpy().shape',omw.asnumpy().shape) + opw = model.output.projection.weight + + gl1 = model.layernorm1.weight + gl2 = model.layernorm2.weight + gl3 = model.cross_attention_layernorm.weight + + suffix = str(compute_type) + suffix = suffix[-2:] + saveT(gl1, name + "_weight1.fp" + suffix) + saveT(wt, name + "_weight2.fp" + suffix) + saveT(wp, name + "_weight3.fp" + suffix) + saveT(gl2, name + "_weight4.fp" + suffix) + saveT(qt2, name + "_weight5.fp" + suffix) + saveT(wt2, name + "_weight6.fp" + suffix) + saveT(wp2, name + "_weight7.fp" + suffix) + saveT(gl3, name + "_weight8.fp" + suffix) + if(ffn_fp16): + saveTensorToHalf(omw, name + "_weight9.fp" + "16") + saveTensorToHalf(opw, name + "_weight10.fp" + "16") + else: + saveT(omw, name + "_weight9.fp" + suffix) + saveT(opw, name + "_weight10.fp" + suffix) + saveT(hidden_stats, name + "_input1.fp" + suffix) + saveT(decoder_mask, name + "_input2.fp" + suffix) + saveT(encoder_output, name + "_input3.fp" + suffix) + saveT(memory_mask, name + "_input4.fp" + suffix) + saveT(pos, name + "_input5.fp" + suffix) + saveT(encoder_pos, name + "_input6.fp" + suffix) + 
_cell_graph_executor.compile(model, hidden_stats, decoder_mask, encoder_output, memory_mask, pos, encoder_pos) + y = model(hidden_stats, decoder_mask, encoder_output, memory_mask , pos, encoder_pos) + export(model, hidden_stats, decoder_mask, encoder_output, memory_mask, pos, encoder_pos, file_name= name + "_fwd", file_format='MINDIR') + if compress: + y_num = y.asnumpy() + y_num[:,actual_seq:,:] = 0 + y = M.Tensor.from_numpy(y_num) + f_y=open(f'./{name}_output.txt','w') + saveCalib("output1", np.array(y), f_y)#2 dims + f_y.close() + saveT(y, name + "_output1.fp" + suffix) + +def transformer_decoder_layer_create(): + name = "transformer_decoder_layer" + if (post_layernorm): + print("post_layernorm true") + model = TransformerDecoderLayerX(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, + tgt_seq_length=tgt_seq_len,num_heads=head_num, post_layernorm_residual=True) + else: + print("post_layernorm false") + model = TransformerDecoderLayerX(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, + tgt_seq_length=tgt_seq_len,num_heads=head_num) + hidden_stats = M.Tensor(np.random.normal(0., 0.5, (batch, tgt_seq_len, hid_size)), M.float32) + decoder_mask = M.Tensor(np.random.normal(0., 0.5, (batch, seq, seq)), M.float32) + encoder_output = M.Tensor(np.random.normal(0., 0.5, (batch, seq, hid_size)), M.float32) + memory_mask = M.Tensor(np.random.normal(0., 0.5, (batch, tgt_seq_len,seq)), M.float32) + q = model.attention.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size + k = model.attention.dense2.weight.asnumpy()#.transpose() + v = model.attention.dense3.weight.asnumpy()#.transpose() + + w = np.concatenate((q, k, v)) # 3xhid_size x hid_size + w = w.transpose() # hid_size x 3xhid_size + wt = M.Tensor(w, w_compute_type) + bq = model.attention.dense1.bias.asnumpy() + bk = model.attention.dense2.bias.asnumpy() + bv = model.attention.dense3.bias.asnumpy() + bw = np.concatenate((bq, bk, bv)) #(3xhid) X 1 + bt =M.Tensor(bw, w_compute_type) + wp = model.attention.projection.weight + bp = model.attention.projection.bias + + qt2 = model.cross_attention.dense1.weight#.transpose() # hid_size x hid_size + k2 = model.cross_attention.dense2.weight.asnumpy()#.transpose() + v2 = model.cross_attention.dense3.weight.asnumpy()#.transpose() + + w2 = np.concatenate((k2, v2)) # 3xhid_size x hid_size + w2 = w2.transpose() # hid_size x 3xhid_size + wt2 = M.Tensor(w2, w_compute_type) + bq2 = model.cross_attention.dense1.bias.asnumpy() + bk2 = model.cross_attention.dense2.bias.asnumpy() + bv2 = model.cross_attention.dense3.bias.asnumpy() + bw2 = np.concatenate((bq2, bk2, bv2)) #(3xhid) X 1 + bt2 =M.Tensor(bw2, w_compute_type) + wp2 = model.cross_attention.projection.weight + bp2 = model.cross_attention.projection.bias + omw = model.output.mapping.weight + opw = model.output.projection.weight + omb = model.output.mapping.bias + opb = model.output.projection.bias + + gl1 = model.layernorm1.gamma + bl1 = model.layernorm1.beta + gl2 = model.layernorm2.gamma + bl2 = model.layernorm2.beta + gl3 = model.cross_attention_layernorm.gamma + bl3 = model.cross_attention_layernorm.beta + suffix = str(compute_type) + suffix = suffix[-2:] + saveT(hidden_stats, name + "_input1.fp" + suffix) + saveT(decoder_mask, name + "_input2.fp" + suffix) + saveT(encoder_output, name + "_input3.fp" + suffix) + saveT(memory_mask, name + "_input4.fp" + suffix) + + saveT(gl1, name + "_weight1.fp" + suffix) + saveT(bl1, name + "_weight2.fp" + suffix) + saveT(wt, name 
+ "_weight3.fp" + suffix) + saveT(bt, name + "_weight4.fp" + suffix) + saveT(wp, name + "_weight5.fp" + suffix) + saveT(bp, name + "_weight6.fp" + suffix) + saveT(gl2, name + "_weight7.fp" + suffix) + saveT(bl2, name + "_weight8.fp" + suffix) + saveT(qt2, name + "_weight9.fp" + suffix) + saveT(wt2, name + "_weight10.fp" + suffix) + saveT(bt2, name + "_weight11.fp" + suffix) + saveT(wp2, name + "_weight12.fp" + suffix) + saveT(bp2, name + "_weight13.fp" + suffix) + saveT(gl3, name + "_weight14.fp" + suffix) + saveT(bl3, name + "_weight15.fp" + suffix) + if(ffn_fp16): + saveTensorToHalf(omw, name + "_weight16.fp" + "16") + saveTensorToHalf(omb, name + "_weight17.fp" + "16") + saveTensorToHalf(opw, name + "_weight18.fp" + "16") + else: + saveT(omw, name + "_weight16.fp" + suffix) + saveT(omb, name + "_weight17.fp" + suffix) + saveT(opw, name + "_weight18.fp" + suffix) + saveT(opb, name + "_weight19.fp" + suffix) + _cell_graph_executor.compile(model, hidden_stats, decoder_mask, encoder_output, memory_mask) + y = model(hidden_stats, decoder_mask, encoder_output, memory_mask) + export(model, hidden_stats, decoder_mask, encoder_output, memory_mask, file_name= name + "_fwd", file_format='MINDIR') + f_y=open(f'./{name}_output.txt','w') + saveCalib("output1", np.array(y), f_y)#2 dims + f_y.close() + saveT(y, name + "_output1.fp" + suffix) + def build_transformer_encoder_layer_post_ture(): model = TransformerEncoderLayer(batch_size=2, seq_length=16, @@ -713,8 +961,7 @@ def mha_T5_create(): compute_dtype=compute_type, param_init_type=w_compute_type, softmax_compute_type=s_compute_type, - has_bias=False, - app=app + has_bias=False ) print('compute_type',compute_type) q = model.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size @@ -780,8 +1027,7 @@ def mha_T5_cross_create(): compute_dtype=compute_type, param_init_type=w_compute_type, softmax_compute_type=s_compute_type, - has_bias=False, - app=app + has_bias=False ) qt = model.dense1.weight -- Gitee From e02ac82ff6878613727a0d2a1b6a51f6cb08b29c Mon Sep 17 00:00:00 2001 From: batya kroizer Date: Thu, 19 Jan 2023 15:22:21 +0200 Subject: [PATCH 29/39] fix --- mindspore/core/ops/decoder_layer.cc | 4 ++-- mindspore/core/ops/encoder_layer.cc | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mindspore/core/ops/decoder_layer.cc b/mindspore/core/ops/decoder_layer.cc index dbe996b8a4a..fd8d0f708a1 100644 --- a/mindspore/core/ops/decoder_layer.cc +++ b/mindspore/core/ops/decoder_layer.cc @@ -102,11 +102,11 @@ float DecoderLayer::get_scale2() const { return GetValue(value_ptr); } ActType DecoderLayer::get_act_type() const { - auto value_ptr = GetAttr(ActType); + auto value_ptr = GetAttr(kActivationType); if (value_ptr == nullptr) { return ActType::ActType_No; } - return ActivationType(GetValue(value_ptr)); + return ActType(GetValue(value_ptr)); } void DecoderLayer::Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, diff --git a/mindspore/core/ops/encoder_layer.cc b/mindspore/core/ops/encoder_layer.cc index 6faec011525..2730d63dc71 100644 --- a/mindspore/core/ops/encoder_layer.cc +++ b/mindspore/core/ops/encoder_layer.cc @@ -83,11 +83,11 @@ float EncoderLayer::get_scale() const { return GetValue(value_ptr); } ActType EncoderLayer::get_act_type() const { - auto value_ptr = GetAttr(ActType); + auto value_ptr = GetAttr(kActivationType); if (value_ptr == nullptr) { return ActType::ActType_No; } - return ActivationType(GetValue(value_ptr)); + return ActType(GetValue(value_ptr)); } void 
EncoderLayer::Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, int64_t ffn_hidden_size, bool position_bias, bool post_layernorm, float scale, -- Gitee From eeb923a9e6f25794bb6593ba2d2e2f0838189096 Mon Sep 17 00:00:00 2001 From: batya kroizer Date: Sun, 22 Jan 2023 15:20:29 +0200 Subject: [PATCH 30/39] fix --- mindspore/core/ops/decoder_layer.cc | 2 +- mindspore/core/ops/encoder_layer.cc | 2 +- mindspore/lite/src/common/prim_util.cc | 4 +- .../delegate/tensorrt/op/decoder_tensorrt.cc | 7 +-- .../delegate/tensorrt/op/decoder_tensorrt.h | 9 ++-- .../delegate/tensorrt/op/encoder_tensorrt.cc | 12 +++-- .../delegate/tensorrt/op/encoder_tensorrt.h | 5 +- .../delegate/tensorrt/op/mha_tensorrt.cc | 11 ++--- .../delegate/tensorrt/op/mha_tensorrt.h | 4 +- .../optimizer/fusion/decoder_layer_fusion.cc | 2 +- .../optimizer/fusion/encoder_layer_fusion.cc | 6 +-- trc/transformer/cfg_bert.config | 3 +- trc/transformer/deploy.sh | 2 +- trc/transformer/models.txt | 46 +++++++++++-------- trc/transformer/train_transformer_export.py | 4 +- 15 files changed, 66 insertions(+), 53 deletions(-) diff --git a/mindspore/core/ops/decoder_layer.cc b/mindspore/core/ops/decoder_layer.cc index fd8d0f708a1..91d725c5b15 100644 --- a/mindspore/core/ops/decoder_layer.cc +++ b/mindspore/core/ops/decoder_layer.cc @@ -102,7 +102,7 @@ float DecoderLayer::get_scale2() const { return GetValue(value_ptr); } ActType DecoderLayer::get_act_type() const { - auto value_ptr = GetAttr(kActivationType); + auto value_ptr = GetAttr(kActivationType); if (value_ptr == nullptr) { return ActType::ActType_No; } diff --git a/mindspore/core/ops/encoder_layer.cc b/mindspore/core/ops/encoder_layer.cc index 2730d63dc71..276d10de223 100644 --- a/mindspore/core/ops/encoder_layer.cc +++ b/mindspore/core/ops/encoder_layer.cc @@ -83,7 +83,7 @@ float EncoderLayer::get_scale() const { return GetValue(value_ptr); } ActType EncoderLayer::get_act_type() const { - auto value_ptr = GetAttr(kActivationType); + auto value_ptr = GetAttr(kActivationType); if (value_ptr == nullptr) { return ActType::ActType_No; } diff --git a/mindspore/lite/src/common/prim_util.cc b/mindspore/lite/src/common/prim_util.cc index b7276233b76..c5be3ff5714 100644 --- a/mindspore/lite/src/common/prim_util.cc +++ b/mindspore/lite/src/common/prim_util.cc @@ -28,9 +28,9 @@ static std::set kTensorListOps = { schema::PrimitiveType_TensorListReserve, schema::PrimitiveType_TensorListSetItem, schema::PrimitiveType_TensorListStack}; -static const char *const kInnerOpNames[6] = { +static const char *const kInnerOpNames[8] = { "Inner_ToFormat", "Inner_GltextureToOpencl", "Inner_Identity", - "Inner_ShapeFusion", "Inner_GraphKernel", "Inner_SplitReduceConcatFusion", + "Inner_ShapeFusion", "Inner_GraphKernel", "Inner_SplitReduceConcatFusion", "Inner_EncoderLayer" ,"Inner_DecoderLayer", }; int GetPrimitiveType(const void *primitive, int schema_version) { if (primitive == nullptr) { diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc index 42ef50c5d25..b93d05bfcc4 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc @@ -126,7 +126,7 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.attn2.is_cross = true; params.attn2.scale = decoder_op->get_scale2(); params.attn2.mask = true; - params.act_type = decoder_op->get_act_type(); + params.act_type = 
(fastertransformer::ActType)(decoder_op->get_act_type()); params.has_beta = !params.attn1.position_bias; params.has_bias = !params.attn1.position_bias; params.ffn_bias = !params.attn1.position_bias; @@ -149,7 +149,7 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { } nvinfer1::ITensor *input_tensor = input(ctx, 0).trt_tensor_; auto plugin = - std::make_shared(input_tensor->getName(), compute_type, params, GetCublasLtHandle(), device_id_); + std::make_shared(input_tensor->getName(), compute_type, params, GetCublasHandle(), GetCublasLtHandle(), device_id_); const int input_number = inputs().size(); nvinfer1::ITensor *inputTensors[input_number]; for (int i = 0; i < input_number; i++) { @@ -185,6 +185,7 @@ int DecoderPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nv CUBLAS_GEMM_DEFAULT_TENSOR_OP); } } + template int DecoderPlugin::RunCudaDecoder(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, @@ -198,7 +199,7 @@ int DecoderPlugin::RunCudaDecoder(const nvinfer1::PluginTensorDesc *inputDesc, } void *outputs_forward[] = {outputs[0]}; fastertransformer::forwardDecoder(inputs_forward, num_of_inputs_, outputs_forward, num_of_outputs_, ¶ms_, - workspace, GetCublasHandle(), stream); + workspace, cublas_handle_, stream); return RET_OK; } diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h index 2bf6cc645fd..19b90242d81 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h @@ -23,8 +23,6 @@ #include "src/extendrt/delegate/tensorrt/op/tensorrt_plugin.h" #include "src/extendrt/delegate/tensorrt/cuda_impl/cudnn_utils.h" #include "src/fastertransformer/layers/ms_layers/decoder.h" -#include "src/fastertransformer/layers/ms_layers/param.h" -#include "src/extendrt/delegate/tensorrt/tensorrt_utils.h" namespace mindspore::lite { class DecoderTensorRT : public TensorRTOp { @@ -48,12 +46,13 @@ class DecoderTensorRT : public TensorRTOp { constexpr auto DECODER_PLUGIN_NAME{"DecoderPlugin"}; class DecoderPlugin : public TensorRTPlugin { public: - DecoderPlugin(const std::string name, int compute_type, fastertransformer::decoderParamT params, + DecoderPlugin(const std::string name, int compute_type, fastertransformer::decoderParamT params, cublasHandle_t cublas_handle, cublasLtHandle_t cublaslt_handle, uint32_t device_id) : TensorRTPlugin(name, std::string(DECODER_PLUGIN_NAME), device_id), compute_type_(compute_type), params_(params), - cublaslt_handle_(cublaslt_handle) {} + cublas_handle_(cublas_handle), + cublaslt_handle_(cublaslt_handle){} DecoderPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) : TensorRTPlugin(std::string(name), std::string(DECODER_PLUGIN_NAME)) { @@ -90,10 +89,10 @@ class DecoderPlugin : public TensorRTPlugin { std::string name_space_; int compute_type_; mutable fastertransformer::decoderParamT params_; + cublasHandle_t cublas_handle_; cublasLtHandle_t cublaslt_handle_; int num_of_inputs_; int num_of_outputs_; - template int RunCudaDecoder(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream, diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc index 
a46848734ce..62216a386aa 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc @@ -30,6 +30,7 @@ #include "src/fastertransformer/utils/cuda_utils.h" #include "src/fastertransformer/utils/allocator.h" #include "src/fastertransformer/kernels/layernorm_kernels.h" +#include "src/extendrt/delegate/tensorrt/op/tensorrt_op.h" namespace mindspore::lite { namespace { @@ -121,7 +122,7 @@ int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.has_bias = !params.attn.position_bias; params.ffn_bias = !params.attn.position_bias; params.attn.mask = true; - params.act_type = encoder_op->get_act_type(); + params.act_type = (fastertransformer::ActType)(encoder_op->get_act_type()); params.attn.scale = encoder_op->get_scale(); auto compute_type = runtime_->GetRuntimePrecisionMode(); if (is_ffn_fp16_) { @@ -146,7 +147,7 @@ int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { } nvinfer1::ITensor *input_tensor = input(ctx, 0).trt_tensor_; auto plugin = - std::make_shared(input_tensor->getName(), compute_type, params, GetCublasLtHandle(), device_id_); + std::make_shared(input_tensor->getName(), compute_type, params, GetCublasHandle(), GetCublasLtHandle(), device_id_); const int input_number = inputs().size(); nvinfer1::ITensor *inputTensors[input_number]; for (int i = 0; i < input_number; i++) { @@ -195,7 +196,7 @@ int EncoderPlugin::RunCudaEncoder(const nvinfer1::PluginTensorDesc *inputDesc, } void *outputs_forward[] = {outputs[0]}; fastertransformer::forwardEncoder(inputs_forward, num_of_inputs_, outputs_forward, num_of_outputs_, ¶ms_, - workspace, GetCublasHandle(), stream); + workspace, cublas_handle_, stream); return RET_OK; } @@ -239,7 +240,7 @@ nvinfer1::DimsExprs EncoderPlugin::getOutputDimensions(int32_t index, const nvin int num_dims = inputs[0].nbDims; dims.nbDims = num_dims; for(int i = 0; i < num_dims; i++ ) { - dims.d[i] = exprBuilder.constant(inputs[i].d[i]->getConstantValue()); + dims.d[i] = exprBuilder.constant(inputs[index].d[i]->getConstantValue()); } } return dims; diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h index 4e593b2bf02..acecd1d38b6 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h @@ -46,11 +46,12 @@ class EncoderTensorRT : public TensorRTOp { constexpr auto ENCODER_PLUGIN_NAME{"EncoderPlugin"}; class EncoderPlugin : public TensorRTPlugin { public: - EncoderPlugin(const std::string name, int compute_type,
fastertransformer::encoderParamT params, cublasHandle_t cublas_handle, cublasLtHandle_t cublaslt_handle, uint32_t device_id) : TensorRTPlugin(name, std::string(ENCODER_PLUGIN_NAME), device_id), compute_type_(compute_type), params_(params), + cublas_handle_(cublas_handle), cublaslt_handle_(cublaslt_handle) {} EncoderPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) @@ -58,7 +59,6 @@ class EncoderPlugin : public TensorRTPlugin { const nvinfer1::PluginField *fields = fc->fields; compute_type_ = static_cast(fields[0].data)[0]; params_ = static_cast(fields[1].data)[0]; - cublaslt_handle_ = static_cast(fields[2].data)[0]; } EncoderPlugin(const char *name, const void *serialData, size_t serialLength) @@ -89,6 +89,7 @@ class EncoderPlugin : public TensorRTPlugin { std::string name_space_; int compute_type_; mutable fastertransformer::encoderParamT params_; + cublasHandle_t cublas_handle_; cublasLtHandle_t cublaslt_handle_; int num_of_inputs_; int num_of_outputs_; diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc index a0fa6627247..d5acb62809b 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc @@ -80,10 +80,10 @@ int MhaTensorRT::AddInnerOp(TensorRTContext *ctx) { params.projection_bias = !is_position_bias; params.is_cross = is_cross; params.position_bias = is_position_bias; - params.scalar = mha_op->get_scalar(); + params.scale = mha_op->get_scale(); params.mask = true; auto plugin = - std::make_shared(input_tensor->getName(), compute_type, params, GetCublasLtHandle(), device_id_); + std::make_shared(input_tensor->getName(), compute_type, params, GetCublasHandle(), GetCublasLtHandle(), device_id_); const int input_number = inputs().size(); nvinfer1::ITensor *inputTensors[input_number]; for (int i = 0; i < input_number; i++) { @@ -96,7 +96,6 @@ int MhaTensorRT::AddInnerOp(TensorRTContext *ctx) { } mha_layer->setName((op_name_ + "plugin_attention").c_str()); nvinfer1::ITensor *attn_tensor = mha_layer->getOutput(0); -#ifndef TEST_ ctx->RegisterTensor(ITensorHelper{attn_tensor, Format::NCHW, true}, out_tensors_[0].Name()); this->layer_ = mha_layer; return RET_OK; @@ -156,7 +155,7 @@ int MhaPlugin::RunCudaMha(const nvinfer1::PluginTensorDesc *inputDesc, const nvi } void *outputs_attn[] = {outputs[0]}; fastertransformer::forward_attn(reinterpret_cast(inputs_attn), num_of_inputs_, - reinterpret_cast(outputs_attn), num_of_outputs_, &params_, workspace, GetCublasHandle(), stream); + reinterpret_cast(outputs_attn), num_of_outputs_, &params_, workspace, cublas_handle_, stream); return RET_OK; } @@ -190,9 +189,9 @@ void MhaPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int size_t MhaPlugin::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const noexcept { if (compute_type_ == RuntimePrecisionMode_FP16) { - return fastertransformer::GetAttnWorkspaceSizeByOptAllocator(&params_); + return fastertransformer::GetAttnWorkspaceSize(&params_); } else { - return fastertransformer::GetAttnWorkspaceSizeByOptAllocator(&params_); + return fastertransformer::GetAttnWorkspaceSize(&params_); } } diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.h index 5cca33cc03a..ada87c8cb91 100644 ---
a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.h @@ -44,11 +44,12 @@ class MhaTensorRT : public TensorRTOp { constexpr auto MHA_PLUGIN_NAME{"AttentionPlugin"}; class MhaPlugin : public TensorRTPlugin { public: - MhaPlugin(const std::string name, int compute_type, fastertransformer::attentionParamT params, + MhaPlugin(const std::string name, int compute_type, fastertransformer::attentionParamT params, cublasHandle_t cublas_handle, cublasLtHandle_t cublaslt_handle, uint32_t device_id) : TensorRTPlugin(name, std::string(MHA_PLUGIN_NAME), device_id), compute_type_(compute_type), params_(params), + cublas_handle_(cublas_handle), cublaslt_handle_(cublaslt_handle) {} MhaPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) @@ -93,6 +94,7 @@ class MhaPlugin : public TensorRTPlugin { std::string name_space_; int compute_type_; mutable fastertransformer::attentionParamT params_; + cublasHandle_t cublas_handle_; cublasLtHandle_t cublaslt_handle_; int num_of_inputs_; int num_of_outputs_; diff --git a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc index 3774eb080de..9c123bb2f7f 100644 --- a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc @@ -352,7 +352,7 @@ STATUS DecoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const Eq auto attn_input = GetAttribute(func_graph, equiv, is_attention_); MS_ASSERT(attn_input != nullptr); auto attn_prim = ops::GetOperator(attn_input); - if (attn_prim->GetAttr(ops::kEncoderLayerNumHeads) != nullptr) { + if (attn_prim->GetAttr(ops::kDecoderLayerNumHeads) != nullptr) { *head_num = attn_prim->get_head_num(); } if (attn_prim->GetAttr(ops::kAttentionSizePerHead) != nullptr) { diff --git a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc index 4ac343a3fc6..56121570ce2 100644 --- a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc @@ -157,9 +157,9 @@ VectorRef EncoderLayerFusion::DefinePatternEncoderLayer(bool post_layernorm = tr getTuple(post_layernorm, layernorm_fusion, is_position_bias), weight_attn_qkv_, weight_attn_o_, position_bias_}); } + // return attention; if (mask) inputs.push_back(mask_); auto attention = VectorRef(inputs); - // return attention; if (!is_position_bias) { auto is_tuple = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_itme"); auto var_tuple = std::make_shared("var_tuple"); @@ -245,8 +245,8 @@ std::unordered_map EncoderLayerFusion::DefinePatterns() patterns[kPatternTEncoderLayerPost] = DefinePatternEncoderLayer(true); patterns[kPatternTEncoderLayerPostNorm] = DefinePatternEncoderLayer(true, true); patterns[kPatternTEncoderLayerPreNorm] = DefinePatternEncoderLayer(false, true); - patterns[kPatternEncoderLayerT5Pre] = DefinePatternEncoderLayer(false, false, true); - patterns[kPatternEncoderLayerT5Post] = DefinePatternEncoderLayer(true, false, true); + patterns[kPatternEncoderLayerT5Pre] = DefinePatternEncoderLayer(false, false, true, true); + patterns[kPatternEncoderLayerT5Post] = DefinePatternEncoderLayer(true, false, true, true); patterns[kPatternTEncoderLayerWhitoutMaskPostNorm] = DefinePatternEncoderLayer(true, true, false, false); patterns[kPatternTEncoderLayerWhitoutMaskPreNorm] = 
DefinePatternEncoderLayer(false, true, false, false); return patterns; diff --git a/trc/transformer/cfg_bert.config b/trc/transformer/cfg_bert.config index 2f318d6c2a4..370f5e1aba9 100755 --- a/trc/transformer/cfg_bert.config +++ b/trc/transformer/cfg_bert.config @@ -1,3 +1,2 @@ [gpu_context] -input_shape=input_ids:[1,128];token_type_ids:[1,128];input_mask:[1,128] - +input_shape=input_ids:[transformer_encoder_layer_t5,128];token_type_ids:[transformer_encoder_layer_t5,128];input_mask:[transformer_encoder_layer_t5,128] diff --git a/trc/transformer/deploy.sh b/trc/transformer/deploy.sh index 108f13a7bb8..75c998d76bb 100755 --- a/trc/transformer/deploy.sh +++ b/trc/transformer/deploy.sh @@ -60,7 +60,7 @@ echo ${input_files} command="cd ${PWD} && " command+="LD_LIBRARY_PATH=${system}/runtime/lib:${system}/tools/converter/lib CUDA_VISIBLE_DEVICES=${gpu_id} " # command+=" NVIDIA_TF32_OVERRIDE=0 " -command+="${benchmark} --modelFile=$1 --numThreads=1 --warmUpLoopCount=10 --loopCount=1000 --modelType=MindIR " +command+="gdb --args ${benchmark} --modelFile=$1 --numThreads=1 --warmUpLoopCount=10 --loopCount=1000 --modelType=MindIR " if [ "${time}" == "" ] then command+="--inDataFile=\"${input_files}\"" diff --git a/trc/transformer/models.txt b/trc/transformer/models.txt index bd9849fa686..2f219466bb0 100755 --- a/trc/transformer/models.txt +++ b/trc/transformer/models.txt @@ -1,23 +1,23 @@ --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer_t5 --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer_t5 # #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer_t5 --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_encoder_layer_t5 --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_encoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer_t5 # #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5 #run the following tests before push --b 1 -l 66 -s 128 -H 12 -S 768 -p 0 -m mha_x1 --b 1 -l 66 -s 128 -t 256 -H 12 -S 768 -p 0 -m mha_cross --b 1 -l 66 -s 20 -t 20 -H 3 -S 15 -p 0 -m mha_cross --b 1 -l 66 -s 20 -H 4 -S 768 -p 0 -m mha_T5 --b 1 -l 66 -s 20 -t 40 -H 4 -S 768 -p 0 -m mha_T5_cross -# +#-b 1 -l 66 -s 128 -H 12 -S 768 -p 0 -m mha_x1 +#-b 1 -l 66 -s 128 -t 256 -H 12 -S 768 -p 0 -m mha_cross +#-b 1 -l 66 -s 20 -t 20 -H 3 -S 15 -p 0 -m mha_cross +#-b 1 -l 66 -s 20 -H 4 -S 768 -p 0 -m mha_T5 +#-b 1 -l 66 -s 20 -t 40 -H 4 -S 768 -p 0 -m mha_T5_cross + #-b 1 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer #-b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer #-b 8 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer @@ -44,13 +44,13 @@ #-b 16 -l 24 -H 16 -S 1024 -s 128 -P 0 -m bert #-b 32 -l 24 -H 16 -S 1024 -s 128 -P 1 -m bert --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer # --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m 
transformer_encoder_layer --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_encoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_encoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_encoder_layer #-b 1 -l 66 -s 1 -H 8 -S 512 -p 0 -m mha_x1 #-b 3 -l 66 -s 20 -H 3 -S 15 -p -m mha_x2 #-b 3 -l 66 -s 20 -t 40 -H 3 -S 15 -p 0 -m mha_x1 @@ -60,9 +60,17 @@ #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -m transformer_encoder_layer #-b 1 -l 12 -H 4 -S 512 -s 128 -f 3072 -P 1 -m transformer_encoder_layer -#-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer -#-b 4 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer -#-b 8 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer +#-b 4 -l 12 -H 12 -S 768 -s 128 -P 1 -x 1 -m transformer_encoder_layer +#-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -x 0 -m transformer_encoder_layer +#-b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -x 0 -m transformer_encoder_layer +-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -x 1 -m transformer_encoder_layer_t5 +#-b 4 -l 12 -H 12 -S 768 -s 128 -P 0 -x 0 -m transformer_encoder_layer_t5 +# +#-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -x 0 -m transformer_decoder_layer +#-b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -x 0 -m transformer_decoder_layer +#-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -x 1 -m transformer_decoder_layer_t5 +#-b 4 -l 12 -H 12 -S 768 -s 128 -P 0 -x 0 -m transformer_decoder_layer_t5 +#-b 8 -l 12 -H 12 -S 768 -s 128 -P 1 -x 1 -m transformer_encoder_layer_t5 #-b 1 -l 12 -H 4 -S 512 -s 128 -P 1 -m transformer_encoder_layer #-b 1 -l 12 -H 4 -S 512 -s 128 -P 1 -f 3072 -m transformer_encoder_layer #-b 4 -l 12 -H 4 -S 512 -s 128 -P 1 -m transformer_encoder_layer diff --git a/trc/transformer/train_transformer_export.py b/trc/transformer/train_transformer_export.py index 3879072231f..18a75f1a6e0 100755 --- a/trc/transformer/train_transformer_export.py +++ b/trc/transformer/train_transformer_export.py @@ -360,10 +360,10 @@ def transformer_encoder_layer_t5_create(): if (post_layernorm): print("post_layernorm") model = T5_TF.TransformerEncoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, seq_length=seq, - num_heads=head_num, post_layernorm_residual=True, has_bias=True, hidden_act='relu') + num_heads=head_num, post_layernorm_residual=True, has_bias=False, hidden_act='relu') else: model = T5_TF.TransformerEncoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, seq_length=seq, - num_heads=head_num, has_bias=True, hidden_act='relu') + num_heads=head_num, has_bias=False, hidden_act='relu') encoder_input_value = M.Tensor(np.random.normal(0., 0.5, (batch, seq, hid_size)), M.float32) encoder_input_mask = M.Tensor(np.random.normal(0., 0.5, (batch, seq, seq)), M.float32) pos = M.Tensor(np.random.normal(0., 0.5, (batch, head_num, seq, tgt_seq_len)), M.float32) -- Gitee From 07473d539fd2a78d9ab8c24c3abab817602533a9 Mon Sep 17 00:00:00 2001 From: shira zaloshinki Date: Sun, 22 Jan 2023 15:23:46 +0200 Subject: [PATCH 31/39] order the params --- .../delegate/tensorrt/op/decoder_tensorrt.cc | 88 ++++++++----------- .../delegate/tensorrt/op/decoder_tensorrt.h | 14 ++- .../delegate/tensorrt/op/encoder_tensorrt.cc | 65 +++++++------- 
.../delegate/tensorrt/op/encoder_tensorrt.h | 15 ++-- .../delegate/tensorrt/op/mha_tensorrt.cc | 60 +++++++------ .../delegate/tensorrt/op/mha_tensorrt.h | 11 ++- trc/transformer/cfg_bert.config | 3 +- trc/transformer/ftBench.py | 25 +++--- trc/transformer/models.txt | 38 ++++---- 9 files changed, 148 insertions(+), 171 deletions(-) diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc index 42ef50c5d25..f2224f835de 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc @@ -97,43 +97,38 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { MS_LOG(ERROR) << "op action convert failed"; return RET_ERROR; } - fastertransformer::decoderParamT params; + fastertransformer::decoderParamRun params; memset_s(¶ms, sizeof(params), 0, sizeof(params)); - params.head_num = decoder_op->get_head_num(); - params.head_size = decoder_op->get_head_size(); - params.hidden_size = params.head_num * params.head_size; - params.layernorm_post = decoder_op->get_post_layernorm(); - params.eps1 = decoder_op->get_eps_layernorm1(); - params.eps2 = decoder_op->get_eps_layernorm2(); - params.eps3 = decoder_op->get_eps_layernorm3(); - params.ffn_hidden_size = decoder_op->get_ffn_hidden_size(); - params.ffn_fp16 = is_ffn_fp16_; - params.attn1.head_num = params.head_num; - params.attn1.head_size = params.head_size; - params.attn1.hidden_size = params.hidden_size; - params.attn1.position_bias = decoder_op->get_position_bias1(); - params.attn1.qkv_bias = !params.attn1.position_bias; - params.attn1.projection_bias = !params.attn1.position_bias; - params.attn1.is_cross = false; - params.attn1.scale = decoder_op->get_scale1(); - params.attn1.mask = true; - params.attn2.head_num = params.head_num; - params.attn2.head_size = params.head_size; - params.attn2.hidden_size = params.hidden_size; - params.attn2.position_bias = decoder_op->get_position_bias2(); - params.attn2.qkv_bias = !params.attn2.position_bias; - params.attn2.projection_bias = !params.attn2.position_bias; - params.attn2.is_cross = true; - params.attn2.scale = decoder_op->get_scale2(); - params.attn2.mask = true; - params.act_type = decoder_op->get_act_type(); - params.has_beta = !params.attn1.position_bias; - params.has_bias = !params.attn1.position_bias; - params.ffn_bias = !params.attn1.position_bias; + cublasLtHandle_t cublaslt_handle = GetCublasLtHandle(); + params.cublas_handle =&(cublaslt_handle); + params.common_param->head_num = decoder_op->get_head_num(); + params.common_param->head_size = decoder_op->get_head_size(); + params.common_param->hidden_size = params.common_param->head_num * params.common_param->head_size; + params.decoder->layernorm_post = decoder_op->get_post_layernorm(); + params.decoder->eps1 = decoder_op->get_eps_layernorm1(); + params.decoder->eps2 = decoder_op->get_eps_layernorm2(); + params.decoder->eps3 = decoder_op->get_eps_layernorm3(); + params.ffn_param->ffn_hidden_size = decoder_op->get_ffn_hidden_size(); + params.ffn_param->ffn_fp16 = is_ffn_fp16_; + params.attn1->position_bias = decoder_op->get_position_bias1(); + params.attn1->qkv_bias = !params.attn1->position_bias; + params.attn1->projection_bias = !params.attn1->position_bias; + params.attn1->is_cross = false; + params.attn1->scale = decoder_op->get_scale1(); + params.attn1->mask = true; + params.attn2->position_bias = decoder_op->get_position_bias2(); + params.attn2->qkv_bias 
= !params.attn2->position_bias; + params.attn2->projection_bias = !params.attn2->position_bias; + params.attn2->is_cross = true; + params.attn2->scale = decoder_op->get_scale2(); + params.attn2->mask = true; + params.ffn_param->act_type = decoder_op->get_act_type(); + params.decoder->has_beta = !params.attn1->position_bias; + params.ffn_param->ffn_bias = !params.attn1->position_bias; auto compute_type = runtime_->GetRuntimePrecisionMode(); if (is_ffn_fp16_) { - size_t start_fp16 = (params.attn1.position_bias) ? C13NUM : C18NUM; - size_t end_fp16 = (params.attn1.position_bias) ? C16NUM : C22NUM; + size_t start_fp16 = (params.attn1->position_bias) ? C13NUM : C18NUM; + size_t end_fp16 = (params.attn1->position_bias) ? C16NUM : C22NUM; for (size_t i = 0; i < in_tensors_.size(); i++) { auto in_tensor = input(ctx, i); if (in_tensors_[i].IsConst() || in_tensor.trt_tensor_ == nullptr) { @@ -149,7 +144,7 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { } nvinfer1::ITensor *input_tensor = input(ctx, 0).trt_tensor_; auto plugin = - std::make_shared(input_tensor->getName(), compute_type, params, GetCublasLtHandle(), device_id_); + std::make_shared(input_tensor->getName(), compute_type, params, device_id_); const int input_number = inputs().size(); nvinfer1::ITensor *inputTensors[input_number]; for (int i = 0; i < input_number; i++) { @@ -189,16 +184,15 @@ template int DecoderPlugin::RunCudaDecoder(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream, cublasGemmAlgo_t algoId) { - params_.algo = algoId; - params_.attn1.algo = algoId; - params_.attn2.algo = algoId; + params_.common_param->algo = algoId; + params_.stream = &stream; void *inputs_forward[num_of_inputs_]; for (int i = 0; i < num_of_inputs_; i++) { inputs_forward[i] = const_cast(inputs[i]); } void *outputs_forward[] = {outputs[0]}; fastertransformer::forwardDecoder(inputs_forward, num_of_inputs_, outputs_forward, num_of_outputs_, ¶ms_, - workspace, GetCublasHandle(), stream); + workspace); return RET_OK; } @@ -217,15 +211,9 @@ void DecoderPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, const int request_batch_size = static_cast(in[0].desc.dims.d[0]); const int request_src_seq_len = static_cast(in[0].desc.dims.d[1]); const int request_tgt_seq_len = request_src_seq_len; - params_.batch_size = request_batch_size; - params_.src_seq_len = request_src_seq_len; - params_.tgt_seq_len = request_tgt_seq_len; - params_.attn1.batch_size = request_batch_size; - params_.attn1.src_seq_len = request_src_seq_len; - params_.attn1.tgt_seq_len = request_tgt_seq_len; - params_.attn2.batch_size = request_batch_size; - params_.attn2.src_seq_len = request_src_seq_len; - params_.attn2.tgt_seq_len = request_tgt_seq_len; + params_.common_param->batch_size = request_batch_size; + params_.common_param->src_seq_len = request_src_seq_len; + params_.common_param->tgt_seq_len = request_tgt_seq_len; num_of_inputs_ = nbInputs; num_of_outputs_ = nbOutputs; } @@ -262,12 +250,12 @@ nvinfer1::IPluginV2DynamicExt *DecoderPlugin::clone() const noexcept { } size_t DecoderPlugin::getSerializationSize() const noexcept { - return sizeof(int) + sizeof(fastertransformer::decoderParamT); + return sizeof(int) + sizeof(fastertransformer::decoderParamRun); } void DecoderPlugin::serialize(void *buffer) const noexcept { SerializeValue(&buffer, &compute_type_, sizeof(int)); - SerializeValue(&buffer, ¶ms_, 
sizeof(fastertransformer::decoderParamT)); + SerializeValue(&buffer, ¶ms_, sizeof(fastertransformer::decoderParamRun)); } REGISTER_TENSORRT_CREATOR(ops::kNameDecoderLayer, DecoderTensorRT) } // namespace mindspore::lite diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h index 2bf6cc645fd..3b5df5e14af 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h @@ -48,24 +48,23 @@ class DecoderTensorRT : public TensorRTOp { constexpr auto DECODER_PLUGIN_NAME{"DecoderPlugin"}; class DecoderPlugin : public TensorRTPlugin { public: - DecoderPlugin(const std::string name, int compute_type, fastertransformer::decoderParamT params, - cublasLtHandle_t cublaslt_handle, uint32_t device_id) + DecoderPlugin(const std::string name, int compute_type, fastertransformer::decoderParamRun params, + uint32_t device_id) : TensorRTPlugin(name, std::string(DECODER_PLUGIN_NAME), device_id), compute_type_(compute_type), - params_(params), - cublaslt_handle_(cublaslt_handle) {} + params_(params){} DecoderPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) : TensorRTPlugin(std::string(name), std::string(DECODER_PLUGIN_NAME)) { const nvinfer1::PluginField *fields = fc->fields; compute_type_ = static_cast(fields[0].data)[0]; - params_ = static_cast(fields[1].data)[0]; + params_ = static_cast(fields[1].data)[0]; } DecoderPlugin(const char *name, const void *serialData, size_t serialLength) : TensorRTPlugin(std::string(name), std::string(DECODER_PLUGIN_NAME)) { DeserializeValue(&serialData, &serialLength, &compute_type_, sizeof(int)); - DeserializeValue(&serialData, &serialLength, ¶ms_, sizeof(fastertransformer::decoderParamT)); + DeserializeValue(&serialData, &serialLength, ¶ms_, sizeof(fastertransformer::decoderParamRun)); } DecoderPlugin() = delete; @@ -89,8 +88,7 @@ class DecoderPlugin : public TensorRTPlugin { private: std::string name_space_; int compute_type_; - mutable fastertransformer::decoderParamT params_; - cublasLtHandle_t cublaslt_handle_; + mutable fastertransformer::decoderParamRun params_; int num_of_inputs_; int num_of_outputs_; diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc index a46848734ce..edab2a47200 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc @@ -100,34 +100,32 @@ int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { MS_LOG(ERROR) << "op action convert failed"; return RET_ERROR; } - fastertransformer::encoderParamT params; + fastertransformer::encoderParamRun params; memset_s(¶ms, sizeof(params), 0, sizeof(params)); - params.head_num = encoder_op->get_head_num(); - params.head_size = encoder_op->get_head_size(); - params.layernorm_post = encoder_op->get_post_layernorm(); - params.eps1 = encoder_op->get_eps_layernorm1(); - params.eps2 = encoder_op->get_eps_layernorm2(); - params.ffn_hidden_size = encoder_op->get_ffn_hidden_size(); - params.ffn_fp16 = is_ffn_fp16_; - params.hidden_size = params.head_num * params.head_size; - params.attn.head_num = encoder_op->get_head_num(); - params.attn.head_size = encoder_op->get_head_size(); - params.attn.hidden_size = params.head_num * params.head_size; - params.attn.is_cross = false; - params.attn.position_bias = 
encoder_op->get_position_bias(); - params.attn.projection_bias = !params.attn.position_bias; - params.attn.qkv_bias = !params.attn.position_bias; - params.has_beta = !params.attn.position_bias; - params.has_bias = !params.attn.position_bias; - params.ffn_bias = !params.attn.position_bias; - params.attn.mask = true; + cublasLtHandle_t cublaslt_handle_ = GetCublasLtHandle(); + params.cublas_handle =&(cublaslt_handle_); + params.common_param->head_num = encoder_op->get_head_num(); + params.common_param->head_size = encoder_op->get_head_size(); + params.encoder->layernorm_post = encoder_op->get_post_layernorm(); + params.encoder->eps1 = encoder_op->get_eps_layernorm1(); + params.encoder->eps2 = encoder_op->get_eps_layernorm2(); + params.ffn_param->ffn_hidden_size = encoder_op->get_ffn_hidden_size(); + params.ffn_param->ffn_fp16 = is_ffn_fp16_; + params.common_param->hidden_size = params.common_param->head_num * params.common_param->head_size; + params.attn->is_cross = false; + params.attn->position_bias = encoder_op->get_position_bias(); + params.attn->projection_bias = !params.attn->position_bias; + params.attn->qkv_bias = !params.attn->position_bias; + params.encoder->has_beta = !params.attn->position_bias; + params.ffn_param->ffn_bias = !params.attn->position_bias; + params.attn->mask = true; params.act_type = encoder_op->get_act_type(); params.attn.scale = encoder_op->get_scale(); auto compute_type = runtime_->GetRuntimePrecisionMode(); if (is_ffn_fp16_) { - size_t start_fp16 = (params.layernorm_post) ? C7NUM : C9NUM; - size_t end_fp16 = (params.layernorm_post) ? C11NUM : C13NUM; - if (params.attn.position_bias) { + size_t start_fp16 = (params.encoder->layernorm_post) ? C7NUM : C9NUM; + size_t end_fp16 = (params.encoder->layernorm_post) ? C11NUM : C13NUM; + if (params.attn->position_bias) { start_fp16 = C6NUM; end_fp16 = C9NUM; } @@ -146,7 +144,7 @@ int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { } nvinfer1::ITensor *input_tensor = input(ctx, 0).trt_tensor_; auto plugin = - std::make_shared(input_tensor->getName(), compute_type, params, GetCublasLtHandle(), device_id_); + std::make_shared(input_tensor->getName(), compute_type, params, device_id_); const int input_number = inputs().size(); nvinfer1::ITensor *inputTensors[input_number]; for (int i = 0; i < input_number; i++) { @@ -187,15 +185,15 @@ template int EncoderPlugin::RunCudaEncoder(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream, cublasGemmAlgo_t algoId) { - params_.algo = algoId; - params_.attn.algo = algoId; + params_.common_param->algo = algoId; + params_.stream = stream; void *inputs_forward[num_of_inputs_]; for (int i = 0; i < num_of_inputs_; i++) { inputs_forward[i] = const_cast(inputs[i]); } void *outputs_forward[] = {outputs[0]}; fastertransformer::forwardEncoder(inputs_forward, num_of_inputs_, outputs_forward, num_of_outputs_, ¶ms_, - workspace, GetCublasHandle(), stream); + workspace); return RET_OK; } @@ -214,12 +212,9 @@ void EncoderPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, const int request_batch_size = static_cast(in[0].desc.dims.d[0]); const int request_src_seq_len = static_cast(in[0].desc.dims.d[1]); const int request_tgt_seq_len = request_src_seq_len; - params_.batch_size = request_batch_size; - params_.src_seq_len = request_src_seq_len; - params_.tgt_seq_len = request_tgt_seq_len; - params_.attn.batch_size = request_batch_size; - params_.attn.src_seq_len 
= request_src_seq_len; - params_.attn.tgt_seq_len = request_tgt_seq_len; + params_.common_param->batch_size = request_batch_size; + params_.common_param->src_seq_len = request_src_seq_len; + params_.common_param->tgt_seq_len = request_tgt_seq_len; num_of_inputs_ = nbInputs; num_of_outputs_ = nbOutputs; } @@ -256,12 +251,12 @@ nvinfer1::IPluginV2DynamicExt *EncoderPlugin::clone() const noexcept { } size_t EncoderPlugin::getSerializationSize() const noexcept { - return sizeof(int) + sizeof(fastertransformer::encoderParamT); + return sizeof(int) + sizeof(fastertransformer::encoderParamRun); } void EncoderPlugin::serialize(void *buffer) const noexcept { SerializeValue(&buffer, &compute_type_, sizeof(int)); - SerializeValue(&buffer, ¶ms_, sizeof(fastertransformer::encoderParamT)); + SerializeValue(&buffer, ¶ms_, sizeof(fastertransformer::encoderParamRun)); } REGISTER_TENSORRT_CREATOR(ops::kNameEncoderLayer, EncoderTensorRT) } // namespace mindspore::lite diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h index 4e593b2bf02..9b9b58f285e 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h @@ -46,25 +46,23 @@ class EncoderTensorRT : public TensorRTOp { constexpr auto ENCODER_PLUGIN_NAME{"EncoderPlugin"}; class EncoderPlugin : public TensorRTPlugin { public: - EncoderPlugin(const std::string name, int compute_type, fastertransformer::encoderParamT params, - cublasLtHandle_t cublaslt_handle, uint32_t device_id) + EncoderPlugin(const std::string name, int compute_type, fastertransformer::encoderParamRun params, + uint32_t device_id) : TensorRTPlugin(name, std::string(ENCODER_PLUGIN_NAME), device_id), compute_type_(compute_type), - params_(params), - cublaslt_handle_(cublaslt_handle) {} + params_(params) {} EncoderPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) : TensorRTPlugin(std::string(name), std::string(ENCODER_PLUGIN_NAME)) { const nvinfer1::PluginField *fields = fc->fields; compute_type_ = static_cast(fields[0].data)[0]; - params_ = static_cast(fields[1].data)[0]; - cublaslt_handle_ = static_cast(fields[2].data)[0]; + params_ = static_cast(fields[1].data)[0]; } EncoderPlugin(const char *name, const void *serialData, size_t serialLength) : TensorRTPlugin(std::string(name), std::string(ENCODER_PLUGIN_NAME)) { DeserializeValue(&serialData, &serialLength, &compute_type_, sizeof(int)); - DeserializeValue(&serialData, &serialLength, ¶ms_, sizeof(fastertransformer::encoderParamT)); + DeserializeValue(&serialData, &serialLength, ¶ms_, sizeof(fastertransformer::encoderParamRun)); } EncoderPlugin() = delete; @@ -88,8 +86,7 @@ class EncoderPlugin : public TensorRTPlugin { private: std::string name_space_; int compute_type_; - mutable fastertransformer::encoderParamT params_; - cublasLtHandle_t cublaslt_handle_; + mutable fastertransformer::encoderParamRun params_; int num_of_inputs_; int num_of_outputs_; diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc index a0fa6627247..2eefa9f4f64 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc @@ -71,19 +71,21 @@ int MhaTensorRT::AddInnerOp(TensorRTContext *ctx) { bool is_cross = mha_op->get_cross(); bool is_position_bias = mha_op->get_position_bias(); 
nvinfer1::ITensor *input_tensor = input(ctx, 0).trt_tensor_; - fastertransformer::attentionParamT params; + fastertransformer::attentionParamRun params; memset_s(&params, sizeof(params), 0, sizeof(params)); - params.head_num = head_number; - params.head_size = head_size; - params.hidden_size = head_number * head_size; - params.qkv_bias = !is_position_bias; - params.projection_bias = !is_position_bias; - params.is_cross = is_cross; - params.position_bias = is_position_bias; - params.scalar = mha_op->get_scalar(); - params.mask = true; + cublasLtHandle_t cublaslt_handle_ = GetCublasLtHandle(); + params.cublas_handle =&(cublaslt_handle_); + params.common_param->head_num = head_number; + params.common_param->head_size = head_size; + params.common_param->hidden_size = head_number * head_size; + params.attn->qkv_bias = !is_position_bias; + params.attn->projection_bias = !is_position_bias; + params.attn->is_cross = is_cross; + params.attn->position_bias = is_position_bias; + params.attn->scalar = mha_op->get_scalar(); + params.attn->mask = true; auto plugin = - std::make_shared(input_tensor->getName(), compute_type, params, GetCublasLtHandle(), device_id_); + std::make_shared(input_tensor->getName(), compute_type, params, device_id_); const int input_number = inputs().size(); nvinfer1::ITensor *inputTensors[input_number]; for (int i = 0; i < input_number; i++) { @@ -123,40 +125,40 @@ template int MhaPlugin::RunCudaMha(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream, cublasGemmAlgo_t algoId) { - int cross_tensor_offset = (params_.is_cross) ? 1 : 0; + int cross_tensor_offset = (params_.attn->is_cross) ? 1 : 0; const int weight_projection_tensor_idx = 4 + cross_tensor_offset; const int bias_projection_tensor_idx = 6 + cross_tensor_offset; const int attn_mask_tensor_idx = 7 + cross_tensor_offset; const int bias_qkv_tensor_idx = 5 + cross_tensor_offset; const int weight_qkv_tensor_idx = 3; const int position_bias_tensor_idx = 6 + cross_tensor_offset; - params_.algo = algoId; + params_.common_param->algo = algoId; void *inputs_attn[num_of_inputs_]; int index = 0; inputs_attn[index++] = const_cast(inputs[0]); - if (params_.is_cross) { + if (params_.attn->is_cross) { inputs_attn[index++] = const_cast(inputs[1]); inputs_attn[index++] = const_cast(inputs[weight_qkv_tensor_idx]); inputs_attn[index++] = const_cast(inputs[weight_qkv_tensor_idx + 1]); } else { inputs_attn[index++] = const_cast(inputs[weight_qkv_tensor_idx]); } - if (params_.qkv_bias) { + if (params_.attn->qkv_bias) { inputs_attn[index++] = const_cast(inputs[bias_qkv_tensor_idx]); } - if (params_.position_bias) { + if (params_.attn->position_bias) { inputs_attn[index++] = const_cast(inputs[position_bias_tensor_idx]); inputs_attn[index++] = const_cast(inputs[attn_mask_tensor_idx - C2NUM]); } else { inputs_attn[index++] = const_cast(inputs[attn_mask_tensor_idx]); } inputs_attn[index++] = const_cast(inputs[weight_projection_tensor_idx]); - if (params_.projection_bias) { + if (params_.attn->projection_bias) { inputs_attn[index++] = const_cast(inputs[bias_projection_tensor_idx]); } void *outputs_attn[] = {outputs[0]}; fastertransformer::forward_attn(reinterpret_cast(inputs_attn), num_of_inputs_, - reinterpret_cast(outputs_attn), num_of_outputs_, &params_, workspace, GetCublasHandle(), stream); + reinterpret_cast(outputs_attn), num_of_outputs_, &params_, workspace); return RET_OK; } @@ -174,15 +176,15 @@ void 
MhaPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int const nvinfer1::DynamicPluginTensorDesc *out, int nbOutputs) noexcept { int cross_tensor_offset = 0; int position_bias_tensor_offsets = 0; - if (params_.is_cross) cross_tensor_offset = 1; - if (params_.position_bias) position_bias_tensor_offsets = 1; + if (params_.attn->is_cross) cross_tensor_offset = 1; + if (params_.attn->position_bias) position_bias_tensor_offsets = 1; const int attn_mask_tensor_idx = 7 + cross_tensor_offset - position_bias_tensor_offsets; const int request_batch_size = static_cast(in[attn_mask_tensor_idx].desc.dims.d[0]); const int request_src_seq_len = static_cast(in[attn_mask_tensor_idx].desc.dims.d[1]); const int request_tgt_seq_len = static_cast(in[attn_mask_tensor_idx].desc.dims.d[2]); - params_.batch_size = request_batch_size; - params_.src_seq_len = request_src_seq_len; - params_.tgt_seq_len = request_tgt_seq_len; + params_.common_param->batch_size = request_batch_size; + params_.common_param->src_seq_len = request_src_seq_len; + params_.common_param->tgt_seq_len = request_tgt_seq_len; num_of_inputs_ = nbInputs; num_of_outputs_ = nbOutputs; } @@ -205,20 +207,20 @@ nvinfer1::DimsExprs MhaPlugin::getOutputDimensions(int32_t index, const nvinfer1 if (num_dims == INPUT_SIZE2) { dims.d[0] = exprBuilder.constant(inputs[nbInputDims - 1].d[0]->getConstantValue() * inputs[nbInputDims - 1].d[1]->getConstantValue()); - auto hidden_size = exprBuilder.constant(params_.head_size * params_.head_num); + auto hidden_size = exprBuilder.constant(params_.common_param->head_size * params_.common_param->head_num); dims.d[1] = hidden_size; } else if (num_dims == INPUT_SIZE3) { dims.d[0] = inputs[nbInputDims - 1].d[0]; // batch dims.d[1] = inputs[nbInputDims - 1].d[(inputs[nbInputDims - 1].nbDims) - 1]; - auto hidden_size = exprBuilder.constant(params_.head_size * params_.head_num); + auto hidden_size = exprBuilder.constant(params_.common_param->head_size * params_.common_param->head_num); dims.d[kTwo] = hidden_size; } } else { dims.nbDims = INPUT_SIZE4; dims.d[0] = inputs[nbInputDims - 1].d[0]; // batch - dims.d[1] = exprBuilder.constant(params_.head_num); + dims.d[1] = exprBuilder.constant(params_.common_param->head_num); dims.d[kTwo] = inputs[nbInputDims - 1].d[(inputs[nbInputDims - 1].nbDims) - 1]; - dims.d[kThree] = exprBuilder.constant(params_.head_size); + dims.d[kThree] = exprBuilder.constant(params_.common_param->head_size); } return dims; } @@ -238,12 +240,12 @@ int MhaPlugin::initialize() noexcept { return 0; } void MhaPlugin::terminate() noexcept {} size_t MhaPlugin::getSerializationSize() const noexcept { - return sizeof(int) + sizeof(fastertransformer::attentionParamT); + return sizeof(int) + sizeof(fastertransformer::attentionParamRun); } void MhaPlugin::serialize(void *buffer) const noexcept { SerializeValue(&buffer, &compute_type_, sizeof(int)); - SerializeValue(&buffer, ¶ms_, sizeof(fastertransformer::attentionParamT)); + SerializeValue(&buffer, ¶ms_, sizeof(fastertransformer::attentionParamRun)); } REGISTER_TENSORRT_CREATOR(ops::kNameAttention, MhaTensorRT) } // namespace mindspore::lite diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.h index 5cca33cc03a..b5f3616541f 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.h @@ -44,24 +44,23 @@ class MhaTensorRT : public TensorRTOp { constexpr auto 
MHA_PLUGIN_NAME{"AttentionPlugin"}; class MhaPlugin : public TensorRTPlugin { public: - MhaPlugin(const std::string name, int compute_type, fastertransformer::attentionParamT params, + MhaPlugin(const std::string name, int compute_type, fastertransformer::attentionParamRun params, cublasLtHandle_t cublaslt_handle, uint32_t device_id) : TensorRTPlugin(name, std::string(MHA_PLUGIN_NAME), device_id), compute_type_(compute_type), - params_(params), - cublaslt_handle_(cublaslt_handle) {} + params_(params) {} MhaPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) : TensorRTPlugin(std::string(name), std::string(MHA_PLUGIN_NAME)) { const nvinfer1::PluginField *fields = fc->fields; compute_type_ = static_cast(fields[0].data)[0]; - params_ = static_cast(fields[1].data)[0]; + params_ = static_cast(fields[1].data)[0]; } MhaPlugin(const char *name, const void *serialData, size_t serialLength) : TensorRTPlugin(std::string(name), std::string(MHA_PLUGIN_NAME)) { DeserializeValue(&serialData, &serialLength, &compute_type_, sizeof(int)); - DeserializeValue(&serialData, &serialLength, ¶ms_, sizeof(fastertransformer::attentionParamT)); + DeserializeValue(&serialData, &serialLength, ¶ms_, sizeof(fastertransformer::attentionParamRun)); } MhaPlugin() = delete; @@ -92,7 +91,7 @@ class MhaPlugin : public TensorRTPlugin { const std::string layer_name_; std::string name_space_; int compute_type_; - mutable fastertransformer::attentionParamT params_; + mutable fastertransformer::attentionParamRun params_; cublasLtHandle_t cublaslt_handle_; int num_of_inputs_; int num_of_outputs_; diff --git a/trc/transformer/cfg_bert.config b/trc/transformer/cfg_bert.config index 2f318d6c2a4..370f5e1aba9 100755 --- a/trc/transformer/cfg_bert.config +++ b/trc/transformer/cfg_bert.config @@ -1,3 +1,2 @@ [gpu_context] -input_shape=input_ids:[1,128];token_type_ids:[1,128];input_mask:[1,128] - +input_shape=input_ids:[transformer_encoder_layer_t5,128];token_type_ids:[transformer_encoder_layer_t5,128];input_mask:[transformer_encoder_layer_t5,128] diff --git a/trc/transformer/ftBench.py b/trc/transformer/ftBench.py index 8a6dd8c6f7f..2024fb6a44b 100755 --- a/trc/transformer/ftBench.py +++ b/trc/transformer/ftBench.py @@ -4,6 +4,7 @@ from symbol import parameters import git import subprocess import sys +from find_output_name import find_output_name repo = git.Repo('.', search_parent_directories=True) base = repo.working_tree_dir f = open('../../version.txt', 'r') @@ -54,18 +55,18 @@ print('loop count=',loop_count) inputs_file = open("models.txt") models_arg = inputs_file.readlines() # import subprocess -def find_output_name(ms_model, output_file): - output_name = os.popen(f"../readers/flatbuf/readfb {ms_model} -O").read() - print(output_name) - output_name = output_name[:-1] - print(output_name) - with open(output_file, 'r') as file: - data = file.read() - for i,out in enumerate(output_name.split()): - print(out) - data = data.replace('output'+str(i+1), out) - with open(output_file, 'w') as file: - file.write(data) +# def find_output_name(ms_model, output_file): +# output_name = os.popen(f"../readers/flatbuf/readfb {ms_model} -O").read() +# print(output_name) +# output_name = output_name[:-1] +# print(output_name) +# with open(output_file, 'r') as file: +# data = file.read() +# for i,out in enumerate(output_name.split()): +# print(out) +# data = data.replace('output'+str(i+1), out) +# with open(output_file, 'w') as file: +# file.write(data) for line_model_arg in models_arg: if line_model_arg[0] == '#' or line_model_arg == 
'\n': continue line_model_arg=line_model_arg[:-1] diff --git a/trc/transformer/models.txt b/trc/transformer/models.txt index 6245a3fe07e..b8ca90851b1 100755 --- a/trc/transformer/models.txt +++ b/trc/transformer/models.txt @@ -1,22 +1,21 @@ -b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer_t5 --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer_t5 -# +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer_t5 --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_encoder_layer_t5 --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_encoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer_t5 # #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5 #run the following tests before push --b 1 -l 66 -s 128 -H 12 -S 768 -p 0 -m mha_x1 --b 1 -l 66 -s 128 -t 256 -H 12 -S 768 -p 0 -m mha_cross --b 1 -l 66 -s 20 -t 20 -H 3 -S 15 -p 0 -m mha_cross --b 1 -l 66 -s 20 -H 4 -S 768 -p 0 -m mha_T5 --b 1 -l 66 -s 20 -t 40 -H 4 -S 768 -p 0 -m mha_T5_cross +#-b 1 -l 66 -s 128 -H 12 -S 768 -p 0 -m mha_x1 +#-b 1 -l 66 -s 128 -t 256 -H 12 -S 768 -p 0 -m mha_cross +#-b 1 -l 66 -s 20 -t 20 -H 3 -S 15 -p 0 -m mha_cross +#-b 1 -l 66 -s 20 -H 4 -S 768 -p 0 -m mha_T5 +#-b 1 -l 66 -s 20 -t 40 -H 4 -S 768 -p 0 -m mha_T5_cross # #-b 1 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer #-b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer @@ -44,13 +43,12 @@ #-b 16 -l 24 -H 16 -S 1024 -s 128 -P 0 -m bert #-b 32 -l 24 -H 16 -S 1024 -s 128 -P 1 -m bert --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer -# --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_encoder_layer --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_encoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_encoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_encoder_layer #-b 1 -l 66 -s 1 -H 8 -S 512 -p 0 -m mha_x1 #-b 3 -l 66 -s 20 -H 3 -S 15 -p -m mha_x2 #-b 3 -l 66 -s 20 -t 40 -H 3 -S 15 -p 0 -m mha_x1 @@ -131,7 +129,7 @@ #-b 1 -l 6 -s 512 -t 512 -H 12 -S 768 -f 3072 -m T5 # transformer tests --b 1 -l 6 -s 128 -t 128 -H 8 -S 512 -f 2048 -m transformer --b 1 -l 6 -s 512 -t 512 -H 8 -S 512 -f 2048 -m transformer --b 1 -l 6 -s 128 -t 128 -H 12 -S 768 -f 3072 -m transformer --b 1 -l 6 -s 512 -t 512 -H 12 -S 768 -f 3072 -m transformer \ No newline at end of file +#-b 1 -l 6 -s 128 -t 128 -H 8 -S 512 -f 2048 -m transformer +#-b 1 -l 6 -s 512 -t 512 -H 8 -S 512 -f 2048 -m transformer +#-b 1 -l 6 -s 128 -t 128 -H 12 -S 768 -f 3072 -m transformer +#-b 1 -l 6 -s 512 -t 512 -H 12 -S 768 -f 3072 -m transformer \ No newline at end of file -- 
Gitee From 90a2801da485c808b35a96c236b5ee46a97c08f6 Mon Sep 17 00:00:00 2001 From: batya kroizer Date: Sun, 22 Jan 2023 15:40:48 +0200 Subject: [PATCH 32/39] fix --- .../src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc | 2 +- trc/transformer/cfg_bert.config | 2 +- trc/transformer/convert_fp32.sh | 3 +-- trc/transformer/deploy.sh | 4 +++- trc/transformer/models.txt | 6 +++--- 5 files changed, 9 insertions(+), 8 deletions(-) diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc index b93d05bfcc4..3a0fcfefe08 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc @@ -246,7 +246,7 @@ nvinfer1::DimsExprs DecoderPlugin::getOutputDimensions(int32_t index, const nvin int num_dims = inputs[0].nbDims; dims.nbDims = num_dims; for(int i = 0; i < num_dims; i++ ) { - dims.d[i] = exprBuilder.constant(inputs[i].d[i]->getConstantValue()); + dims.d[i] = exprBuilder.constant(inputs[index].d[i]->getConstantValue()); } } return dims; diff --git a/trc/transformer/cfg_bert.config b/trc/transformer/cfg_bert.config index 370f5e1aba9..099dd20effc 100755 --- a/trc/transformer/cfg_bert.config +++ b/trc/transformer/cfg_bert.config @@ -1,2 +1,2 @@ [gpu_context] -input_shape=input_ids:[transformer_encoder_layer_t5,128];token_type_ids:[transformer_encoder_layer_t5,128];input_mask:[transformer_encoder_layer_t5,128] +input_shape=input_ids:[T5,128];token_type_ids:[T5,128];input_mask:[T5,128] diff --git a/trc/transformer/convert_fp32.sh b/trc/transformer/convert_fp32.sh index 254e491d2bd..95c5bdcf6c4 100755 --- a/trc/transformer/convert_fp32.sh +++ b/trc/transformer/convert_fp32.sh @@ -42,6 +42,5 @@ ${base}/trc/system_test/release/ubuntu_x86/mindspore-lite-${version}-linux-x64/t --modelFile=$1 \ --outputFile=${base}/trc/transformer/convv_${file_name} \ --configFile=${base}/trc/transformer/t.config \ - --encryption=false \ - ${optimize} + --encryption=false fi diff --git a/trc/transformer/deploy.sh b/trc/transformer/deploy.sh index 75c998d76bb..72519726fba 100755 --- a/trc/transformer/deploy.sh +++ b/trc/transformer/deploy.sh @@ -60,7 +60,9 @@ echo ${input_files} command="cd ${PWD} && " command+="LD_LIBRARY_PATH=${system}/runtime/lib:${system}/tools/converter/lib CUDA_VISIBLE_DEVICES=${gpu_id} " # command+=" NVIDIA_TF32_OVERRIDE=0 " -command+="gdb --args ${benchmark} --modelFile=$1 --numThreads=1 --warmUpLoopCount=10 --loopCount=1000 --modelType=MindIR " +# command+="gdb --args ${benchmark} --modelFile=$1 --numThreads=1 --warmUpLoopCount=10 --loopCount=1000 --modelType=MindIR " +command+="${benchmark} --modelFile=$1 --numThreads=1 --warmUpLoopCount=10 --loopCount=1000 --modelType=MindIR " + if [ "${time}" == "" ] then command+="--inDataFile=\"${input_files}\"" diff --git a/trc/transformer/models.txt b/trc/transformer/models.txt index 2f219466bb0..44780c49682 100755 --- a/trc/transformer/models.txt +++ b/trc/transformer/models.txt @@ -63,12 +63,12 @@ #-b 4 -l 12 -H 12 -S 768 -s 128 -P 1 -x 1 -m transformer_encoder_layer #-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -x 0 -m transformer_encoder_layer #-b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -x 0 -m transformer_encoder_layer --b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -x 1 -m transformer_encoder_layer_t5 +#-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -x 1 -m transformer_encoder_layer_t5 #-b 4 -l 12 -H 12 -S 768 -s 128 -P 0 -x 0 -m transformer_encoder_layer_t5 # #-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -x 0 -m 
transformer_decoder_layer #-b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -x 0 -m transformer_decoder_layer -#-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -x 1 -m transformer_decoder_layer_t5 +-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -x 1 -m transformer_decoder_layer_t5 #-b 4 -l 12 -H 12 -S 768 -s 128 -P 0 -x 0 -m transformer_decoder_layer_t5 #-b 8 -l 12 -H 12 -S 768 -s 128 -P 1 -x 1 -m transformer_encoder_layer_t5 #-b 1 -l 12 -H 4 -S 512 -s 128 -P 1 -m transformer_encoder_layer @@ -157,7 +157,7 @@ # T5 tests -#-b 1 -l 6 -s 512 -t 512 -H 8 -S 768 -f 2048 -p 0 -x 1 -m transformer_decoder_layer_t5 +#-b 1 -l 6 -s 128 -t 128 -H 8 -S 512 -f 2048 -m T5 #-b 1 -l 6 -s 512 -t 512 -H 8 -S 512 -f 2048 -m T5 # #-b 1 -l 6 -s 128 -t 128 -H 12 -S 768 -f 3072 -m T5 -- Gitee From 1050617b04c8801c27014fab6d2868825ff0e78e Mon Sep 17 00:00:00 2001 From: shira zaloshinki Date: Sun, 22 Jan 2023 15:41:37 +0200 Subject: [PATCH 33/39] merge the bertbs --- .../src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc index d820a059613..58771ed516c 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc @@ -197,11 +197,11 @@ int EncoderPlugin::RunCudaEncoder(const nvinfer1::PluginTensorDesc *inputDesc, } void *outputs_forward[] = {outputs[0]}; fastertransformer::forwardEncoder(inputs_forward, num_of_inputs_, outputs_forward, num_of_outputs_, ¶ms_, -<<<<<<< HEAD workspace); -======= - workspace, cublas_handle_, stream); return RET_OK; +} + +bool EncoderPlugin::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *tensorsDesc, int nbInputs, int nbOutputs) noexcept { auto type = (compute_type_ == RuntimePrecisionMode_FP16) ? 
nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT; for (int i = 0; i < pos; i++) { -- Gitee From 68dbcd6f0cce2f531813a1d2543abaf44a8fd706 Mon Sep 17 00:00:00 2001 From: shira zaloshinki Date: Sun, 22 Jan 2023 16:48:25 +0200 Subject: [PATCH 34/39] fix the params --- .../delegate/tensorrt/op/decoder_tensorrt.cc | 4 ++-- .../delegate/tensorrt/op/decoder_tensorrt.h | 10 ++++++++ .../delegate/tensorrt/op/encoder_tensorrt.cc | 10 ++++---- .../delegate/tensorrt/op/encoder_tensorrt.h | 23 +++++++++++++++++++ .../delegate/tensorrt/op/mha_tensorrt.cc | 11 ++++++--- .../delegate/tensorrt/op/mha_tensorrt.h | 2 +- trc/transformer/cfg_bert.config | 2 +- 7 files changed, 50 insertions(+), 12 deletions(-) diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc index 9c685cb8781..c19c39cd0b7 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc @@ -99,8 +99,8 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { } fastertransformer::decoderParamRun params; memset_s(¶ms, sizeof(params), 0, sizeof(params)); - cublasLtHandle_t cublaslt_handle = GetCublasLtHandle(); - params.cublas_handle =&(cublaslt_handle); + cublasHandle_t cublas_handle = GetCublasHandle(); + params.cublas_handle =&(cublas_handle); params.common_param->head_num = decoder_op->get_head_num(); params.common_param->head_size = decoder_op->get_head_size(); params.common_param->hidden_size = params.common_param->head_num * params.common_param->head_size; diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h index ffd103636ac..943a04aa475 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h @@ -90,3 +90,13 @@ class DecoderPlugin : public TensorRTPlugin { int num_of_inputs_; int num_of_outputs_; template + int RunCudaDecoder(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream, + cublasGemmAlgo_t algoId); +}; +class DecoderPluginCreater : public TensorRTPluginCreater { + public: + DecoderPluginCreater() : TensorRTPluginCreater(std::string(DECODER_PLUGIN_NAME)) {} +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_EXTENDRT_DELEGATE_TENSORRT_OP_DECODER_TENSORRT_H_ \ No newline at end of file diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc index 58771ed516c..896f854e3da 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc @@ -106,8 +106,8 @@ int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { } fastertransformer::encoderParamRun params; memset_s(¶ms, sizeof(params), 0, sizeof(params)); - cublasLtHandle_t cublaslt_handle_ = GetCublasLtHandle(); - params.cublas_handle =&(cublaslt_handle_); + cublasHandle_t cublas_handle = GetCublasHandle(); + params.cublas_handle =&(cublas_handle); params.common_param->head_num = encoder_op->get_head_num(); params.common_param->head_size = encoder_op->get_head_size(); params.encoder->layernorm_post = encoder_op->get_post_layernorm(); @@ -123,8 +123,8 @@ int 
EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.encoder->has_beta = !params.attn->position_bias; params.ffn_param->ffn_bias = !params.attn->position_bias; params.attn->mask = true; - params.act_type = (fastertransformer::ActType)(encoder_op->get_act_type()); - params.attn.scale = encoder_op->get_scale(); + params.ffn_param->act_type = (fastertransformer::ActType)(encoder_op->get_act_type()); + params.attn->scale = encoder_op->get_scale(); auto compute_type = runtime_->GetRuntimePrecisionMode(); if (is_ffn_fp16_) { size_t start_fp16 = (params.encoder->layernorm_post) ? C7NUM : C9NUM; @@ -190,7 +190,7 @@ int EncoderPlugin::RunCudaEncoder(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream, cublasGemmAlgo_t algoId) { params_.common_param->algo = algoId; - params_.stream = stream; + params_.stream = &stream; void *inputs_forward[num_of_inputs_]; for (int i = 0; i < num_of_inputs_; i++) { inputs_forward[i] = const_cast(inputs[i]); diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h index cebb0fd9c04..10fa4ed2b0f 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h @@ -62,6 +62,29 @@ class EncoderPlugin : public TensorRTPlugin { EncoderPlugin(const char *name, const void *serialData, size_t serialLength) : TensorRTPlugin(std::string(name), std::string(ENCODER_PLUGIN_NAME)) { DeserializeValue(&serialData, &serialLength, &compute_type_, sizeof(int)); + DeserializeValue(&serialData, &serialLength, ¶ms_, sizeof(fastertransformer::encoderParamRun)); + } + EncoderPlugin() = delete; + + ~EncoderPlugin() override {} + + nvinfer1::IPluginV2DynamicExt *clone() const noexcept override; + int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override; + size_t getSerializationSize() const noexcept override; + void serialize(void *buffer) const noexcept override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, + const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const noexcept override; + nvinfer1::DimsExprs getOutputDimensions(int index, const nvinfer1::DimsExprs *inputs, int nbInputDims, + nvinfer1::IExprBuilder &exprBuilder) noexcept override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc *out, int nbOutputs) noexcept override; + bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *tensorsDesc, int nbInputs, + int nbOutputs) noexcept override; + + private: + std::string name_space_; + int compute_type_; mutable fastertransformer::encoderParamRun params_; int num_of_inputs_; int num_of_outputs_; diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc index 0e7904fbe4b..9e444d821f8 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc @@ -73,8 +73,8 @@ int MhaTensorRT::AddInnerOp(TensorRTContext *ctx) { nvinfer1::ITensor *input_tensor = input(ctx, 0).trt_tensor_; fastertransformer::attentionParamRun params; 
memset_s(¶ms, sizeof(params), 0, sizeof(params)); - cublasLtHandle_t cublaslt_handle_ = GetCublasLtHandle(); - params.cublas_handle =&(cublaslt_handle_); + cublasHandle_t cublas_handle = GetCublasHandle(); + params.cublas_handle =&(cublas_handle); params.common_param->head_num = head_number; params.common_param->head_size = head_size; params.common_param->hidden_size = head_number * head_size; @@ -82,7 +82,7 @@ int MhaTensorRT::AddInnerOp(TensorRTContext *ctx) { params.attn->projection_bias = !is_position_bias; params.attn->is_cross = is_cross; params.attn->position_bias = is_position_bias; - params.attn->scalar = mha_op->get_scalar(); + params.attn->scale = mha_op->get_scale(); params.attn->mask = true; auto plugin = std::make_shared(input_tensor->getName(), compute_type, params, device_id_); @@ -132,6 +132,7 @@ int MhaPlugin::RunCudaMha(const nvinfer1::PluginTensorDesc *inputDesc, const nvi const int weight_qkv_tensor_idx = 3; const int position_bias_tensor_idx = 6 + cross_tensor_offset; params_.common_param->algo = algoId; + params_.stream = &stream; void *inputs_attn[num_of_inputs_]; int index = 0; inputs_attn[index++] = const_cast(inputs[0]); @@ -167,7 +168,11 @@ bool MhaPlugin::supportsFormatCombination(int pos, const nvinfer1::PluginTensorD for (int i = 0; i < pos; i++) { if (tensorsDesc[pos].type != tensorsDesc[i].type) return false; } + bool res = (tensorsDesc[pos].format == nvinfer1::TensorFormat::kLINEAR) && (tensorsDesc[pos].type == type); return res; +} + +void MhaPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc *out, int nbOutputs) noexcept { int cross_tensor_offset = 0; int position_bias_tensor_offsets = 0; diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.h index b5f3616541f..5acdbaf9901 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.h @@ -45,7 +45,7 @@ constexpr auto MHA_PLUGIN_NAME{"AttentionPlugin"}; class MhaPlugin : public TensorRTPlugin { public: MhaPlugin(const std::string name, int compute_type, fastertransformer::attentionParamRun params, - cublasLtHandle_t cublaslt_handle, uint32_t device_id) + uint32_t device_id) : TensorRTPlugin(name, std::string(MHA_PLUGIN_NAME), device_id), compute_type_(compute_type), params_(params) {} diff --git a/trc/transformer/cfg_bert.config b/trc/transformer/cfg_bert.config index 099dd20effc..383ce7641ff 100755 --- a/trc/transformer/cfg_bert.config +++ b/trc/transformer/cfg_bert.config @@ -1,2 +1,2 @@ [gpu_context] -input_shape=input_ids:[T5,128];token_type_ids:[T5,128];input_mask:[T5,128] +input_shape=input_ids:[transformer_decoder_layer_t5,128];token_type_ids:[transformer_decoder_layer_t5,128];input_mask:[transformer_decoder_layer_t5,128] -- Gitee From 06647aca355c8d10e82eb519dc64a84f23486cf8 Mon Sep 17 00:00:00 2001 From: batya kroizer Date: Tue, 24 Jan 2023 17:12:29 +0200 Subject: [PATCH 35/39] work --- mindspore/core/ops/decoder_layer.h | 4 +-- mindspore/core/ops/op_name.h | 4 +-- .../delegate/tensorrt/op/decoder_tensorrt.h | 2 +- .../delegate/tensorrt/op/encoder_tensorrt.cc | 3 -- .../delegate/tensorrt/op/encoder_tensorrt.h | 2 +- .../optimizer/fusion/encoder_layer_fusion.cc | 2 +- trc/transformer/cfg_bert.config | 2 +- trc/transformer/deploy.sh | 4 +++ trc/transformer/models.txt | 30 +++++++++---------- trc/transformer/t.config | 2 +- 10 files changed, 28 insertions(+), 
27 deletions(-) diff --git a/mindspore/core/ops/decoder_layer.h b/mindspore/core/ops/decoder_layer.h index 2be3a12074e..b196689eb2f 100644 --- a/mindspore/core/ops/decoder_layer.h +++ b/mindspore/core/ops/decoder_layer.h @@ -67,8 +67,8 @@ class MIND_API DecoderLayer : public BaseOperator { /// \param[in] ffn_hidden_size Define ffn hidden size. /// \param[in] position_bias1 Define position_bias1. /// \param[in] position_bias2 Define position_bias2. - /// \param[in] scale1 Define scalar1. - /// \param[in] scale2 Define scalar2. + /// \param[in] scale1 Define scale1. + /// \param[in] scale2 Define scale2. /// \param[in] act_type Define act_type. void Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, float eps_layernorm3, int64_t ffn_hidden_size, bool position_bias1, bool position_bias2, bool post_layernorm, float scale1 = 1.0f, diff --git a/mindspore/core/ops/op_name.h b/mindspore/core/ops/op_name.h index a76770f8ae9..968427089e6 100644 --- a/mindspore/core/ops/op_name.h +++ b/mindspore/core/ops/op_name.h @@ -395,8 +395,8 @@ constexpr auto kDecoderLayerEpsLayerNorm2 = "eps_layernorm2"; constexpr auto kDecoderLayerEpsLayerNorm3 = "eps_layernorm3"; constexpr auto kDecoderLayerPositionBias1 = "position_bias1"; constexpr auto kDecoderLayerPositionBias2 = "position_bias2"; -constexpr auto kDecoderLayerScale1 = "scalar"; -constexpr auto kDecoderLayerScale2 = "scalar"; +constexpr auto kDecoderLayerScale1 = "scale1"; +constexpr auto kDecoderLayerScale2 = "scale2"; constexpr auto kPositionBias = "position_bias"; constexpr auto KExclusive = "exclusive"; constexpr auto KReverse = "reverse"; diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h index 19b90242d81..3d5687948c3 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h @@ -40,7 +40,7 @@ class DecoderTensorRT : public TensorRTOp { private: nvinfer1::ITensor *castTensor(TensorRTContext *ctx, const TensorInfo &ms_tensor, const std::string &op_name); - bool is_ffn_fp16_ = true; + bool is_ffn_fp16_ = false; }; constexpr auto DECODER_PLUGIN_NAME{"DecoderPlugin"}; diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc index 62216a386aa..15b4f9af6e6 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc @@ -39,7 +39,6 @@ constexpr std::size_t kTwo = 2; int EncoderTensorRT::IsSupport(const BaseOperatorPtr &base_operator, const std::vector &in_tensors, const std::vector &out_tensors) { - std::cout << "Unsupported input tensor size, size is " << in_tensors.size(); if (in_tensors.size() != C14NUM && in_tensors.size() != C9NUM && in_tensors.size() != C13NUM) { MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); return RET_ERROR; @@ -93,8 +92,6 @@ nvinfer1::ITensor *EncoderTensorRT::castTensor(TensorRTContext *ctx, const Tenso } int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { - std::cout << "AddInnerOp\n"; - if (ctx == nullptr || ctx->network() == nullptr) { MS_LOG(ERROR) << "context or network is invalid"; return RET_ERROR; diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h index 
acecd1d38b6..bd6ea0eac80 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h @@ -40,7 +40,7 @@ class EncoderTensorRT : public TensorRTOp { private: nvinfer1::ITensor *castTensor(TensorRTContext *ctx, const TensorInfo &ms_tensor, const std::string &op_name); - bool is_ffn_fp16_ = true; + bool is_ffn_fp16_ = false; }; constexpr auto ENCODER_PLUGIN_NAME{"EncoderPlugin"}; diff --git a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc index 56121570ce2..80246bb57ad 100644 --- a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc @@ -396,7 +396,7 @@ std::shared_ptr EncoderLayerFusion::CreatePrim(const FuncGrap int head_size = 0; float eps1 = 1e-5; float eps2 = 1e-5; - float scale = true; + float scale = 1.0f; if (CheckPattern(func_graph, equiv, &head_num, &head_size, &eps1, &eps2, &scale)) { return nullptr; } diff --git a/trc/transformer/cfg_bert.config b/trc/transformer/cfg_bert.config index 099dd20effc..154cb54e7d0 100755 --- a/trc/transformer/cfg_bert.config +++ b/trc/transformer/cfg_bert.config @@ -1,2 +1,2 @@ [gpu_context] -input_shape=input_ids:[T5,128];token_type_ids:[T5,128];input_mask:[T5,128] +input_shape=input_ids:[1,128];token_type_ids:[1,128];input_mask:[1,128] diff --git a/trc/transformer/deploy.sh b/trc/transformer/deploy.sh index 72519726fba..44f928ff382 100755 --- a/trc/transformer/deploy.sh +++ b/trc/transformer/deploy.sh @@ -58,6 +58,10 @@ echo ${server}:${PWD} echo ${input_files} #execute command="cd ${PWD} && " +# cuda=/usr/local/cuda-11.7/lib64 + +# command+="LD_LIBRARY_PATH=${system}/runtim +# e/lib:${system}/tools/converter/lib:${cuda} CUDA_VISIBLE_DEVICES=${gpu_id} " command+="LD_LIBRARY_PATH=${system}/runtime/lib:${system}/tools/converter/lib CUDA_VISIBLE_DEVICES=${gpu_id} " # command+=" NVIDIA_TF32_OVERRIDE=0 " # command+="gdb --args ${benchmark} --modelFile=$1 --numThreads=1 --warmUpLoopCount=10 --loopCount=1000 --modelType=MindIR " diff --git a/trc/transformer/models.txt b/trc/transformer/models.txt index 44780c49682..074598f0b7a 100755 --- a/trc/transformer/models.txt +++ b/trc/transformer/models.txt @@ -1,8 +1,8 @@ -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer_t5 -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer_t5 +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer_t5 +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer_t5 # -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5 -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer_t5 +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5 +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_encoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer_t5 @@ -12,15 +12,15 @@ #run the following tests before push -#-b 1 -l 66 -s 128 -H 12 -S 768 -p 0 -m mha_x1 -#-b 1 -l 66 -s 128 -t 256 -H 12 -S 768 -p 0 -m mha_cross -#-b 1 -l 66 -s 20 -t 20 -H 3 -S 15 -p 0 -m mha_cross -#-b 1 -l 66 -s 20 -H 4 -S 768 -p 0 -m mha_T5 -#-b 1 -l 66 -s 20 -t 40 -H 4 -S 768 -p 0 -m mha_T5_cross +-b 1 -l 66 -s 128 -H 12 -S 768 -p 0 -m mha_x1 +-b 1 -l 66 -s 128 -t 256 -H 12 -S 768 -p 0 -m mha_cross +-b 1 -l 66 -s 20 -t 20 -H 3 
-S 15 -p 0 -m mha_cross +-b 1 -l 66 -s 20 -H 4 -S 768 -p 0 -m mha_T5 +-b 1 -l 66 -s 20 -t 40 -H 4 -S 768 -p 0 -m mha_T5_cross #-b 1 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer -#-b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer -#-b 8 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer +-b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer +-b 8 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer #-b 32 -l 12 -H 12 -S 768 -s 128 -P 0 -f 3072 -m bert #-b 1 -l 3 -H 12 -S 768 -s 128 -m T5 @@ -33,8 +33,8 @@ #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer_t5 -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer @@ -68,7 +68,7 @@ # #-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -x 0 -m transformer_decoder_layer #-b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -x 0 -m transformer_decoder_layer --b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -x 1 -m transformer_decoder_layer_t5 +#-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -x 1 -m transformer_decoder_layer_t5 #-b 4 -l 12 -H 12 -S 768 -s 128 -P 0 -x 0 -m transformer_decoder_layer_t5 #-b 8 -l 12 -H 12 -S 768 -s 128 -P 1 -x 1 -m transformer_encoder_layer_t5 #-b 1 -l 12 -H 4 -S 512 -s 128 -P 1 -m transformer_encoder_layer @@ -155,7 +155,7 @@ #-b 64 -l 24 -H 16 -S 1024 -s 128 -m bert #-b 64 -l 24 -H 16 -S 1024 -s 512 -m bert - +#-b 1 -l 1 -s 20 -H 2 -S 4 -m bert # T5 tests #-b 1 -l 6 -s 128 -t 128 -H 8 -S 512 -f 2048 -m T5 #-b 1 -l 6 -s 512 -t 512 -H 8 -S 512 -f 2048 -m T5 diff --git a/trc/transformer/t.config b/trc/transformer/t.config index 1ba94b08493..0fad133d432 100755 --- a/trc/transformer/t.config +++ b/trc/transformer/t.config @@ -2,4 +2,4 @@ #fusion_blacklists="MultiHeadAttentionFusion" #fusion_blacklists="EncoderLayerFusion","DecoderLayerFusion" #fusion_blacklists="DecoderLayerFusion" -#fusion_blacklists="EncoderLayerFusion" \ No newline at end of file +#fusion_blacklists="EncoderLayerFusion" -- Gitee From 50a399f47a2ebcae0c5f04aaae36da61b35ca1fd Mon Sep 17 00:00:00 2001 From: shira zaloshinki Date: Wed, 25 Jan 2023 09:15:50 +0200 Subject: [PATCH 36/39] change params syntax --- .../delegate/tensorrt/op/decoder_tensorrt.cc | 77 +++++++++++-------- .../delegate/tensorrt/op/decoder_tensorrt.h | 2 +- .../delegate/tensorrt/op/encoder_tensorrt.cc | 67 +++++++++------- .../delegate/tensorrt/op/encoder_tensorrt.h | 2 +- .../delegate/tensorrt/op/mha_tensorrt.cc | 61 ++++++++------- .../delegate/tensorrt/op/mha_tensorrt.h | 11 ++- trc/transformer/deploy.sh | 9 ++- trc/transformer/ftBench.py | 1 + trc/transformer/models.txt | 20 ++--- trc/transformer/train_transformer_export.py | 1 + 10 files changed, 143 insertions(+), 108 deletions(-) mode change 100644 => 100755 mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc mode change 100644 => 100755 mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc old mode 100644 new mode 100755 index c19c39cd0b7..344bc392abd --- 
a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc @@ -88,6 +88,7 @@ nvinfer1::ITensor *DecoderTensorRT::castTensor(TensorRTContext *ctx, const Tenso return tensor_ptr; } int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { + std::cout << "AddInnerOp" << std::endl; if (ctx == nullptr || ctx->network() == nullptr) { MS_LOG(ERROR) << "context or network is invalid"; return RET_ERROR; @@ -98,37 +99,40 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { return RET_ERROR; } fastertransformer::decoderParamRun params; - memset_s(¶ms, sizeof(params), 0, sizeof(params)); cublasHandle_t cublas_handle = GetCublasHandle(); - params.cublas_handle =&(cublas_handle); - params.common_param->head_num = decoder_op->get_head_num(); - params.common_param->head_size = decoder_op->get_head_size(); - params.common_param->hidden_size = params.common_param->head_num * params.common_param->head_size; - params.decoder->layernorm_post = decoder_op->get_post_layernorm(); - params.decoder->eps1 = decoder_op->get_eps_layernorm1(); - params.decoder->eps2 = decoder_op->get_eps_layernorm2(); - params.decoder->eps3 = decoder_op->get_eps_layernorm3(); - params.ffn_param->ffn_hidden_size = decoder_op->get_ffn_hidden_size(); - params.ffn_param->ffn_fp16 = is_ffn_fp16_; - params.attn1->position_bias = decoder_op->get_position_bias1(); - params.attn1->qkv_bias = !params.attn1->position_bias; - params.attn1->projection_bias = !params.attn1->position_bias; - params.attn1->is_cross = false; - params.attn1->scale = decoder_op->get_scale1(); - params.attn1->mask = true; - params.attn2->position_bias = decoder_op->get_position_bias2(); - params.attn2->qkv_bias = !params.attn2->position_bias; - params.attn2->projection_bias = !params.attn2->position_bias; - params.attn2->is_cross = true; - params.attn2->scale = decoder_op->get_scale2(); - params.attn2->mask = true; - params.ffn_param->act_type = (fastertransformer::ActType)(decoder_op->get_act_type()); - params.decoder->has_beta = !params.attn1->position_bias; - params.ffn_param->ffn_bias = !params.attn1->position_bias; + params.common_param.cublas_handle = cublas_handle; + params.common_param.head_num = decoder_op->get_head_num(); + params.common_param.head_size = decoder_op->get_head_size(); + params.common_param.hidden_size = params.common_param.head_num * params.common_param.head_size; + params.attn1.common_param = ¶ms.common_param; + params.attn2.common_param = ¶ms.common_param; + params.ffn_param.common_param = ¶ms.common_param; + + params.decoder.layernorm_post = decoder_op->get_post_layernorm(); + params.decoder.eps1 = decoder_op->get_eps_layernorm1(); + params.decoder.eps2 = decoder_op->get_eps_layernorm2(); + params.decoder.eps3 = decoder_op->get_eps_layernorm3(); + params.ffn_param.ffn_param.ffn_hidden_size = decoder_op->get_ffn_hidden_size(); + params.ffn_param.ffn_param.ffn_fp16 = is_ffn_fp16_; + params.ffn_param.ffn_param.act_type = (fastertransformer::ActType)(decoder_op->get_act_type()); + params.ffn_param.ffn_param.ffn_bias = !params.attn1.attn.position_bias; + params.attn1.attn.position_bias = decoder_op->get_position_bias1(); + params.attn1.attn.qkv_bias = !params.attn1.attn.position_bias; + params.attn1.attn.projection_bias = !params.attn1.attn.position_bias; + params.attn1.attn.is_cross = false; + params.attn1.attn.scale = decoder_op->get_scale1(); + params.attn1.attn.mask = true; + params.attn2.attn.position_bias = decoder_op->get_position_bias2(); + 
params.attn2.attn.qkv_bias = !params.attn2.attn.position_bias; + params.attn2.attn.projection_bias = !params.attn2.attn.position_bias; + params.attn2.attn.is_cross = true; + params.attn2.attn.scale = decoder_op->get_scale2(); + params.attn2.attn.mask = true; + params.decoder.has_beta = !params.attn1.attn.position_bias; auto compute_type = runtime_->GetRuntimePrecisionMode(); if (is_ffn_fp16_) { - size_t start_fp16 = (params.attn1->position_bias) ? C13NUM : C18NUM; - size_t end_fp16 = (params.attn1->position_bias) ? C16NUM : C22NUM; + size_t start_fp16 = (params.attn1.attn.position_bias) ? C13NUM : C18NUM; + size_t end_fp16 = (params.attn1.attn.position_bias) ? C16NUM : C22NUM; for (size_t i = 0; i < in_tensors_.size(); i++) { auto in_tensor = input(ctx, i); if (in_tensors_[i].IsConst() || in_tensor.trt_tensor_ == nullptr) { @@ -184,8 +188,8 @@ template int DecoderPlugin::RunCudaDecoder(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream, cublasGemmAlgo_t algoId) { - params_.common_param->algo = algoId; - params_.stream = &stream; + params_.common_param.algo = algoId; + params_.common_param.stream = stream; void *inputs_forward[num_of_inputs_]; for (int i = 0; i < num_of_inputs_; i++) { inputs_forward[i] = const_cast(inputs[i]); @@ -211,15 +215,22 @@ void DecoderPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, const int request_batch_size = static_cast(in[0].desc.dims.d[0]); const int request_src_seq_len = static_cast(in[0].desc.dims.d[1]); const int request_tgt_seq_len = request_src_seq_len; - params_.common_param->batch_size = request_batch_size; - params_.common_param->src_seq_len = request_src_seq_len; - params_.common_param->tgt_seq_len = request_tgt_seq_len; + params_.common_param.batch_size = request_batch_size; + params_.common_param.src_seq_len = request_src_seq_len; + params_.common_param.tgt_seq_len = request_tgt_seq_len; num_of_inputs_ = nbInputs; num_of_outputs_ = nbOutputs; + params_.attn1.common_param = ¶ms_.common_param; + params_.attn2.common_param = ¶ms_.common_param; + params_.ffn_param.common_param = ¶ms_.common_param; + } size_t DecoderPlugin::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const noexcept { if (compute_type_ == RuntimePrecisionMode_FP16) { + params_.attn1.common_param = ¶ms_.common_param; + params_.attn2.common_param = ¶ms_.common_param; + params_.ffn_param.common_param = ¶ms_.common_param; return fastertransformer::GetDecoderLayerWorkspaceSize(¶ms_); } else { return fastertransformer::GetDecoderLayerWorkspaceSize(¶ms_); diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h index 943a04aa475..bcaeafdff5d 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h @@ -56,7 +56,7 @@ class DecoderPlugin : public TensorRTPlugin { : TensorRTPlugin(std::string(name), std::string(DECODER_PLUGIN_NAME)) { const nvinfer1::PluginField *fields = fc->fields; compute_type_ = static_cast(fields[0].data)[0]; - params_ = static_cast(fields[1].data)[0]; + params_ = static_cast(fields[1].data)[0]; } DecoderPlugin(const char *name, const void *serialData, size_t serialLength) diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc 
b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc old mode 100644 new mode 100755 index 896f854e3da..4dedfdc8f26 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc @@ -39,7 +39,6 @@ constexpr std::size_t kTwo = 2; int EncoderTensorRT::IsSupport(const BaseOperatorPtr &base_operator, const std::vector &in_tensors, const std::vector &out_tensors) { - std::cout << "Unsupported input tensor size, size is " << in_tensors.size(); if (in_tensors.size() != C14NUM && in_tensors.size() != C9NUM && in_tensors.size() != C13NUM) { MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); return RET_ERROR; @@ -104,32 +103,37 @@ int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { MS_LOG(ERROR) << "op action convert failed"; return RET_ERROR; } - fastertransformer::encoderParamRun params; - memset_s(¶ms, sizeof(params), 0, sizeof(params)); cublasHandle_t cublas_handle = GetCublasHandle(); - params.cublas_handle =&(cublas_handle); - params.common_param->head_num = encoder_op->get_head_num(); - params.common_param->head_size = encoder_op->get_head_size(); - params.encoder->layernorm_post = encoder_op->get_post_layernorm(); - params.encoder->eps1 = encoder_op->get_eps_layernorm1(); - params.encoder->eps2 = encoder_op->get_eps_layernorm2(); - params.ffn_param->ffn_hidden_size = encoder_op->get_ffn_hidden_size(); - params.ffn_param->ffn_fp16 = is_ffn_fp16_; - params.common_param->hidden_size = params.common_param->head_num * params.common_param->head_size; - params.attn->is_cross = false; - params.attn->position_bias = encoder_op->get_position_bias(); - params.attn->projection_bias = !params.attn->position_bias; - params.attn->qkv_bias = !params.attn->position_bias; - params.encoder->has_beta = !params.attn->position_bias; - params.ffn_param->ffn_bias = !params.attn->position_bias; - params.attn->mask = true; - params.ffn_param->act_type = (fastertransformer::ActType)(encoder_op->get_act_type()); - params.attn->scale = encoder_op->get_scale(); + fastertransformer::encoderParamRun params; + //update commonparam + params.common_param.cublas_handle =cublas_handle; + params.common_param.head_num = encoder_op->get_head_num(); + params.common_param.head_size = encoder_op->get_head_size(); + params.common_param.hidden_size = params.common_param.head_num * params.common_param.head_size; + //connect commonparam to attention and ffn + params.attn.common_param = ¶ms.common_param; + params.ffn_param.common_param = ¶ms.common_param; + //update encoder_param_ + params.encoder.layernorm_post = encoder_op->get_post_layernorm(); + params.encoder.eps1 = encoder_op->get_eps_layernorm1(); + params.encoder.eps2 = encoder_op->get_eps_layernorm2(); + params.ffn_param.ffn_param.ffn_hidden_size = encoder_op->get_ffn_hidden_size(); + params.ffn_param.ffn_param.ffn_fp16 = is_ffn_fp16_; + params.attn.attn.is_cross = false; + params.attn.attn.position_bias = encoder_op->get_position_bias(); + std::cout << "params.attn.attn.position_bias" << params.attn.attn.position_bias << std::endl; + params.attn.attn.projection_bias = !params.attn.attn.position_bias; + params.attn.attn.qkv_bias = !params.attn.attn.position_bias; + params.encoder.has_beta = !params.attn.attn.position_bias; + params.ffn_param.ffn_param.ffn_bias = !params.attn.attn.position_bias; + params.attn.attn.mask = true; + params.ffn_param.ffn_param.act_type = (fastertransformer::ActType)(encoder_op->get_act_type()); + 
params.attn.attn.scale = encoder_op->get_scale(); auto compute_type = runtime_->GetRuntimePrecisionMode(); if (is_ffn_fp16_) { - size_t start_fp16 = (params.encoder->layernorm_post) ? C7NUM : C9NUM; - size_t end_fp16 = (params.encoder->layernorm_post) ? C11NUM : C13NUM; - if (params.attn->position_bias) { + size_t start_fp16 = (params.encoder.layernorm_post) ? C7NUM : C9NUM; + size_t end_fp16 = (params.encoder.layernorm_post) ? C11NUM : C13NUM; + if (params.attn.attn.position_bias) { start_fp16 = C6NUM; end_fp16 = C9NUM; } @@ -189,8 +193,8 @@ template int EncoderPlugin::RunCudaEncoder(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream, cublasGemmAlgo_t algoId) { - params_.common_param->algo = algoId; - params_.stream = &stream; + params_.common_param.algo = algoId; + params_.common_param.stream = stream; void *inputs_forward[num_of_inputs_]; for (int i = 0; i < num_of_inputs_; i++) { inputs_forward[i] = const_cast(inputs[i]); @@ -216,15 +220,20 @@ void EncoderPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, const int request_batch_size = static_cast(in[0].desc.dims.d[0]); const int request_src_seq_len = static_cast(in[0].desc.dims.d[1]); const int request_tgt_seq_len = request_src_seq_len; - params_.common_param->batch_size = request_batch_size; - params_.common_param->src_seq_len = request_src_seq_len; - params_.common_param->tgt_seq_len = request_tgt_seq_len; + params_.common_param.batch_size = request_batch_size; + params_.common_param.src_seq_len = request_src_seq_len; + params_.common_param.tgt_seq_len = request_tgt_seq_len; + params_.attn.common_param = ¶ms_.common_param; + params_.ffn_param.common_param = ¶ms_.common_param; + std::cout << "params.attn.attn.position_bias" << params_.attn.attn.position_bias << std::endl; num_of_inputs_ = nbInputs; num_of_outputs_ = nbOutputs; } size_t EncoderPlugin::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const noexcept { if (compute_type_ == RuntimePrecisionMode_FP16) { + params_.attn.common_param = ¶ms_.common_param; + params_.ffn_param.common_param = ¶ms_.common_param; return fastertransformer::GetEncoderLayerWorkspaceSize(¶ms_); } else { return fastertransformer::GetEncoderLayerWorkspaceSize(¶ms_); diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h index 10fa4ed2b0f..fbe2f6c674a 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h @@ -62,7 +62,7 @@ class EncoderPlugin : public TensorRTPlugin { EncoderPlugin(const char *name, const void *serialData, size_t serialLength) : TensorRTPlugin(std::string(name), std::string(ENCODER_PLUGIN_NAME)) { DeserializeValue(&serialData, &serialLength, &compute_type_, sizeof(int)); - DeserializeValue(&serialData, &serialLength, ¶ms_, sizeof(fastertransformer::encoderParamRun)); + DeserializeValue(&serialData, &serialLength, ¶ms_, sizeof(fastertransformer::encoderParamRun)); } EncoderPlugin() = delete; diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc index 9e444d821f8..5646ec5472d 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc +++ 
b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc @@ -72,20 +72,22 @@ int MhaTensorRT::AddInnerOp(TensorRTContext *ctx) { bool is_position_bias = mha_op->get_position_bias(); nvinfer1::ITensor *input_tensor = input(ctx, 0).trt_tensor_; fastertransformer::attentionParamRun params; + fastertransformer::CommonParam common_param; + memset_s(&common_param, sizeof(common_param), 0, sizeof(common_param)); memset_s(¶ms, sizeof(params), 0, sizeof(params)); cublasHandle_t cublas_handle = GetCublasHandle(); - params.cublas_handle =&(cublas_handle); - params.common_param->head_num = head_number; - params.common_param->head_size = head_size; - params.common_param->hidden_size = head_number * head_size; - params.attn->qkv_bias = !is_position_bias; - params.attn->projection_bias = !is_position_bias; - params.attn->is_cross = is_cross; - params.attn->position_bias = is_position_bias; - params.attn->scale = mha_op->get_scale(); - params.attn->mask = true; + common_param.cublas_handle =cublas_handle; + common_param.head_num = head_number; + common_param.head_size = head_size; + common_param.hidden_size = head_number * head_size; + params.attn.qkv_bias = !is_position_bias; + params.attn.projection_bias = !is_position_bias; + params.attn.is_cross = is_cross; + params.attn.position_bias = is_position_bias; + params.attn.scale = mha_op->get_scale(); + params.attn.mask = true; auto plugin = - std::make_shared(input_tensor->getName(), compute_type, params, device_id_); + std::make_shared(input_tensor->getName(), compute_type, params,common_param, device_id_); const int input_number = inputs().size(); nvinfer1::ITensor *inputTensors[input_number]; for (int i = 0; i < input_number; i++) { @@ -124,36 +126,37 @@ template int MhaPlugin::RunCudaMha(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream, cublasGemmAlgo_t algoId) { - int cross_tensor_offset = (params_.attn->is_cross) ? 1 : 0; + int cross_tensor_offset = (params_.attn.is_cross) ? 
1 : 0; const int weight_projection_tensor_idx = 4 + cross_tensor_offset; const int bias_projection_tensor_idx = 6 + cross_tensor_offset; const int attn_mask_tensor_idx = 7 + cross_tensor_offset; const int bias_qkv_tensor_idx = 5 + cross_tensor_offset; const int weight_qkv_tensor_idx = 3; const int position_bias_tensor_idx = 6 + cross_tensor_offset; - params_.common_param->algo = algoId; - params_.stream = &stream; + common_param_.algo = algoId; + common_param_.stream = stream; + params_.common_param = &common_param_; void *inputs_attn[num_of_inputs_]; int index = 0; inputs_attn[index++] = const_cast(inputs[0]); - if (params_.attn->is_cross) { + if (params_.attn.is_cross) { inputs_attn[index++] = const_cast(inputs[1]); inputs_attn[index++] = const_cast(inputs[weight_qkv_tensor_idx]); inputs_attn[index++] = const_cast(inputs[weight_qkv_tensor_idx + 1]); } else { inputs_attn[index++] = const_cast(inputs[weight_qkv_tensor_idx]); } - if (params_.attn->qkv_bias) { + if (params_.attn.qkv_bias) { inputs_attn[index++] = const_cast(inputs[bias_qkv_tensor_idx]); } - if (params_.attn->position_bias) { + if (params_.attn.position_bias) { inputs_attn[index++] = const_cast(inputs[position_bias_tensor_idx]); inputs_attn[index++] = const_cast(inputs[attn_mask_tensor_idx - C2NUM]); } else { inputs_attn[index++] = const_cast(inputs[attn_mask_tensor_idx]); } inputs_attn[index++] = const_cast(inputs[weight_projection_tensor_idx]); - if (params_.attn->projection_bias) { + if (params_.attn.projection_bias) { inputs_attn[index++] = const_cast(inputs[bias_projection_tensor_idx]); } void *outputs_attn[] = {outputs[0]}; @@ -176,17 +179,18 @@ void MhaPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int const nvinfer1::DynamicPluginTensorDesc *out, int nbOutputs) noexcept { int cross_tensor_offset = 0; int position_bias_tensor_offsets = 0; - if (params_.attn->is_cross) cross_tensor_offset = 1; - if (params_.attn->position_bias) position_bias_tensor_offsets = 1; + if (params_.attn.is_cross) cross_tensor_offset = 1; + if (params_.attn.position_bias) position_bias_tensor_offsets = 1; const int attn_mask_tensor_idx = 7 + cross_tensor_offset - position_bias_tensor_offsets; const int request_batch_size = static_cast(in[attn_mask_tensor_idx].desc.dims.d[0]); const int request_src_seq_len = static_cast(in[attn_mask_tensor_idx].desc.dims.d[1]); const int request_tgt_seq_len = static_cast(in[attn_mask_tensor_idx].desc.dims.d[2]); - params_.common_param->batch_size = request_batch_size; - params_.common_param->src_seq_len = request_src_seq_len; - params_.common_param->tgt_seq_len = request_tgt_seq_len; + common_param_.batch_size = request_batch_size; + common_param_.src_seq_len = request_src_seq_len; + common_param_.tgt_seq_len = request_tgt_seq_len; num_of_inputs_ = nbInputs; num_of_outputs_ = nbOutputs; + params_.common_param = &common_param_; } size_t MhaPlugin::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, @@ -207,20 +211,20 @@ nvinfer1::DimsExprs MhaPlugin::getOutputDimensions(int32_t index, const nvinfer1 if (num_dims == INPUT_SIZE2) { dims.d[0] = exprBuilder.constant(inputs[nbInputDims - 1].d[0]->getConstantValue() * inputs[nbInputDims - 1].d[1]->getConstantValue()); - auto hidden_size = exprBuilder.constant(params_.common_param->head_size * params_.common_param->head_num); + auto hidden_size = exprBuilder.constant(common_param_.head_size * common_param_.head_num); dims.d[1] = hidden_size; } else if (num_dims == INPUT_SIZE3) { dims.d[0] = inputs[nbInputDims - 1].d[0]; // 
batch dims.d[1] = inputs[nbInputDims - 1].d[(inputs[nbInputDims - 1].nbDims) - 1]; - auto hidden_size = exprBuilder.constant(params_.common_param->head_size * params_.common_param->head_num); + auto hidden_size = exprBuilder.constant(common_param_.head_size * common_param_.head_num); dims.d[kTwo] = hidden_size; } } else { dims.nbDims = INPUT_SIZE4; dims.d[0] = inputs[nbInputDims - 1].d[0]; // batch - dims.d[1] = exprBuilder.constant(params_.common_param->head_num); + dims.d[1] = exprBuilder.constant(common_param_.head_num); dims.d[kTwo] = inputs[nbInputDims - 1].d[(inputs[nbInputDims - 1].nbDims) - 1]; - dims.d[kThree] = exprBuilder.constant(params_.common_param->head_size); + dims.d[kThree] = exprBuilder.constant(common_param_.head_size); } return dims; } @@ -240,12 +244,13 @@ int MhaPlugin::initialize() noexcept { return 0; } void MhaPlugin::terminate() noexcept {} size_t MhaPlugin::getSerializationSize() const noexcept { - return sizeof(int) + sizeof(fastertransformer::attentionParamRun); + return sizeof(int) + sizeof(fastertransformer::attentionParamRun) + sizeof(fastertransformer::CommonParam); } void MhaPlugin::serialize(void *buffer) const noexcept { SerializeValue(&buffer, &compute_type_, sizeof(int)); SerializeValue(&buffer, ¶ms_, sizeof(fastertransformer::attentionParamRun)); + SerializeValue(&buffer, &common_param_, sizeof(fastertransformer::CommonParam)); } REGISTER_TENSORRT_CREATOR(ops::kNameAttention, MhaTensorRT) } // namespace mindspore::lite diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.h index 5acdbaf9901..065a44bbf88 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.h @@ -45,22 +45,28 @@ constexpr auto MHA_PLUGIN_NAME{"AttentionPlugin"}; class MhaPlugin : public TensorRTPlugin { public: MhaPlugin(const std::string name, int compute_type, fastertransformer::attentionParamRun params, + fastertransformer::CommonParam common_param, uint32_t device_id) : TensorRTPlugin(name, std::string(MHA_PLUGIN_NAME), device_id), compute_type_(compute_type), - params_(params) {} + params_(params), + common_param_(common_param){ + params_.common_param = &common_param_; + } MhaPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) : TensorRTPlugin(std::string(name), std::string(MHA_PLUGIN_NAME)) { const nvinfer1::PluginField *fields = fc->fields; compute_type_ = static_cast(fields[0].data)[0]; params_ = static_cast(fields[1].data)[0]; + common_param_ = static_cast(fields[2].data)[0]; } MhaPlugin(const char *name, const void *serialData, size_t serialLength) : TensorRTPlugin(std::string(name), std::string(MHA_PLUGIN_NAME)) { DeserializeValue(&serialData, &serialLength, &compute_type_, sizeof(int)); DeserializeValue(&serialData, &serialLength, ¶ms_, sizeof(fastertransformer::attentionParamRun)); + DeserializeValue(&serialData, &serialLength, &common_param_, sizeof(fastertransformer::CommonParam)); } MhaPlugin() = delete; @@ -92,7 +98,8 @@ class MhaPlugin : public TensorRTPlugin { std::string name_space_; int compute_type_; mutable fastertransformer::attentionParamRun params_; - cublasLtHandle_t cublaslt_handle_; + mutable fastertransformer::CommonParam common_param_; + cublasLtHandle_t* cublaslt_handle_; int num_of_inputs_; int num_of_outputs_; }; diff --git a/trc/transformer/deploy.sh b/trc/transformer/deploy.sh index 72519726fba..d7f14f7d8ac 100755 --- a/trc/transformer/deploy.sh +++ 
b/trc/transformer/deploy.sh @@ -58,11 +58,12 @@ echo ${server}:${PWD} echo ${input_files} #execute command="cd ${PWD} && " -command+="LD_LIBRARY_PATH=${system}/runtime/lib:${system}/tools/converter/lib CUDA_VISIBLE_DEVICES=${gpu_id} " +cuda=/usr/local/cuda-11.7/lib64 + +command+="LD_LIBRARY_PATH=${system}/runtime/lib:${system}/tools/converter/lib:${cuda} CUDA_VISIBLE_DEVICES=${gpu_id} " # command+=" NVIDIA_TF32_OVERRIDE=0 " -# command+="gdb --args ${benchmark} --modelFile=$1 --numThreads=1 --warmUpLoopCount=10 --loopCount=1000 --modelType=MindIR " command+="${benchmark} --modelFile=$1 --numThreads=1 --warmUpLoopCount=10 --loopCount=1000 --modelType=MindIR " - +#command+="gdb --args ${benchmark} --modelFile=$1 --numThreads=1 --warmUpLoopCount=10 --loopCount=1000 --modelType=MindIR " if [ "${time}" == "" ] then command+="--inDataFile=\"${input_files}\"" @@ -79,4 +80,4 @@ echo ${command} > execute.sh rsync -v execute.sh ${server}:${PWD} ssh ${server} ${command} - \ No newline at end of file + diff --git a/trc/transformer/ftBench.py b/trc/transformer/ftBench.py index 2024fb6a44b..40fe38d3e2b 100755 --- a/trc/transformer/ftBench.py +++ b/trc/transformer/ftBench.py @@ -5,6 +5,7 @@ import git import subprocess import sys from find_output_name import find_output_name +from get_output_bert import get_output_bert repo = git.Repo('.', search_parent_directories=True) base = repo.working_tree_dir f = open('../../version.txt', 'r') diff --git a/trc/transformer/models.txt b/trc/transformer/models.txt index 56f71f36c65..6930c46dbf5 100755 --- a/trc/transformer/models.txt +++ b/trc/transformer/models.txt @@ -1,16 +1,16 @@ --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer_t5 -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer_t5 +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_encoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer_t5 -# -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer_t5 + +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5 #run the following tests before push - +#-b 1 -l 66 -s 20 -t 20 -H 3 -S 15 -p 0 -m mha_cross #-b 1 -l 66 -s 128 -H 12 -S 768 -p 0 -m mha_x1 #-b 1 -l 66 -s 128 -t 256 -H 12 -S 768 -p 0 -m mha_cross #-b 1 -l 66 -s 20 -t 20 -H 3 -S 15 -p 0 -m mha_cross @@ -32,8 +32,8 @@ #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer_t5 -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer @@ -66,7 +66,7 @@ # #-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -x 0 -m transformer_decoder_layer #-b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -x 0 -m transformer_decoder_layer --b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -x 1 -m 
transformer_decoder_layer_t5 +#-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -x 1 -m transformer_decoder_layer_t5 #-b 4 -l 12 -H 12 -S 768 -s 128 -P 0 -x 0 -m transformer_decoder_layer_t5 #-b 8 -l 12 -H 12 -S 768 -s 128 -P 1 -x 1 -m transformer_encoder_layer_t5 #-b 1 -l 12 -H 4 -S 512 -s 128 -P 1 -m transformer_encoder_layer @@ -98,7 +98,7 @@ #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -m transformer_decoder_layer #-b 1 -l 2 -H 2 -S 8 -s 20 -f 1024 -P True -m bert #-b 1 -l 2 -H 2 -S 2 -s 128 -m T5 -#-b 1 -l 2 -H 2 -S 8 -s 20 -f 1024 -P True -m bert +#-b 1 -l 2 -H 2 -S 8 -s 20 -f 1024 -P 1 -m bert #-b 1 -l 12 -H 12 -S 768 -s 128 -m bert #-b 8 -l 12 -H 12 -S 768 -s 128 -m bert @@ -140,4 +140,4 @@ #-b 1 -l 6 -s 128 -t 128 -H 8 -S 512 -f 2048 -m transformer #-b 1 -l 6 -s 512 -t 512 -H 8 -S 512 -f 2048 -m transformer #-b 1 -l 6 -s 128 -t 128 -H 12 -S 768 -f 3072 -m transformer -#-b 1 -l 6 -s 512 -t 512 -H 12 -S 768 -f 3072 -m transformer \ No newline at end of file +#-b 1 -l 6 -s 512 -t 512 -H 12 -S 768 -f 3072 -m transformer diff --git a/trc/transformer/train_transformer_export.py b/trc/transformer/train_transformer_export.py index dcfc7de093a..275aeaf906b 100755 --- a/trc/transformer/train_transformer_export.py +++ b/trc/transformer/train_transformer_export.py @@ -350,6 +350,7 @@ def transformer_encoder_layer_create(): y = pruneTensor(y,seq_len,1) saveCalib(out_name, np.array(y), f_y) print("y.shape",np.array(y).shape) + f_y.close() # saveCalib('Default/Add-op267', np.array(y), f_y)#2 dims elif app=="trc": -- Gitee From 72fa3bb6711face2906993283b9b27259d55afd3 Mon Sep 17 00:00:00 2001 From: batya kroizer Date: Wed, 25 Jan 2023 12:11:35 +0200 Subject: [PATCH 37/39] merge --- trc/transformer/cfg_bert.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/trc/transformer/cfg_bert.config b/trc/transformer/cfg_bert.config index 154cb54e7d0..b496e5915bc 100755 --- a/trc/transformer/cfg_bert.config +++ b/trc/transformer/cfg_bert.config @@ -1,2 +1,2 @@ [gpu_context] -input_shape=input_ids:[1,128];token_type_ids:[1,128];input_mask:[1,128] +input_shape=input_ids:[mha_T5_cross,128];token_type_ids:[mha_T5_cross,128];input_mask:[mha_T5_cross,128] -- Gitee From 3616efd54359edb4448e1fc791c45bae098f472f Mon Sep 17 00:00:00 2001 From: shira zaloshinki Date: Wed, 25 Jan 2023 14:13:54 +0200 Subject: [PATCH 38/39] fix issue --- .../delegate/tensorrt/op/decoder_tensorrt.cc | 21 ++++++++----------- .../delegate/tensorrt/op/decoder_tensorrt.h | 2 +- .../delegate/tensorrt/op/encoder_tensorrt.cc | 4 ---- .../delegate/tensorrt/op/mha_tensorrt.cc | 7 +++---- trc/transformer/cfg_bert.config | 2 +- trc/transformer/models.txt | 15 +++++++------ 6 files changed, 21 insertions(+), 30 deletions(-) diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc index 344bc392abd..1facb4e5013 100755 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc @@ -88,7 +88,6 @@ nvinfer1::ITensor *DecoderTensorRT::castTensor(TensorRTContext *ctx, const Tenso return tensor_ptr; } int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { - std::cout << "AddInnerOp" << std::endl; if (ctx == nullptr || ctx->network() == nullptr) { MS_LOG(ERROR) << "context or network is invalid"; return RET_ERROR; @@ -107,7 +106,7 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.attn1.common_param = ¶ms.common_param; params.attn2.common_param = 
¶ms.common_param; params.ffn_param.common_param = ¶ms.common_param; - + params.decoder.layernorm_post = decoder_op->get_post_layernorm(); params.decoder.eps1 = decoder_op->get_eps_layernorm1(); params.decoder.eps2 = decoder_op->get_eps_layernorm2(); @@ -115,8 +114,8 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.ffn_param.ffn_param.ffn_hidden_size = decoder_op->get_ffn_hidden_size(); params.ffn_param.ffn_param.ffn_fp16 = is_ffn_fp16_; params.ffn_param.ffn_param.act_type = (fastertransformer::ActType)(decoder_op->get_act_type()); - params.ffn_param.ffn_param.ffn_bias = !params.attn1.attn.position_bias; params.attn1.attn.position_bias = decoder_op->get_position_bias1(); + params.ffn_param.ffn_param.ffn_bias = !params.attn1.attn.position_bias; params.attn1.attn.qkv_bias = !params.attn1.attn.position_bias; params.attn1.attn.projection_bias = !params.attn1.attn.position_bias; params.attn1.attn.is_cross = false; @@ -147,8 +146,7 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { } } nvinfer1::ITensor *input_tensor = input(ctx, 0).trt_tensor_; - auto plugin = - std::make_shared(input_tensor->getName(), compute_type, params, device_id_); + auto plugin = std::make_shared(input_tensor->getName(), compute_type, params, device_id_); const int input_number = inputs().size(); nvinfer1::ITensor *inputTensors[input_number]; for (int i = 0; i < input_number; i++) { @@ -182,7 +180,7 @@ int DecoderPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nv } else { return RunCudaDecoder(inputDesc, outputDesc, inputs, outputs, workspace, stream, CUBLAS_GEMM_DEFAULT_TENSOR_OP); -} + } } template int DecoderPlugin::RunCudaDecoder(const nvinfer1::PluginTensorDesc *inputDesc, @@ -223,14 +221,13 @@ void DecoderPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, params_.attn1.common_param = ¶ms_.common_param; params_.attn2.common_param = ¶ms_.common_param; params_.ffn_param.common_param = ¶ms_.common_param; - } size_t DecoderPlugin::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const noexcept { if (compute_type_ == RuntimePrecisionMode_FP16) { - params_.attn1.common_param = ¶ms_.common_param; - params_.attn2.common_param = ¶ms_.common_param; - params_.ffn_param.common_param = ¶ms_.common_param; + params_.attn1.common_param = ¶ms_.common_param; + params_.attn2.common_param = ¶ms_.common_param; + params_.ffn_param.common_param = ¶ms_.common_param; return fastertransformer::GetDecoderLayerWorkspaceSize(¶ms_); } else { return fastertransformer::GetDecoderLayerWorkspaceSize(¶ms_); @@ -243,7 +240,7 @@ nvinfer1::DimsExprs DecoderPlugin::getOutputDimensions(int32_t index, const nvin if (index == 0) { int num_dims = inputs[0].nbDims; dims.nbDims = num_dims; - for(int i = 0; i < num_dims; i++ ) { + for (int i = 0; i < num_dims; i++) { dims.d[i] = exprBuilder.constant(inputs[index].d[i]->getConstantValue()); } } @@ -255,7 +252,7 @@ nvinfer1::IPluginV2DynamicExt *DecoderPlugin::clone() const noexcept { if (plugin == nullptr) { MS_LOG(ERROR) << "plugin is null"; return nullptr; -} + } plugin->setPluginNamespace(name_space_.c_str()); return plugin; } diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h index bcaeafdff5d..051adfa442a 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h @@ -40,7 
+40,7 @@ class DecoderTensorRT : public TensorRTOp { private: nvinfer1::ITensor *castTensor(TensorRTContext *ctx, const TensorInfo &ms_tensor, const std::string &op_name); - bool is_ffn_fp16_ = true; + bool is_ffn_fp16_ = false; }; constexpr auto DECODER_PLUGIN_NAME{"DecoderPlugin"}; diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc index 4dedfdc8f26..bc04e254ba7 100755 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc @@ -92,8 +92,6 @@ nvinfer1::ITensor *EncoderTensorRT::castTensor(TensorRTContext *ctx, const Tenso } int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { - std::cout << "AddInnerOp\n"; - if (ctx == nullptr || ctx->network() == nullptr) { MS_LOG(ERROR) << "context or network is invalid"; return RET_ERROR; @@ -121,7 +119,6 @@ int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.ffn_param.ffn_param.ffn_fp16 = is_ffn_fp16_; params.attn.attn.is_cross = false; params.attn.attn.position_bias = encoder_op->get_position_bias(); - std::cout << "params.attn.attn.position_bias" << params.attn.attn.position_bias << std::endl; params.attn.attn.projection_bias = !params.attn.attn.position_bias; params.attn.attn.qkv_bias = !params.attn.attn.position_bias; params.encoder.has_beta = !params.attn.attn.position_bias; @@ -225,7 +222,6 @@ void EncoderPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, params_.common_param.tgt_seq_len = request_tgt_seq_len; params_.attn.common_param = ¶ms_.common_param; params_.ffn_param.common_param = ¶ms_.common_param; - std::cout << "params.attn.attn.position_bias" << params_.attn.attn.position_bias << std::endl; num_of_inputs_ = nbInputs; num_of_outputs_ = nbOutputs; } diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc index 5646ec5472d..9d70028ad62 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc @@ -76,7 +76,7 @@ int MhaTensorRT::AddInnerOp(TensorRTContext *ctx) { memset_s(&common_param, sizeof(common_param), 0, sizeof(common_param)); memset_s(¶ms, sizeof(params), 0, sizeof(params)); cublasHandle_t cublas_handle = GetCublasHandle(); - common_param.cublas_handle =cublas_handle; + common_param.cublas_handle = cublas_handle; common_param.head_num = head_number; common_param.head_size = head_size; common_param.hidden_size = head_number * head_size; @@ -86,8 +86,7 @@ int MhaTensorRT::AddInnerOp(TensorRTContext *ctx) { params.attn.position_bias = is_position_bias; params.attn.scale = mha_op->get_scale(); params.attn.mask = true; - auto plugin = - std::make_shared(input_tensor->getName(), compute_type, params,common_param, device_id_); + auto plugin = std::make_shared(input_tensor->getName(), compute_type, params, common_param, device_id_); const int input_number = inputs().size(); nvinfer1::ITensor *inputTensors[input_number]; for (int i = 0; i < input_number; i++) { @@ -190,7 +189,7 @@ void MhaPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int common_param_.tgt_seq_len = request_tgt_seq_len; num_of_inputs_ = nbInputs; num_of_outputs_ = nbOutputs; - params_.common_param = &common_param_; + params_.common_param = &common_param_; } size_t MhaPlugin::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, diff 
--git a/trc/transformer/cfg_bert.config b/trc/transformer/cfg_bert.config index 383ce7641ff..099dd20effc 100755 --- a/trc/transformer/cfg_bert.config +++ b/trc/transformer/cfg_bert.config @@ -1,2 +1,2 @@ [gpu_context] -input_shape=input_ids:[transformer_decoder_layer_t5,128];token_type_ids:[transformer_decoder_layer_t5,128];input_mask:[transformer_decoder_layer_t5,128] +input_shape=input_ids:[T5,128];token_type_ids:[T5,128];input_mask:[T5,128] diff --git a/trc/transformer/models.txt b/trc/transformer/models.txt index 6930c46dbf5..661f2c2d2e8 100755 --- a/trc/transformer/models.txt +++ b/trc/transformer/models.txt @@ -1,12 +1,11 @@ --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5 -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5 -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer_t5 - +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_encoder_layer +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_encoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer_t5 --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5 #run the following tests before push @@ -32,8 +31,8 @@ #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer_t5 --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer -- Gitee From 231ba0b96ad31fb665a712e625b85ca203fbef98 Mon Sep 17 00:00:00 2001 From: batya kroizer Date: Wed, 25 Jan 2023 14:16:08 +0200 Subject: [PATCH 39/39] merg --- trc/transformer/cfg_bert.config | 2 +- trc/transformer/models.txt | 26 ++++++++++++++------------ 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/trc/transformer/cfg_bert.config b/trc/transformer/cfg_bert.config index b496e5915bc..cc543ad3d77 100755 --- a/trc/transformer/cfg_bert.config +++ b/trc/transformer/cfg_bert.config @@ -1,2 +1,2 @@ [gpu_context] -input_shape=input_ids:[mha_T5_cross,128];token_type_ids:[mha_T5_cross,128];input_mask:[mha_T5_cross,128] +input_shape=input_ids:[transformer_decoder_layer,128];token_type_ids:[transformer_decoder_layer,128];input_mask:[transformer_decoder_layer,128] diff --git a/trc/transformer/models.txt b/trc/transformer/models.txt index 1a5ddbf9251..91b0d008fc6 100755 --- a/trc/transformer/models.txt +++ b/trc/transformer/models.txt @@ -1,26 +1,28 @@ --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer_t5 --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m 
transformer_encoder_layer_t5 # --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5 --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_encoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer_t5 --b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer_t5 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5 + +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer #run the following tests before push --b 1 -l 66 -s 128 -H 12 -S 768 -p 0 -m mha_x1 --b 1 -l 66 -s 128 -t 256 -H 12 -S 768 -p 0 -m mha_cross --b 1 -l 66 -s 20 -t 20 -H 3 -S 15 -p 0 -m mha_cross --b 1 -l 66 -s 20 -H 4 -S 768 -p 0 -m mha_T5 --b 1 -l 66 -s 20 -t 40 -H 4 -S 768 -p 0 -m mha_T5_cross +#-b 1 -l 66 -s 128 -H 12 -S 768 -p 0 -m mha_x1 +#-b 1 -l 66 -s 128 -t 256 -H 12 -S 768 -p 0 -m mha_cross +#-b 1 -l 66 -s 20 -t 20 -H 3 -S 15 -p 0 -m mha_cross +#-b 1 -l 66 -s 20 -H 4 -S 768 -p 0 -m mha_T5 +#-b 1 -l 66 -s 20 -t 40 -H 4 -S 768 -p 0 -m mha_T5_cross #-b 1 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer --b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer --b 8 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer +#-b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer +#-b 8 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer #-b 32 -l 12 -H 12 -S 768 -s 128 -P 0 -f 3072 -m bert #-b 1 -l 3 -H 12 -S 768 -s 128 -m T5 -- Gitee
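Editor's sketch of the param re-wiring these patches converge on. Patches 36-38 move encoderParamRun/decoderParamRun/attentionParamRun from pointer members (params.common_param->head_num) to nested value structs that share one CommonParam, and then repeatedly re-assign params_.attn.common_param = &params_.common_param inside configurePlugin() and getWorkspaceSize(). The snippet below is a minimal, self-contained illustration of why that re-assignment is needed when the whole struct is copied as raw bytes (as the plugin clone/serialize/DeserializeValue path does). The struct layouts here are simplified stand-ins, not the real FasterTransformer definitions; only the names encoderParamRun, attentionParamRun, CommonParam, configurePlugin and getWorkspaceSize are taken from the patches above.

// Simplified stand-ins for CommonParam / attentionParamRun / encoderParamRun.
#include <cstring>
#include <cassert>

struct CommonParam {
  int head_num;
  int head_size;
  int hidden_size;
};

struct AttnParamRun {
  CommonParam *common_param;  // expected to point at the owning struct's CommonParam
  bool qkv_bias;
};

struct EncoderParamRun {
  CommonParam common_param;
  AttnParamRun attn;
};

int main() {
  EncoderParamRun original{};
  original.common_param.head_num = 12;
  original.attn.common_param = &original.common_param;  // wired when the op is built

  // Byte-wise copy, as a raw-memory serialize/deserialize of the params struct does:
  // the nested pointer still refers to the source object, not to the new copy.
  // After a real (de)serialization round trip this would be a stale address.
  EncoderParamRun copy{};
  std::memcpy(&copy, &original, sizeof(copy));
  assert(copy.attn.common_param == &original.common_param);

  // The fix applied in configurePlugin()/getWorkspaceSize(): re-point the nested
  // pointer at the copy's own CommonParam before using it.
  copy.attn.common_param = &copy.common_param;
  assert(copy.attn.common_param->head_num == 12);
  return 0;
}

For comparison, the MhaPlugin in patch 36 takes the other route: it serializes CommonParam as a separate field and re-links params_.common_param = &common_param_ in its constructors. Either way the invariant is the same: after any copy of the param struct, the nested common_param pointer must target that instance's own CommonParam.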