From a8a815a5c49ff343fccfd8e51f8f67a0bf5c7d7b Mon Sep 17 00:00:00 2001 From: yoni Date: Wed, 11 Jan 2023 11:51:37 +0200 Subject: [PATCH 1/8] decode input_ids to sequence length --- .../kernels/bert_preprocess_kernels.cu | 33 +++++++++++++++---- .../kernels/bert_preprocess_kernels.h | 5 ++- .../layers/encoder_layers/encoder.cc | 11 +++---- 3 files changed, 36 insertions(+), 13 deletions(-) diff --git a/src/fastertransformer/kernels/bert_preprocess_kernels.cu b/src/fastertransformer/kernels/bert_preprocess_kernels.cu index 9976d50..3fef3b2 100644 --- a/src/fastertransformer/kernels/bert_preprocess_kernels.cu +++ b/src/fastertransformer/kernels/bert_preprocess_kernels.cu @@ -58,7 +58,7 @@ void invokeGetPaddingOffset(size_t* h_token_num, } template -__global__ void buildSequnceLength(const T * input, int *sequnce_length, const int max_seq_length, const int hidden_size) { +__global__ void buildSequnceLength(const T * input, int *sequence_length, const int max_seq_length, const int hidden_size) { __shared__ int s_max_val; int bid = blockIdx.x; const T * seq_base = input + bid* max_seq_length * hidden_size; @@ -75,9 +75,27 @@ __global__ void buildSequnceLength(const T * input, int *sequnce_length, const i s_max_val = max_val; } __syncthreads(); - sequnce_length[bid] = -s_max_val; + sequence_length[bid] = -s_max_val; } +__global__ void buildSequnceLength(const int *input, int *sequence_length, const int max_seq_length) { + __shared__ int s_max_val; + int bid = blockIdx.x; + int last = 0; + const int *base = input + bid * max_seq_length; + for (int i=threadIdx.x ; i < max_seq_length; i += blockDim.x) { + const int *ptr = base + i; + if (*ptr != 0){ + last = i; + } + } + int max_val = blockReduceMax(last); + if (threadIdx.x == 0) { + s_max_val = max_val + 1; + } + __syncthreads(); + sequence_length[bid] = s_max_val; +} @@ -143,10 +161,14 @@ __global__ void getTrtPaddingOffsetKernel(int* trt_mha_padding_offset, const int template -void invokeBuildSequnceLength(const T * input, int batch_size, int *sequnce_length, int max_seq_length, int hidden_size,cudaStream_t stream) { +void invokeBuildSequenceLength(const T * input, int batch_size, int *sequnce_length, int max_seq_length, int hidden_size,cudaStream_t stream) { buildSequnceLength<<>>(input,sequnce_length, max_seq_length,hidden_size); } +void invokeBuildSequenceLength(const int * input, int batch_size, int *sequnce_length, int max_seq_length,cudaStream_t stream) { + buildSequnceLength<<>>(input,sequnce_length, max_seq_length); +} + @@ -339,9 +361,8 @@ void invokeBuildRelativeAttentionBias(T* relative_attention_bias, is_bidirectional, max_distance); } -template void invokeBuildSequnceLength(const float * input, int batch_size, int *sequnce_length, int max_seq_length, int hidden_size,cudaStream_t stream); -template void invokeBuildSequnceLength(const half * input, int batch_size, int *sequnce_length, int max_seq_length, int hidden_size,cudaStream_t stream); - +template void invokeBuildSequenceLength(const float * input, int batch_size, int *sequnce_length, int max_seq_length, int hidden_size,cudaStream_t stream); +template void invokeBuildSequenceLength(const half * input, int batch_size, int *sequnce_length, int max_seq_length, int hidden_size,cudaStream_t stream); template void invokeBuildRelativeAttentionBias(float* relative_attention_bias, const float* relative_attention_bias_table, diff --git a/src/fastertransformer/kernels/bert_preprocess_kernels.h b/src/fastertransformer/kernels/bert_preprocess_kernels.h index dca4ef5..8d6b772 100644 --- 
a/src/fastertransformer/kernels/bert_preprocess_kernels.h +++ b/src/fastertransformer/kernels/bert_preprocess_kernels.h @@ -35,7 +35,10 @@ void invokeBuildEncoderAttentionMask( T* attention_mask, const int* sequence_lengths, const int batch_size, const int max_seq_len, cudaStream_t stream); template -void invokeBuildSequnceLength(const T * input, int batch_size, int *sequnce_length, int max_seq_length, int hidden_size,cudaStream_t stream); +void invokeBuildSequenceLength(const T * input, int batch_size, int *sequnce_length, int max_seq_length, int hidden_size,cudaStream_t stream); + +void invokeBuildSequenceLength(const int* input, int batch_size, int *sequnce_length, int max_seq_length,cudaStream_t stream); + void invokeGetTrtPaddingOffset(int* trt_mha_padding_offset, const int* sequence_length, diff --git a/src/fastertransformer/layers/encoder_layers/encoder.cc b/src/fastertransformer/layers/encoder_layers/encoder.cc index 004718e..e9b3f59 100644 --- a/src/fastertransformer/layers/encoder_layers/encoder.cc +++ b/src/fastertransformer/layers/encoder_layers/encoder.cc @@ -14,12 +14,12 @@ namespace fastertransformer { #define ALIGN_SIZE 16 template -void printTensor(const std::string& str, T* input, int size) +void printTensor(const std::string& str, T* input, const int size) { std::cout << str; T* input_device = input; auto input_host = std::make_unique(size); - cudaD2Hcpy(input_host.get(), input_device, size); + cudaD2Hcpy(input_host.get(), input_device,size); for (int k = 0, index = 0; k < size; k++) { if (index != 0) std::cout << ','; @@ -345,6 +345,7 @@ void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, enc T* input_tensor = reinterpret_cast(inputs[param->in_idx++]); T* from_tensor = input_tensor; T* compress_buffer; + int *input_ids = reinterpret_cast(inputs[in_len-1]); compress_buffer = reinterpret_cast(ws); ws = reinterpret_cast(reinterpret_cast(ws) + ALIGN(h_token_num * param->hidden_size,ALIGN_SIZE)); int* padding_offset = reinterpret_cast(ws); @@ -354,9 +355,8 @@ void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, enc ws = reinterpret_cast(reinterpret_cast(ws) + ALIGN(param->batch_size,ALIGN_SIZE)); size_t* d_token_num = reinterpret_cast(ws); ws = reinterpret_cast(reinterpret_cast(ws) + ALIGN(1,ALIGN_SIZE)); - invokeBuildSequnceLength( - from_tensor, param->batch_size, d_sequence_lengths, param->src_seq_len, param->hidden_size, param->stream); - // printTensor("seq_len=",d_sequence_lengths,param->batch_size); + invokeBuildSequenceLength( + input_ids, param->batch_size, d_sequence_lengths, param->src_seq_len, param->stream); invokeGetPaddingOffset(&h_token_num, d_token_num, padding_offset, @@ -364,7 +364,6 @@ void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, enc param->batch_size, param->src_seq_len, param->stream); - // std::cout << "token=" << h_token_num << "m=" << param->batch_size * param->src_seq_len << std::endl; if (h_token_num * 2 <= param->batch_size * param->src_seq_len) { param->eft = true; invokeRemovePadding(compress_buffer, -- Gitee From 2b3622c50bc1ae9a12c21162c712e038ff475bf8 Mon Sep 17 00:00:00 2001 From: batya kroizer Date: Sun, 5 Feb 2023 11:47:40 +0200 Subject: [PATCH 2/8] T5 work with vsl --- examples/cpp/ms/initialize.h | 10 +++ .../kernels/bert_preprocess_kernels.cu | 45 ++++++---- .../kernels/bert_preprocess_kernels.h | 4 +- .../kernels/unfused_attention_kernels.cu | 12 +-- .../kernels/unfused_attention_kernels.h | 1 + .../layers/ms_layers/MSEncoderLayer.cc | 10 ++- 
.../layers/ms_layers/attention.cc | 59 ++++++++---- .../layers/ms_layers/decoder.cc | 90 +++++++++++-------- .../layers/ms_layers/decoder.h | 2 +- .../layers/ms_layers/encoder.cc | 42 ++++----- .../layers/ms_layers/encoder.h | 2 +- .../layers/ms_layers/param.h | 1 + src/fastertransformer/models/bert/Bert.cc | 4 +- .../models/bert_int8/BertINT8.cc | 4 +- src/fastertransformer/models/vit/ViT.cc | 2 +- .../models/vit_int8/ViTINT8.cc | 2 +- 16 files changed, 173 insertions(+), 117 deletions(-) diff --git a/examples/cpp/ms/initialize.h b/examples/cpp/ms/initialize.h index 899607a..20b6069 100644 --- a/examples/cpp/ms/initialize.h +++ b/examples/cpp/ms/initialize.h @@ -481,6 +481,11 @@ void InitializeEncoderT5(opt_arg* opt_a, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, 0}); + desc.input_tensors.push_back( + Tensor{MEMORY_GPU, + getTensorType(), + std::vector{opt_a->batch_size, opt_a->seq_len}, + 0}); desc.input_python_tensors.push_back(Tensor{ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); desc.input_python_tensors.push_back(Tensor{ @@ -490,6 +495,11 @@ void InitializeEncoderT5(opt_arg* opt_a, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, 0}); + desc.input_python_tensors.push_back( + Tensor{MEMORY_CPU, + getTensorType(), + std::vector{opt_a->batch_size, opt_a->seq_len}, + 0}); desc.output_tensors.push_back(Tensor{ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); diff --git a/src/fastertransformer/kernels/bert_preprocess_kernels.cu b/src/fastertransformer/kernels/bert_preprocess_kernels.cu index 715fb78..61e4fc1 100644 --- a/src/fastertransformer/kernels/bert_preprocess_kernels.cu +++ b/src/fastertransformer/kernels/bert_preprocess_kernels.cu @@ -101,15 +101,15 @@ __global__ void buildSequnceLength(const int *input, int *sequence_length, const template -__global__ void buildEncoderAttentionMaskKernel(T* attention_mask, const int* sequence_lengths, const int max_seq_len) +__global__ void buildEncoderAttentionMaskKernel(T* attention_mask, const int* sequence_lengths, const int src_seq_len, const int tgt_seq_len) { // sequence_lengths: [batch_size] // attention_mask: [batch_size, 1, max_seq_len, max_seq_len] - attention_mask += blockIdx.x * max_seq_len * max_seq_len; + attention_mask += blockIdx.x * src_seq_len * tgt_seq_len; const int length = sequence_lengths[blockIdx.x]; - for (int i = threadIdx.x; i < max_seq_len * max_seq_len; i += blockDim.x) { + for (int i = threadIdx.x; i < src_seq_len * tgt_seq_len; i += blockDim.x) { // int row_id = i / max_seq_len; - int col_id = i % max_seq_len; + int col_id = i % tgt_seq_len; // if (row_id < length && col_id < length) { // TODO (bhsueh) check this modification is ok or not on other rmodel if (col_id >= length) { @@ -119,56 +119,63 @@ __global__ void buildEncoderAttentionMaskKernel(T* attention_mask, const int* se } template -__global__ void buildEncoderPositionBiasKernel(T* position_bias, const int* sequence_lengths, const int max_seq_len, const int head_num) +__global__ void buildEncoderPositionBiasKernel(T* position_bias_src, T* position_bias_dst, const int* sequence_lengths, const int src_seq_len, const int tgt_seq_len, const int head_num) { // sequence_lengths: [batch_size] // position_bias: [batch_size, head_num, max_seq_len, max_seq_len] - position_bias += blockIdx.x * head_num * max_seq_len * max_seq_len; + position_bias_src += 
blockIdx.x * head_num * src_seq_len * tgt_seq_len; const int length = sequence_lengths[blockIdx.x]; - for (int i = threadIdx.x; i < head_num * max_seq_len * max_seq_len; i += blockDim.x) { + for (int i = threadIdx.x; i < head_num * src_seq_len * tgt_seq_len; i += blockDim.x) { // int row_id = i / max_seq_len; - int col_id = i % max_seq_len; + int col_id = i % tgt_seq_len; // if (row_id < length && col_id < length) { // TODO (bhsueh) check this modification is ok or not on other rmodel - if (col_id >= length) { - position_bias[i] = (T)(0.0f); + if (col_id < length) { + position_bias_dst[i] = position_bias_src[i]; } } } template void invokeBuildEncoderAttentionMask( - T* attention_mask, const int* sequence_lengths, const int batch_size, const int max_seq_len, cudaStream_t stream) + T* attention_mask, const int* sequence_lengths, const int batch_size, const int src_seq_len, const int tgt_seq_len, cudaStream_t stream) { - buildEncoderAttentionMaskKernel<<>>(attention_mask, sequence_lengths, max_seq_len); + buildEncoderAttentionMaskKernel<<>>(attention_mask, sequence_lengths, src_seq_len, tgt_seq_len); } template void invokeBuildEncoderPositionBias( - T* position_bias, const int* sequence_lengths, const int batch_size, const int max_seq_len, const int head_num, cudaStream_t stream) + T* position_bias_src, T* position_bias_dst, const int* sequence_lengths, const int batch_size, const int src_seq_len, const int tgt_seq_len, const int head_num, cudaStream_t stream) { - buildEncoderPositionBiasKernel<<>>(position_bias, sequence_lengths, max_seq_len, head_num); + cudaMemsetAsync(position_bias_dst, 0, batch_size * src_seq_len * tgt_seq_len * head_num * sizeof(T), stream); + buildEncoderPositionBiasKernel<<>>(position_bias_src, position_bias_dst, sequence_lengths, src_seq_len, tgt_seq_len, head_num); } template void invokeBuildEncoderAttentionMask(float* attention_mask, const int* sequence_lengths, const int batch_size, - const int max_seq_len, + const int src_seq_len, + const int tgt_seq_len, cudaStream_t stream); template void invokeBuildEncoderAttentionMask(half* attention_mask, const int* sequence_lengths, const int batch_size, - const int max_seq_len, + const int src_seq_len, + const int tgt_seq_len, cudaStream_t stream); -template void invokeBuildEncoderPositionBias(float* attention_mask, +template void invokeBuildEncoderPositionBias(float* position_bias_src, + float* position_bias_dst, const int* sequence_lengths, const int batch_size, - const int max_seq_len, + const int src_seq_len, + const int tgt_seq_len, const int head_num, cudaStream_t stream); template void invokeBuildEncoderPositionBias(half* attention_mask, + half* position_bias_dst, const int* sequence_lengths, const int batch_size, - const int max_seq_len, + const int src_seq_len, + const int tgt_seq_len, const int head_num, cudaStream_t stream); __global__ void getTrtPaddingOffsetKernel(int* trt_mha_padding_offset, const int* sequence_length, const int batch_size) diff --git a/src/fastertransformer/kernels/bert_preprocess_kernels.h b/src/fastertransformer/kernels/bert_preprocess_kernels.h index 947aac8..52d7ef6 100644 --- a/src/fastertransformer/kernels/bert_preprocess_kernels.h +++ b/src/fastertransformer/kernels/bert_preprocess_kernels.h @@ -32,10 +32,10 @@ void invokeGetPaddingOffset(size_t* h_token_num, template void invokeBuildEncoderAttentionMask( - T* attention_mask, const int* sequence_lengths, const int batch_size, const int max_seq_len, cudaStream_t stream); + T* attention_mask, const int* sequence_lengths, const int 
batch_size, const int src_seq_len, const int tgt_seq_len, cudaStream_t stream); template void invokeBuildEncoderPositionBias( - T* attention_mask, const int* sequence_lengths, const int batch_size, const int max_seq_len, const int head_num, cudaStream_t stream); + T* position_bias_src, T* position_bias_dst, const int* sequence_lengths, const int batch_size, const int src_seq_len, const int tgt_seq_len, const int head_num, cudaStream_t stream); template void invokeBuildSequenceLength(const T * input, int batch_size, int *sequnce_length, int max_seq_length, int hidden_size,cudaStream_t stream); diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index f11af31..dd1062c 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -1736,6 +1736,7 @@ template void invokeCrossAddFusedZP_QKVBiasTranspose(float* q_buf, const int h_token, const int h_token2, int *padding_mask, + int *padding_mask2, cudaStream_t stream); template void invokeCrossAddFusedZP_QKVBiasTranspose(half* q_buf, @@ -1751,6 +1752,7 @@ template void invokeCrossAddFusedZP_QKVBiasTranspose(half* q_buf, const int h_token, const int h_token2, int *padding_mask, + int *padding_mask2, cudaStream_t stream); @@ -2028,14 +2030,12 @@ void invokeCrossAddFusedZP_QKVBiasTranspose(T* q_buf, const int h_token, const int h_token2, int *padding_mask, + int *padding_mask2, cudaStream_t stream) { - const int size_q = batch_size * seq_len; const int m = h_token; const int n = head_num * size_per_head; cudaMemsetAsync(q_buf, 0, batch_size * seq_len * n * sizeof(T), stream); - cudaMemsetAsync(k_buf, 0, batch_size * tgt_seq_len * n * sizeof(T), stream); - cudaMemsetAsync(v_buf, 0, batch_size * tgt_seq_len * n * sizeof(T), stream); dim3 block(384); dim3 grid((int)(ceil(1.0 * m * n / 384))); add_fusedQKV_ZP_bias_transpose_kernel_q<<>>( @@ -2043,11 +2043,13 @@ void invokeCrossAddFusedZP_QKVBiasTranspose(T* q_buf, const int m2 = h_token2; const int n2 = head_num * size_per_head; + cudaMemsetAsync(k_buf, 0, batch_size * tgt_seq_len * n2 * sizeof(T), stream); + cudaMemsetAsync(v_buf, 0, batch_size * tgt_seq_len * n2 * sizeof(T), stream); dim3 block2(384); dim3 grid2((int)(ceil(1.0 * m2 * n2 / 384))); - qkv_bias = qkv_bias == nullptr ? qkv_bias : qkv_bias + n2; + qkv_bias = (qkv_bias == nullptr) ? 
nullptr : qkv_bias + n2; add_fusedQKV_ZP_bias_transpose_kernel_kv<<>>( - k_buf, v_buf, QKV + size_q * n, qkv_bias, batch_size, tgt_seq_len, head_num, size_per_head, h_token2, padding_mask); + k_buf, v_buf, QKV + m * n, qkv_bias, batch_size, tgt_seq_len, head_num, size_per_head, h_token2, padding_mask2); } template void invokeCrossAddFusedQKVBiasTranspose(float* q_buf, diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.h b/src/fastertransformer/kernels/unfused_attention_kernels.h index 4dd1a45..92aa995 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.h +++ b/src/fastertransformer/kernels/unfused_attention_kernels.h @@ -136,6 +136,7 @@ void invokeCrossAddFusedZP_QKVBiasTranspose(T* q_buf, const int h_token, const int h_token2, int *padding_mask, + int *padding_mask2, cudaStream_t stream); template diff --git a/src/fastertransformer/layers/ms_layers/MSEncoderLayer.cc b/src/fastertransformer/layers/ms_layers/MSEncoderLayer.cc index 8d56343..fc5aa29 100644 --- a/src/fastertransformer/layers/ms_layers/MSEncoderLayer.cc +++ b/src/fastertransformer/layers/ms_layers/MSEncoderLayer.cc @@ -122,10 +122,11 @@ int MSELayer::forward(std::vector* output_tensors, (void*)encoder_weights->attention.attention_output_weight.kernel, (void*)encoder_weights->layernorm2.gamma, (void*)encoder_weights->encoder_output_mapping.kernel, - (void*)encoder_weights->encoder_output_projection.kernel + (void*)encoder_weights->encoder_output_projection.kernel, + (void*)input_tensors->at(3).data }; - forwardEncoder(inputs, 9, outputs, 1, &encoder_param_, buf_); + forwardEncoder(inputs, 10, outputs, 1, &encoder_param_, buf_); } else { void* inputs[] = {(void*)input_tensors->at(0).data, @@ -156,10 +157,11 @@ int MSELayer::forward(std::vector* output_tensors, (void*)encoder_weights->attention.attention_output_weight.kernel, (void*)encoder_weights->layernorm2.gamma, (void*)encoder_weights->encoder_output_mapping.kernel, - (void*)encoder_weights->encoder_output_projection.kernel + (void*)encoder_weights->encoder_output_projection.kernel, + (void*)input_tensors->at(3).data }; - forwardEncoder(inputs, 9, outputs, 1, &encoder_param_, buf_); + forwardEncoder(inputs, 10, outputs, 1, &encoder_param_, buf_); } else { void* inputs[] = {(void*)input_tensors->at(0).data, diff --git a/src/fastertransformer/layers/ms_layers/attention.cc b/src/fastertransformer/layers/ms_layers/attention.cc index 093c8db..6f29f24 100644 --- a/src/fastertransformer/layers/ms_layers/attention.cc +++ b/src/fastertransformer/layers/ms_layers/attention.cc @@ -22,6 +22,7 @@ size_t GetAttnWorkspaceSize(attentionParamRun* param) size_t qk_buf_len = param->common_param->batch_size * param->common_param->head_num * param->common_param->src_seq_len * param->common_param->tgt_seq_len; size_t qkv_buf_2_len = param->common_param->batch_size * param->common_param->src_seq_len * param->common_param->hidden_size; size_t qkv_buf_3_len = qkv_buf_2_len; + size_t position_bias_buf_len = (param->attn.is_cross) ? 
param->common_param->batch_size * param->common_param->src_seq_len * param->common_param->tgt_seq_len : qk_buf_len; OptAllocator allocator(ALIGN_SIZE); param->attn.qkv_buf = allocator.Malloc(qkv_len * sizeof(T)); param->attn.q_buf_2 = allocator.Malloc(q_buf_2_len * sizeof(T)); @@ -31,6 +32,8 @@ size_t GetAttnWorkspaceSize(attentionParamRun* param) param->attn.qk_buf = allocator.Malloc(qk_buf_len * sizeof(T)); allocator.Free(param->attn.q_buf_2); allocator.Free(param->attn.output1); + if(param->attn.position_bias) param->attn.position_bias_buf = allocator.Malloc(position_bias_buf_len * sizeof(T)); + allocator.Free(param->attn.position_bias_buf); param->attn.qkv_buf_2 = allocator.Malloc(qkv_buf_2_len * sizeof(T)); allocator.Free(param->attn.output2); allocator.Free(param->attn.qk_buf); @@ -45,7 +48,8 @@ template size_t GetAttnWorkspaceSize(attentionParamRun* param); template void forward_attn(T* inputs[], int in_len, T* output[], int out_len, attentionParamRun* param, void* ws) -{ param->common_param->in_idx = 0; +{ + param->common_param->in_idx = 0; T* qkv_buf = reinterpret_cast(static_cast(ws) + param->attn.qkv_buf); T* q_buf_2 = reinterpret_cast(static_cast(ws) + param->attn.q_buf_2); T* qk_buf = reinterpret_cast(static_cast(ws) + param->attn.qk_buf); @@ -53,6 +57,7 @@ void forward_attn(T* inputs[], int in_len, T* output[], int out_len, attentionPa T* qkv_buf_3 = reinterpret_cast(static_cast(ws) + param->attn.qkv_buf_3); T* output1 = reinterpret_cast(static_cast(ws) + param->attn.output1); T* output2 = reinterpret_cast(static_cast(ws) + param->attn.output2); + T* position_bias_compress = (param->attn.position_bias) ? reinterpret_cast(static_cast(ws) + param->attn.position_bias_buf) : nullptr; int gemm_dims[] = { 3 * (int)param->common_param->hidden_size, (int)param->common_param->h_token_num, (int)param->common_param->hidden_size}; int gemm_lds[] = {3 * (int)param->common_param->hidden_size, (int)param->common_param->hidden_size, 3 * (int)param->common_param->hidden_size}; @@ -72,7 +77,7 @@ void forward_attn(T* inputs[], int in_len, T* output[], int out_len, attentionPa std::cout<<"param->attn.padding_offset == nullptr\n"; if (param->attn.is_cross) { gemm_dims[0] = param->common_param->hidden_size; - gemm_dims[1] = param->common_param->batch_size * param->common_param->src_seq_len; + gemm_dims[1] = param->common_param->h_token_num; gemm_dims[2] = param->common_param->hidden_size; gemm_lds[0] = param->common_param->hidden_size; gemm_lds[1] = param->common_param->hidden_size; @@ -90,15 +95,17 @@ void forward_attn(T* inputs[], int in_len, T* output[], int out_len, attentionPa &beta, param->common_param->cublas_handle, param->common_param->algo); - + printTensor("qkv_buf",qkv_buf,10); + printTensor("encoder_output",encoder_output,10); + gemm_dims[0] = 2 * param->common_param->hidden_size; - gemm_dims[1] = param->common_param->batch_size * param->common_param->tgt_seq_len; + gemm_dims[1] = param->common_param->h_token_num2; gemm_lds[0] = 2 * param->common_param->hidden_size; gemm_lds[2] = 2 * param->common_param->hidden_size; T* weight_kv = reinterpret_cast(inputs[param->common_param->in_idx++]); CublasGemmWrapper(weight_kv, encoder_output, - qkv_buf + (param->common_param->batch_size * param->common_param->src_seq_len) * param->common_param->hidden_size, + qkv_buf + param->common_param->h_token_num * param->common_param->hidden_size, gemm_dims, gemm_lds, gemm_ops, @@ -107,6 +114,8 @@ void forward_attn(T* inputs[], int in_len, T* output[], int out_len, attentionPa &beta, 
param->common_param->cublas_handle, param->common_param->algo); + printTensor("qkv_buf + (param->common_param->h_token_num) * param->common_param->hidden_size",qkv_buf + (param->common_param->h_token_num) * param->common_param->hidden_size,10); + T* bias_qkv = (param->attn.qkv_bias) ? reinterpret_cast(inputs[param->common_param->in_idx++]) : nullptr; if (param->attn.padding_offset == nullptr) { invokeCrossAddFusedQKVBiasTranspose(q_buf_2, @@ -122,8 +131,8 @@ void forward_attn(T* inputs[], int in_len, T* output[], int out_len, attentionPa param->common_param->stream); } else{ - // std::cout<<"param->common_param->h_token_num"<common_param->h_token_num<common_param->tgt_seq_len"<common_param->tgt_seq_len<common_param->h_token_num"<common_param->h_token_num<common_param->tgt_seq_len"<common_param->tgt_seq_len<common_param->h_token_num, param->common_param->h_token_num2, param->attn.padding_offset, + param->attn.padding_offset2, param->common_param->stream); - param->common_param->h_token_num = param->common_param->h_token_num2; } + std::cout<<"param->common_param->h_token_num2: "<common_param->h_token_num2<(inputs[param->common_param->in_idx++]); @@ -184,6 +197,7 @@ void forward_attn(T* inputs[], int in_len, T* output[], int out_len, attentionPa param->common_param->stream); } } + gemm_ops[0] = CUBLAS_OP_T; gemm_ops[1] = CUBLAS_OP_N; gemm_lds[0] = param->common_param->head_size; @@ -209,33 +223,39 @@ void forward_attn(T* inputs[], int in_len, T* output[], int out_len, attentionPa param->common_param->batch_size * param->common_param->head_num, param->common_param->cublas_handle, param->common_param->algo); + printTensor("qk_buf",qk_buf,10); + std::cout<<"param->common_param->tgt_seq_len"<common_param->tgt_seq_len<attn.mask) ? reinterpret_cast(inputs[param->common_param->in_idx++]) : nullptr; // printTensor("attention_mask",attention_mask,1*128*128); T* position_bias = (param->attn.position_bias) ? reinterpret_cast(inputs[param->common_param->in_idx++]) : nullptr; if (param->attn.padding_offset != nullptr){ invokeBuildEncoderAttentionMask( - attention_mask, param->attn.d_sequence_length, param->common_param->batch_size, param->common_param->src_seq_len, param->common_param->stream); + attention_mask, (param->attn.is_cross) ? param->attn.d_sequence_length2 : param->attn.d_sequence_length, param->common_param->batch_size, param->common_param->src_seq_len, (param->attn.is_cross) ? param->common_param->tgt_seq_len : param->common_param->src_seq_len, param->common_param->stream); if (position_bias!= nullptr) { - if (param->attn.is_cross){ - invokeBuildEncoderAttentionMask( - position_bias, param->attn.d_sequence_length, param->common_param->batch_size, param->common_param->src_seq_len, param->common_param->stream); - } else { - invokeBuildEncoderPositionBias( - position_bias, param->attn.d_sequence_length, param->common_param->batch_size, param->common_param->src_seq_len, param->common_param->head_num, param->common_param->stream); - } + invokeBuildEncoderPositionBias(position_bias, + position_bias_compress, + (param->attn.is_cross) ? param->attn.d_sequence_length2 : param->attn.d_sequence_length, + param->common_param->batch_size, + param->common_param->src_seq_len, + (param->attn.is_cross) ? param->common_param->tgt_seq_len : param->common_param->src_seq_len, + (param->attn.is_cross) ? 
1 : param->common_param->head_num, + param->common_param->stream); + } } - // printTensor("position_bias_invokeBuild",position_bias,8*128*128); + if(param->attn.is_cross) printTensor("position_bias_invokeBuild",position_bias,256*128); + if(param->attn.is_cross) printTensor("mask_invokeBuild",attention_mask,256*128); invokeMixMaskedSoftMax(static_cast(qk_buf), attention_mask, - position_bias, + position_bias_compress, param->common_param->batch_size, param->common_param->src_seq_len, param->common_param->tgt_seq_len, param->common_param->head_num, - (param->attn.is_cross && param->attn.position_bias) ? int(1) : int(param->common_param->head_num), + (param->attn.is_cross && param->attn.position_bias) ? 1 : int(param->common_param->head_num), (T)(param->attn.scale), param->common_param->stream); + printTensor("qk_bufMixMasked",qk_buf,10); // std::cout<<"param->attn.scale: "<attn.scale<(output[0]), (const T*)(inputs[param->common_param->in_idx++]), len, param->common_param->hidden_size, param->common_param->stream); } + return; } diff --git a/src/fastertransformer/layers/ms_layers/decoder.cc b/src/fastertransformer/layers/ms_layers/decoder.cc index 68f38a1..03d849f 100644 --- a/src/fastertransformer/layers/ms_layers/decoder.cc +++ b/src/fastertransformer/layers/ms_layers/decoder.cc @@ -39,7 +39,10 @@ size_t GetDecoderLayerWorkspaceSize(decoderParamRun* param) // param->attn1.attn.padding_offset=nullptr; // param->attn1.attn.d_sequence_length2=nullptr; // param->attn1.attn.padding_offset2=nullptr; + int tmp =param->common_param.tgt_seq_len; + param->common_param.tgt_seq_len = param->common_param.src_seq_len; param->decoder.attn_ws_buf = allocator.Malloc(GetAttnWorkspaceSize(&(param->attn1))); + param->common_param.tgt_seq_len = tmp; param->decoder.attn_out_buf = allocator.Malloc(attn_out_size * sizeof(T)); allocator.Free(param->decoder.attn_ws_buf); if (!param->decoder.layernorm_post) @@ -71,18 +74,18 @@ size_t GetDecoderLayerWorkspaceSize(decoderParamRun* param) template size_t GetDecoderLayerWorkspaceSize(decoderParamRun* param); template size_t GetDecoderLayerWorkspaceSize(decoderParamRun* param); template -void GetCompressBuffer(T* compress_buffer, T* from_tensor, int *input_ids, int* padding_offset, int* d_sequence_lengths, size_t &h_token_num, size_t* d_token_num, decoderParamRun* param) +void GetCompressBuffer(T* compress_buffer, T* from_tensor, int *input_ids, int* padding_offset, int* d_sequence_lengths, size_t &h_token_num, size_t* d_token_num, size_t seq_len, decoderParamRun* param) { invokeBuildSequenceLength( - input_ids, param->common_param.batch_size, d_sequence_lengths, param->common_param.src_seq_len, param->common_param.stream); + input_ids, param->common_param.batch_size, d_sequence_lengths, seq_len, param->common_param.stream); invokeGetPaddingOffset(&h_token_num, d_token_num, padding_offset, d_sequence_lengths, param->common_param.batch_size, - param->common_param.src_seq_len, + seq_len, param->common_param.stream); - if (h_token_num * 2 <= param->common_param.batch_size * param->common_param.src_seq_len) { + if (h_token_num * 2 <= param->common_param.batch_size * seq_len) { param->common_param.eft = true; invokeRemovePadding(compress_buffer, (const T*)from_tensor, @@ -97,11 +100,13 @@ template void forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, decoderParamRun* param, void* ws) { param->common_param.in_idx = 0; - size_t h_token_num = param->common_param.batch_size * param->common_param.src_seq_len; - size_t h_token_num2 = 
param->common_param.h_token_num = param->common_param.h_token_num2 = h_token_num; + size_t h_token_num = param->common_param.h_token_num = param->common_param.batch_size * param->common_param.src_seq_len; + size_t h_token_num2 = param->common_param.h_token_num2 = param->common_param.batch_size * param->common_param.tgt_seq_len; param->decoder.padding_offset = nullptr; int* d_sequence_lengths = nullptr; int* d_sequence_lengths2 = nullptr; + int* padding_offset = nullptr; + int* padding_offset2= nullptr; T* input_tensor = reinterpret_cast(inputs[param->common_param.in_idx++]); T* from_tensor = input_tensor; int idx_encoder_out = param->attn1.attn.position_bias ? 7 : 10; @@ -110,37 +115,41 @@ void forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, dec T* compress_buffer2; int *input_ids = reinterpret_cast(inputs[in_len-1]); int *input_ids2 = reinterpret_cast(inputs[in_len-2]); + printTensor("input_ids2",(int*)(input_ids2),1*128); + compress_buffer = reinterpret_cast(static_cast(ws) + param->decoder.compress_buf); compress_buffer2 = reinterpret_cast(static_cast(ws) + param->decoder.compress_buf2); - int* padding_offset = reinterpret_cast(static_cast(ws) + param->decoder.padding_offset_buf); - int* padding_offset2 = reinterpret_cast(static_cast(ws) + param->decoder.padding_offset_buf2); + padding_offset = reinterpret_cast(static_cast(ws) + param->decoder.padding_offset_buf); + padding_offset2 = reinterpret_cast(static_cast(ws) + param->decoder.padding_offset_buf2); d_sequence_lengths = reinterpret_cast(static_cast(ws) + param->decoder.d_sequence_lengths_offset_buf); d_sequence_lengths2 = reinterpret_cast(static_cast(ws) + param->decoder.d_sequence_lengths_offset_buf2); - param->decoder.d_sequence_length = d_sequence_lengths; - param->decoder.d_sequence_length2 = d_sequence_lengths2; + // param->decoder.d_sequence_length = d_sequence_lengths; + // param->decoder.d_sequence_length2 = d_sequence_lengths2; size_t* d_token_num = reinterpret_cast(static_cast(ws) + param->decoder.d_token_num_buf); size_t* d_token_num2 = reinterpret_cast(static_cast(ws) + param->decoder.d_token_num_buf2); - // param->decoder.eft = false; - // printTensor("input_ids",(int*)(input_ids),param->common_param.src_seq_len); std::cout<<"param->common_param.src_seq_len:"<common_param.src_seq_len<common_param.head_num * param->common_param.head_size"<common_param.head_num * param->common_param.head_size<common_param.src_seq_len, param); if (h_token_num * 2 <= param->common_param.batch_size * param->common_param.src_seq_len) { param->common_param.h_token_num = h_token_num; - param->decoder.padding_offset = padding_offset; + // param->decoder.padding_offset = padding_offset; from_tensor = compress_buffer; } - GetCompressBuffer(compress_buffer2, encoder_output, input_ids2, padding_offset2, d_sequence_lengths2, h_token_num2, d_token_num2, param); - if (h_token_num2 * 2 <= param->common_param.batch_size * param->common_param.src_seq_len) { + GetCompressBuffer(compress_buffer2, encoder_output, input_ids2, padding_offset2, d_sequence_lengths2, h_token_num2, d_token_num2, param->common_param.tgt_seq_len, param); + if (h_token_num2 * 2 <= param->common_param.batch_size * param->common_param.tgt_seq_len) { param->common_param.h_token_num2 = h_token_num2; - param->decoder.padding_offset2 = padding_offset2; + // ppadding_offset2 = padding_offset2; inputs[idx_encoder_out] = compress_buffer2; } + else{ + padding_offset =nullptr; +padding_offset2=nullptr; + } - if(param->decoder.padding_offset != nullptr) - 
std::cout<<"param->decoder.padding_offset != nullptr\n"; - if(param->decoder.padding_offset == nullptr) - std::cout<<"param->decoder.padding_offset == nullptr\n"; + if(padding_offset2 != nullptr) + std::cout<<"param->decoder.padding_offset2 != nullptr\n"; + if(padding_offset2 == nullptr) + std::cout<<"param->decoder.padding_offset2 == nullptr\n"; h_token_num = param->common_param.h_token_num; h_token_num2 = param->common_param.h_token_num2; T* attn_out = reinterpret_cast(static_cast(ws) + param->decoder.attn_out_buf); @@ -152,17 +161,18 @@ std::cout<<"param->common_param.head_num * param->common_param.head_size"<(static_cast(ws) + param->decoder.normed_attn2_out_buf); T* ffn_ws = reinterpret_cast(static_cast(ws) + param->decoder.ffn_ws_buf); T* tmp_out = reinterpret_cast(output[0]); - if ((param->decoder.padding_offset != nullptr || std::is_same::value && param->ffn_param.ffn_param.ffn_fp16 == true)) { + if ((padding_offset != nullptr || std::is_same::value && param->ffn_param.ffn_param.ffn_fp16 == true)) { tmp_out = reinterpret_cast(static_cast(ws) + param->decoder.tmp_out_buf); } T* tmp_out1 = reinterpret_cast(output[0]); T* out_buf = tmp_out; - if (param->decoder.padding_offset != nullptr) { + if (padding_offset != nullptr) { tmp_out1 = compress_buffer2; } T* gamma1 = reinterpret_cast(inputs[param->common_param.in_idx++]); T* beta1 = (param->decoder.has_beta) ? reinterpret_cast(inputs[param->common_param.in_idx++]) : nullptr; std::cout<<"h_token_num decoder"<(from_tensor), gamma1, @@ -171,7 +181,7 @@ std::cout<<"param->common_param.head_num * param->common_param.head_size"<common_param.hidden_size, param->common_param.stream, param->decoder.eps1); -printTensor("from_tensoe",from_tensor,10); +printTensor("encoder_output",encoder_output,10); printTensor("normed_from_tensor",normed_from_tensor,10); inputs[--param->common_param.in_idx] = normed_from_tensor; @@ -179,12 +189,13 @@ printTensor("from_tensoe",from_tensor,10); // if attention is embedded inside an decoder - fuse the bias to next layer normalization bool is_projection_bias = param->attn1.attn.projection_bias; param->attn1.attn.projection_bias = false; - param->attn1.attn.d_sequence_length = param->decoder.d_sequence_length; - param->attn1.attn.padding_offset = param->decoder.padding_offset; - + param->attn1.attn.d_sequence_length = d_sequence_lengths; + param->attn1.attn.padding_offset = padding_offset; + int tmp = param->common_param.tgt_seq_len; + param->common_param.tgt_seq_len = param->common_param.src_seq_len; forward_attn( reinterpret_cast(&inputs[param->common_param.in_idx]), in_len, &attn_out, 1, &(param->attn1), attn_ws); - + param->common_param.tgt_seq_len = tmp; param->attn1.attn.projection_bias = is_projection_bias; param->common_param.in_idx = param->common_param.in_idx + in_idx; T* projection_bias = @@ -208,15 +219,16 @@ printTensor("normed_attn_out",normed_attn_out,10); in_idx = param->common_param.in_idx; is_projection_bias = param->attn2.attn.projection_bias; param->attn2.attn.projection_bias = false; - param->attn2.attn.d_sequence_length = param->decoder.d_sequence_length; - param->attn2.attn.padding_offset = param->decoder.padding_offset; - param->attn1.attn.d_sequence_length2 = param->decoder.d_sequence_length2; - param->attn1.attn.padding_offset2 = param->decoder.padding_offset2; + param->attn2.attn.d_sequence_length = d_sequence_lengths; + param->attn2.attn.padding_offset = padding_offset; + param->attn2.attn.d_sequence_length2 = d_sequence_lengths2; + param->attn2.attn.padding_offset2 = padding_offset2; // 
printTensor("inputs[param->common_param.in_idx]",(T*)(inputs[param->common_param.in_idx+1]),param->common_param.src_seq_len*param->common_param.head_num * param->common_param.head_size); forward_attn( reinterpret_cast(&inputs[param->common_param.in_idx]), in_len, &attn2_out, 1, &(param->attn2), attn2_ws); - - h_token_num = h_token_num2; + // param->decoder.d_sequence_length = param->attn2.attn.d_sequence_length; + // param->decoder.padding_offset = param->attn2.attn.padding_offset; + // h_token_num = h_token_num2; param->attn2.attn.projection_bias = is_projection_bias; param->common_param.in_idx = param->common_param.in_idx + in_idx; T* projection_bias2 = @@ -250,7 +262,7 @@ printTensor("normed_attn_out",normed_attn_out,10); param->decoder.eps3); } inputs[--param->common_param.in_idx] = normed_attn2_out; - printTensor("normed_attn2_out",normed_attn2_out,10); + printTensor("attn2_out",attn2_out,10); if (param->ffn_param.ffn_param.ffn_fp16 == false) { forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, &(param->ffn_param), ffn_ws); } @@ -292,13 +304,13 @@ printTensor("normed_attn_out",normed_attn_out,10); } printTensor("out_buf",out_buf,10); - if (param->decoder.padding_offset2 != nullptr) { + if (padding_offset != nullptr) { cudaMemsetAsync(output[0], 0, param->common_param.batch_size * param->common_param.src_seq_len * param->common_param.head_size * param->common_param.head_num * sizeof(T), param->common_param.stream); invokeRebuildPadding( - (T*)output[0], out_buf, param->decoder.padding_offset2, h_token_num, param->common_param.hidden_size, param->common_param.stream); + (T*)output[0], out_buf, padding_offset, h_token_num, param->common_param.hidden_size, param->common_param.stream); } // printTensor("output[0]",(T*)output[0],param->common_param.src_seq_len*param->common_param.head_num * param->common_param.head_size); return; @@ -308,6 +320,6 @@ template void forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, decoderParamRun* param, void* ws); template void forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, decoderParamRun* param, void* ws); -template void GetCompressBuffer(float* compress_buffer, float* from_tensor, int *input_ids, int* padding_offset, int* d_sequence_lengths, size_t &h_token_num, size_t* d_token_num, decoderParamRun* param); -template void GetCompressBuffer(half* compress_buffer, half* from_tensor, int *input_ids, int* padding_offset, int* d_sequence_lengths, size_t &h_token_num, size_t* d_token_num, decoderParamRun* param); +template void GetCompressBuffer(float* compress_buffer, float* from_tensor, int *input_ids, int* padding_offset, int* d_sequence_lengths, size_t &h_token_num, size_t* d_token_num, size_t seq_len, decoderParamRun* param); +template void GetCompressBuffer(half* compress_buffer, half* from_tensor, int *input_ids, int* padding_offset, int* d_sequence_lengths, size_t &h_token_num, size_t* d_token_num, size_t seq_len, decoderParamRun* param); } // namespace fastertransformer diff --git a/src/fastertransformer/layers/ms_layers/decoder.h b/src/fastertransformer/layers/ms_layers/decoder.h index 889f7d9..61af13a 100644 --- a/src/fastertransformer/layers/ms_layers/decoder.h +++ b/src/fastertransformer/layers/ms_layers/decoder.h @@ -13,5 +13,5 @@ size_t GetDecoderLayerWorkspaceSize(decoderParamRun* param); template void forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, decoderParamRun* param, void* ws); template -void GetCompressBuffer(T* compress_buffer2, T* from_tensor, int *input_ids, 
int* padding_offset, int* d_sequence_lengths, size_t &h_token_num, size_t* d_token_num, decoderParamRun* param); +void GetCompressBuffer(T* compress_buffer2, T* from_tensor, int *input_ids, int* padding_offset, int* d_sequence_lengths, size_t &h_token_num, size_t* d_token_num, size_t seq_len, decoderParamRun* param); } // namespace fastertransformer diff --git a/src/fastertransformer/layers/ms_layers/encoder.cc b/src/fastertransformer/layers/ms_layers/encoder.cc index 52d563b..6a6fb9a 100644 --- a/src/fastertransformer/layers/ms_layers/encoder.cc +++ b/src/fastertransformer/layers/ms_layers/encoder.cc @@ -60,18 +60,18 @@ size_t GetEncoderLayerWorkspaceSize(encoderParamRun* param) // return 0; } template -void GetCompressBuffer(T* compress_buffer, T* from_tensor, int *input_ids, int* padding_offset, int* d_sequence_lengths, size_t &h_token_num, size_t* d_token_num, encoderParamRun* param) +void GetCompressBuffer(T* compress_buffer, T* from_tensor, int *input_ids, int* padding_offset, int* d_sequence_lengths, size_t &h_token_num, size_t* d_token_num, size_t seq_len, encoderParamRun* param) { invokeBuildSequenceLength( - input_ids, param->common_param.batch_size, d_sequence_lengths, param->common_param.src_seq_len, param->common_param.stream); + input_ids, param->common_param.batch_size, d_sequence_lengths, seq_len, param->common_param.stream); invokeGetPaddingOffset(&h_token_num, d_token_num, padding_offset, d_sequence_lengths, param->common_param.batch_size, - param->common_param.src_seq_len, + seq_len, param->common_param.stream); - if (h_token_num * 2 <= param->common_param.batch_size * param->common_param.src_seq_len) { + if (h_token_num * 2 <= param->common_param.batch_size * seq_len) { param->common_param.eft = true; invokeRemovePadding(compress_buffer, (const T*)from_tensor, @@ -90,15 +90,16 @@ void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, enc size_t h_token_num = param->common_param.batch_size * param->common_param.src_seq_len; param->common_param.h_token_num = h_token_num; param->encoder.padding_offset = nullptr; + int* padding_offset = nullptr; int* d_sequence_lengths = nullptr; T* input_tensor = reinterpret_cast(inputs[param->common_param.in_idx++]); T* from_tensor = input_tensor; T* compress_buffer; int *input_ids = reinterpret_cast(inputs[in_len-1]); compress_buffer = reinterpret_cast(static_cast(ws) + param->encoder.compress_buf); - int* padding_offset = reinterpret_cast(static_cast(ws) + param->encoder.padding_offset_buf); + padding_offset = reinterpret_cast(static_cast(ws) + param->encoder.padding_offset_buf); d_sequence_lengths = reinterpret_cast(static_cast(ws) + param->encoder.d_sequence_lengths_offset_buf); - param->encoder.d_sequence_length = d_sequence_lengths; + // param->encoder.d_sequence_length = d_sequence_lengths; size_t* d_token_num = reinterpret_cast(static_cast(ws) + param->encoder.d_token_num_buf); param->common_param.eft = false; // printTensor("from_tensor",(T*)(from_tensor),10); @@ -124,18 +125,18 @@ void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, enc // param->encoder.padding_offset = padding_offset; // from_tensor = compress_buffer; // } - GetCompressBuffer(compress_buffer, from_tensor, input_ids, padding_offset, d_sequence_lengths, h_token_num, d_token_num, param); + GetCompressBuffer(compress_buffer, from_tensor, input_ids, padding_offset, d_sequence_lengths, h_token_num, d_token_num, param->common_param.src_seq_len, param); if (h_token_num * 2 <= param->common_param.batch_size * 
param->common_param.src_seq_len) { param->common_param.h_token_num = h_token_num; - param->encoder.padding_offset = padding_offset; + // param->encoder.padding_offset = padding_offset; from_tensor = compress_buffer; } // std::cout<common_param.src_seq_len*512); - if(param->encoder.padding_offset != nullptr) + if(padding_offset != nullptr) std::cout<<"param->encoder.padding_offset != nullptr\n"; - if(param->encoder.padding_offset == nullptr) + if(padding_offset == nullptr) std::cout<<"param->encoder.padding_offset == nullptr\n"; h_token_num = param->common_param.h_token_num; std::cout<<"h_token_num: "<(static_cast(ws) + param->encoder.normed_attn_out_buf); T* ffn_ws = reinterpret_cast(static_cast(ws) + param->encoder.ffn_ws_buf); T* tmp_out = reinterpret_cast(output[0]); - if (param->encoder.padding_offset != nullptr || (std::is_same::value && param->ffn_param.ffn_param.ffn_fp16 == true)) { + if (padding_offset != nullptr || (std::is_same::value && param->ffn_param.ffn_param.ffn_fp16 == true)) { tmp_out = reinterpret_cast(static_cast(ws) + param->encoder.tmp_out_buf); } T* tmp_out1 = reinterpret_cast(output[0]); T* out_buf = tmp_out; - if (param->encoder.padding_offset != nullptr) { + if (padding_offset != nullptr) { tmp_out1 = compress_buffer; } if (param->encoder.layernorm_post == false || param->attn.attn.position_bias) { @@ -176,8 +177,8 @@ void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, enc bool is_projection_bias = param->attn.attn.projection_bias; param->attn.attn.projection_bias = false; int in_idx = param->common_param.in_idx; - param->attn.attn.d_sequence_length = param->encoder.d_sequence_length; - param->attn.attn.padding_offset = param->encoder.padding_offset; + param->attn.attn.d_sequence_length = d_sequence_lengths; + param->attn.attn.padding_offset = padding_offset; forward_attn( reinterpret_cast(&inputs[param->common_param.in_idx]), in_len, &attn_out, 1, &(param->attn), attn_ws); // printTensor("out\n",(T*)(attn_out),10); @@ -317,19 +318,18 @@ void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, enc } } - if (param->encoder.padding_offset != nullptr) { + if (padding_offset != nullptr) { cudaMemsetAsync(output[0], 0, param->common_param.batch_size * param->common_param.src_seq_len * param->common_param.head_size * param->common_param.head_num * sizeof(T), param->common_param.stream); invokeRebuildPadding( - (T*)output[0], out_buf, param->encoder.padding_offset, h_token_num, param->common_param.hidden_size, param->common_param.stream); + (T*)output[0], out_buf, padding_offset, h_token_num, param->common_param.hidden_size, param->common_param.stream); } // std::cout<encoder.padding_offset != nullptr) + if(padding_offset != nullptr) std::cout<<"param->encoder.padding_offset != nullptr\n"; - if(param->encoder.padding_offset == nullptr) + if(padding_offset == nullptr) std::cout<<"param->encoder.padding_offset == nullptr\n"; return; } @@ -338,7 +338,7 @@ template void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, encoderParamRun* param, void* ws); template void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, encoderParamRun* param, void* ws); -template void GetCompressBuffer(float* compress_buffer, float* from_tensor, int *input_ids, int* padding_offset, int* d_sequence_lengths, size_t &h_token_num, size_t* d_token_num, encoderParamRun* param); -template void GetCompressBuffer(half* compress_buffer, half* from_tensor, int *input_ids, int* padding_offset, int* d_sequence_lengths, 
size_t &h_token_num, size_t* d_token_num, encoderParamRun* param); +template void GetCompressBuffer(float* compress_buffer, float* from_tensor, int *input_ids, int* padding_offset, int* d_sequence_lengths, size_t &h_token_num, size_t* d_token_num, size_t seq_len, encoderParamRun* param); +template void GetCompressBuffer(half* compress_buffer, half* from_tensor, int *input_ids, int* padding_offset, int* d_sequence_lengths, size_t &h_token_num, size_t* d_token_num, size_t seq_len, encoderParamRun* param); } // namespace fastertransformer diff --git a/src/fastertransformer/layers/ms_layers/encoder.h b/src/fastertransformer/layers/ms_layers/encoder.h index cb9e08a..d4f4b03 100644 --- a/src/fastertransformer/layers/ms_layers/encoder.h +++ b/src/fastertransformer/layers/ms_layers/encoder.h @@ -13,6 +13,6 @@ size_t GetEncoderLayerWorkspaceSize(encoderParamRun* param); template void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, encoderParamRun* param, void* ws); template -void GetCompressBuffer(T* compress_buffer2, T* from_tensor, int *input_ids, int* padding_offset, int* d_sequence_lengths, size_t &h_token_num, size_t* d_token_num, encoderParamRun* param); +void GetCompressBuffer(T* compress_buffer2, T* from_tensor, int *input_ids, int* padding_offset, int* d_sequence_lengths, size_t &h_token_num, size_t* d_token_num, size_t seq_len, encoderParamRun* param); } // namespace fastertransformer diff --git a/src/fastertransformer/layers/ms_layers/param.h b/src/fastertransformer/layers/ms_layers/param.h index 1d0a645..a103db3 100644 --- a/src/fastertransformer/layers/ms_layers/param.h +++ b/src/fastertransformer/layers/ms_layers/param.h @@ -60,6 +60,7 @@ typedef struct { size_t qk_buf; size_t qkv_buf_2; size_t qkv_buf_3; + size_t position_bias_buf; bool mask; int* padding_offset; int* d_sequence_length; diff --git a/src/fastertransformer/models/bert/Bert.cc b/src/fastertransformer/models/bert/Bert.cc index ac727df..e0c056e 100644 --- a/src/fastertransformer/models/bert/Bert.cc +++ b/src/fastertransformer/models/bert/Bert.cc @@ -255,7 +255,7 @@ void Bert::forward(std::vector* output_tensors, switch (attention_type_) { case AttentionType::UNFUSED_MHA: { invokeBuildEncoderAttentionMask( - attention_mask_, sequence_lengths, request_batch_size, request_seq_len, stream_); + attention_mask_, sequence_lengths, request_batch_size, request_seq_len, request_seq_len, stream_); sync_check_cuda_error(); invokeGetPaddingOffset(&h_token_num, token_num_, @@ -281,7 +281,7 @@ void Bert::forward(std::vector* output_tensors, } case AttentionType::UNFUSED_PADDED_MHA: { invokeBuildEncoderAttentionMask( - attention_mask_, sequence_lengths, request_batch_size, request_seq_len, stream_); + attention_mask_, sequence_lengths, request_batch_size, request_seq_len, request_seq_len, stream_); sync_check_cuda_error(); h_token_num = request_batch_size * request_seq_len; bert_input_ptr = (T*)input_tensors->at(0).data; diff --git a/src/fastertransformer/models/bert_int8/BertINT8.cc b/src/fastertransformer/models/bert_int8/BertINT8.cc index 7c6347b..687fcb8 100644 --- a/src/fastertransformer/models/bert_int8/BertINT8.cc +++ b/src/fastertransformer/models/bert_int8/BertINT8.cc @@ -180,7 +180,7 @@ void BertINT8::forward(std::vector* output_tensors, switch (attention_type_) { case AttentionType::UNFUSED_MHA: { invokeBuildEncoderAttentionMask( - attention_mask_, sequence_lengths, request_batch_size, request_seq_len, stream_); + attention_mask_, sequence_lengths, request_batch_size, request_seq_len, request_seq_len, 
stream_); sync_check_cuda_error(); invokeGetPaddingOffset(&h_token_num, token_num_, @@ -206,7 +206,7 @@ void BertINT8::forward(std::vector* output_tensors, } case AttentionType::UNFUSED_PADDED_MHA: { invokeBuildEncoderAttentionMask( - attention_mask_, sequence_lengths, request_batch_size, request_seq_len, stream_); + attention_mask_, sequence_lengths, request_batch_size, request_seq_len, request_seq_len, stream_); sync_check_cuda_error(); h_token_num = request_batch_size * request_seq_len; bert_input_ptr = (T*)input_tensors->at(0).data; diff --git a/src/fastertransformer/models/vit/ViT.cc b/src/fastertransformer/models/vit/ViT.cc index e785f2b..9fc6585 100644 --- a/src/fastertransformer/models/vit/ViT.cc +++ b/src/fastertransformer/models/vit/ViT.cc @@ -415,7 +415,7 @@ bool ViTTransformer::setSeqLenVec(size_t batch_size) template void ViTTransformer::setDefaultMask(size_t batch_size) { - invokeBuildEncoderAttentionMask(mask_buf_, seq_len_vec_, batch_size, max_seq_len_, stream_); + invokeBuildEncoderAttentionMask(mask_buf_, seq_len_vec_, batch_size, max_seq_len_, max_seq_len_, stream_); } template diff --git a/src/fastertransformer/models/vit_int8/ViTINT8.cc b/src/fastertransformer/models/vit_int8/ViTINT8.cc index f610785..d00d6d1 100644 --- a/src/fastertransformer/models/vit_int8/ViTINT8.cc +++ b/src/fastertransformer/models/vit_int8/ViTINT8.cc @@ -462,7 +462,7 @@ bool ViTTransformerINT8::setSeqLenVec(size_t batch_size) template void ViTTransformerINT8::setDefaultMask(size_t batch_size) { - invokeBuildEncoderAttentionMask(mask_buf_, seq_len_vec_, batch_size, max_seq_len_, stream_); + invokeBuildEncoderAttentionMask(mask_buf_, seq_len_vec_, batch_size, max_seq_len_, max_seq_len_, stream_); } template -- Gitee From 8f4fb413b7db6a26506415d59adb94dddfebe3ad Mon Sep 17 00:00:00 2001 From: batya kroizer Date: Wed, 8 Feb 2023 11:43:57 +0200 Subject: [PATCH 3/8] fix vsl --- .../kernels/bert_preprocess_kernels.cu | 38 +++++---- .../kernels/bert_preprocess_kernels.h | 4 +- .../layers/ms_layers/attention.cc | 47 ++--------- .../layers/ms_layers/debug_utils.cc | 77 +++++++++++++++++ .../layers/ms_layers/debug_utils.h | 3 + .../layers/ms_layers/decoder.cc | 81 +++++------------- .../layers/ms_layers/encoder.cc | 83 ++++--------------- .../layers/ms_layers/param.h | 8 +- src/fastertransformer/models/bert/Bert.cc | 4 +- .../models/bert_int8/BertINT8.cc | 4 +- src/fastertransformer/models/vit/ViT.cc | 2 +- .../models/vit_int8/ViTINT8.cc | 2 +- 12 files changed, 152 insertions(+), 201 deletions(-) diff --git a/src/fastertransformer/kernels/bert_preprocess_kernels.cu b/src/fastertransformer/kernels/bert_preprocess_kernels.cu index 61e4fc1..dbfd2f5 100644 --- a/src/fastertransformer/kernels/bert_preprocess_kernels.cu +++ b/src/fastertransformer/kernels/bert_preprocess_kernels.cu @@ -101,36 +101,38 @@ __global__ void buildSequnceLength(const int *input, int *sequence_length, const template -__global__ void buildEncoderAttentionMaskKernel(T* attention_mask, const int* sequence_lengths, const int src_seq_len, const int tgt_seq_len) +__global__ void buildEncoderAttentionMaskKernel(T* attention_mask, const int* q_sequence_lengths, const int* kv_sequence_lengths, const int src_seq_len, const int tgt_seq_len) { // sequence_lengths: [batch_size] // attention_mask: [batch_size, 1, max_seq_len, max_seq_len] attention_mask += blockIdx.x * src_seq_len * tgt_seq_len; - const int length = sequence_lengths[blockIdx.x]; + const int q_length = q_sequence_lengths[blockIdx.x]; + const int kv_length = 
kv_sequence_lengths[blockIdx.x]; for (int i = threadIdx.x; i < src_seq_len * tgt_seq_len; i += blockDim.x) { - // int row_id = i / max_seq_len; + int row_id = i / tgt_seq_len; int col_id = i % tgt_seq_len; // if (row_id < length && col_id < length) { // TODO (bhsueh) check this modification is ok or not on other rmodel - if (col_id >= length) { + if (col_id >= q_length || row_id >= kv_length) { attention_mask[i] = (T)(0.0f); } } } template -__global__ void buildEncoderPositionBiasKernel(T* position_bias_src, T* position_bias_dst, const int* sequence_lengths, const int src_seq_len, const int tgt_seq_len, const int head_num) +__global__ void buildEncoderPositionBiasKernel(T* position_bias_src, T* position_bias_dst, const int* q_sequence_lengths, const int* kv_sequence_lengths, const int src_seq_len, const int tgt_seq_len, const int head_num) { // sequence_lengths: [batch_size] // position_bias: [batch_size, head_num, max_seq_len, max_seq_len] position_bias_src += blockIdx.x * head_num * src_seq_len * tgt_seq_len; - const int length = sequence_lengths[blockIdx.x]; + const int q_length = q_sequence_lengths[blockIdx.x]; + const int kv_length = kv_sequence_lengths[blockIdx.x]; for (int i = threadIdx.x; i < head_num * src_seq_len * tgt_seq_len; i += blockDim.x) { - // int row_id = i / max_seq_len; + int row_id = i / src_seq_len; int col_id = i % tgt_seq_len; // if (row_id < length && col_id < length) { // TODO (bhsueh) check this modification is ok or not on other rmodel - if (col_id < length) { + if (col_id < q_length && row_id < kv_length) { position_bias_dst[i] = position_bias_src[i]; } } @@ -138,33 +140,36 @@ __global__ void buildEncoderPositionBiasKernel(T* position_bias_src, T* position template void invokeBuildEncoderAttentionMask( - T* attention_mask, const int* sequence_lengths, const int batch_size, const int src_seq_len, const int tgt_seq_len, cudaStream_t stream) + T* attention_mask, const int* q_sequence_lengths, const int* kv_sequence_lengths, const int batch_size, const int src_seq_len, const int tgt_seq_len, cudaStream_t stream) { - buildEncoderAttentionMaskKernel<<>>(attention_mask, sequence_lengths, src_seq_len, tgt_seq_len); + buildEncoderAttentionMaskKernel<<>>(attention_mask, q_sequence_lengths, kv_sequence_lengths, src_seq_len, tgt_seq_len); } template void invokeBuildEncoderPositionBias( - T* position_bias_src, T* position_bias_dst, const int* sequence_lengths, const int batch_size, const int src_seq_len, const int tgt_seq_len, const int head_num, cudaStream_t stream) + T* position_bias_src, T* position_bias_dst, const int* q_sequence_lengths, const int* kv_sequence_lengths, const int batch_size, const int src_seq_len, const int tgt_seq_len, const int head_num, cudaStream_t stream) { cudaMemsetAsync(position_bias_dst, 0, batch_size * src_seq_len * tgt_seq_len * head_num * sizeof(T), stream); - buildEncoderPositionBiasKernel<<>>(position_bias_src, position_bias_dst, sequence_lengths, src_seq_len, tgt_seq_len, head_num); + buildEncoderPositionBiasKernel<<>>(position_bias_src, position_bias_dst, q_sequence_lengths, kv_sequence_lengths, src_seq_len, tgt_seq_len, head_num); } template void invokeBuildEncoderAttentionMask(float* attention_mask, - const int* sequence_lengths, + const int* q_sequence_lengths, + const int* kv_sequence_lengths, const int batch_size, const int src_seq_len, const int tgt_seq_len, cudaStream_t stream); template void invokeBuildEncoderAttentionMask(half* attention_mask, - const int* sequence_lengths, + const int* q_sequence_lengths, + const int* 
kv_sequence_lengths, const int batch_size, const int src_seq_len, const int tgt_seq_len, cudaStream_t stream); template void invokeBuildEncoderPositionBias(float* position_bias_src, float* position_bias_dst, - const int* sequence_lengths, + const int* q_sequence_lengths, + const int* kv_sequence_lengths, const int batch_size, const int src_seq_len, const int tgt_seq_len, @@ -172,7 +177,8 @@ template void invokeBuildEncoderPositionBias(float* position_bias_src, cudaStream_t stream); template void invokeBuildEncoderPositionBias(half* attention_mask, half* position_bias_dst, - const int* sequence_lengths, + const int* q_sequence_lengths, + const int* kv_sequence_lengths, const int batch_size, const int src_seq_len, const int tgt_seq_len, diff --git a/src/fastertransformer/kernels/bert_preprocess_kernels.h b/src/fastertransformer/kernels/bert_preprocess_kernels.h index 52d7ef6..da1f79e 100644 --- a/src/fastertransformer/kernels/bert_preprocess_kernels.h +++ b/src/fastertransformer/kernels/bert_preprocess_kernels.h @@ -32,10 +32,10 @@ void invokeGetPaddingOffset(size_t* h_token_num, template void invokeBuildEncoderAttentionMask( - T* attention_mask, const int* sequence_lengths, const int batch_size, const int src_seq_len, const int tgt_seq_len, cudaStream_t stream); + T* attention_mask, const int* q_sequence_lengths, const int* kv_sequence_lengths, const int batch_size, const int src_seq_len, const int tgt_seq_len, cudaStream_t stream); template void invokeBuildEncoderPositionBias( - T* position_bias_src, T* position_bias_dst, const int* sequence_lengths, const int batch_size, const int src_seq_len, const int tgt_seq_len, const int head_num, cudaStream_t stream); + T* position_bias_src, T* position_bias_dst, const int* q_sequence_lengths, const int* kv_sequence_lengths, const int batch_size, const int src_seq_len, const int tgt_seq_len, const int head_num, cudaStream_t stream); template void invokeBuildSequenceLength(const T * input, int batch_size, int *sequnce_length, int max_seq_length, int hidden_size,cudaStream_t stream); diff --git a/src/fastertransformer/layers/ms_layers/attention.cc b/src/fastertransformer/layers/ms_layers/attention.cc index 6f29f24..4df6b20 100644 --- a/src/fastertransformer/layers/ms_layers/attention.cc +++ b/src/fastertransformer/layers/ms_layers/attention.cc @@ -22,7 +22,6 @@ size_t GetAttnWorkspaceSize(attentionParamRun* param) size_t qk_buf_len = param->common_param->batch_size * param->common_param->head_num * param->common_param->src_seq_len * param->common_param->tgt_seq_len; size_t qkv_buf_2_len = param->common_param->batch_size * param->common_param->src_seq_len * param->common_param->hidden_size; size_t qkv_buf_3_len = qkv_buf_2_len; - size_t position_bias_buf_len = (param->attn.is_cross) ? 
param->common_param->batch_size * param->common_param->src_seq_len * param->common_param->tgt_seq_len : qk_buf_len; OptAllocator allocator(ALIGN_SIZE); param->attn.qkv_buf = allocator.Malloc(qkv_len * sizeof(T)); param->attn.q_buf_2 = allocator.Malloc(q_buf_2_len * sizeof(T)); @@ -32,8 +31,6 @@ size_t GetAttnWorkspaceSize(attentionParamRun* param) param->attn.qk_buf = allocator.Malloc(qk_buf_len * sizeof(T)); allocator.Free(param->attn.q_buf_2); allocator.Free(param->attn.output1); - if(param->attn.position_bias) param->attn.position_bias_buf = allocator.Malloc(position_bias_buf_len * sizeof(T)); - allocator.Free(param->attn.position_bias_buf); param->attn.qkv_buf_2 = allocator.Malloc(qkv_buf_2_len * sizeof(T)); allocator.Free(param->attn.output2); allocator.Free(param->attn.qk_buf); @@ -57,7 +54,6 @@ void forward_attn(T* inputs[], int in_len, T* output[], int out_len, attentionPa T* qkv_buf_3 = reinterpret_cast(static_cast(ws) + param->attn.qkv_buf_3); T* output1 = reinterpret_cast(static_cast(ws) + param->attn.output1); T* output2 = reinterpret_cast(static_cast(ws) + param->attn.output2); - T* position_bias_compress = (param->attn.position_bias) ? reinterpret_cast(static_cast(ws) + param->attn.position_bias_buf) : nullptr; int gemm_dims[] = { 3 * (int)param->common_param->hidden_size, (int)param->common_param->h_token_num, (int)param->common_param->hidden_size}; int gemm_lds[] = {3 * (int)param->common_param->hidden_size, (int)param->common_param->hidden_size, 3 * (int)param->common_param->hidden_size}; @@ -71,10 +67,6 @@ void forward_attn(T* inputs[], int in_len, T* output[], int out_len, attentionPa } T alpha = 1.0f; T beta = 0.0f; - if(param->attn.padding_offset != nullptr) - std::cout<<"param->attn.padding_offset != nullptr\n"; - if(param->attn.padding_offset == nullptr) - std::cout<<"param->attn.padding_offset == nullptr\n"; if (param->attn.is_cross) { gemm_dims[0] = param->common_param->hidden_size; gemm_dims[1] = param->common_param->h_token_num; @@ -95,13 +87,14 @@ void forward_attn(T* inputs[], int in_len, T* output[], int out_len, attentionPa &beta, param->common_param->cublas_handle, param->common_param->algo); - printTensor("qkv_buf",qkv_buf,10); - printTensor("encoder_output",encoder_output,10); - gemm_dims[0] = 2 * param->common_param->hidden_size; gemm_dims[1] = param->common_param->h_token_num2; + gemm_dims[2] = param->common_param->hidden_size; + gemm_lds[0] = 2 * param->common_param->hidden_size; + gemm_lds[1] = param->common_param->hidden_size; gemm_lds[2] = 2 * param->common_param->hidden_size; + T* weight_kv = reinterpret_cast(inputs[param->common_param->in_idx++]); CublasGemmWrapper(weight_kv, encoder_output, @@ -114,8 +107,6 @@ void forward_attn(T* inputs[], int in_len, T* output[], int out_len, attentionPa &beta, param->common_param->cublas_handle, param->common_param->algo); - printTensor("qkv_buf + (param->common_param->h_token_num) * param->common_param->hidden_size",qkv_buf + (param->common_param->h_token_num) * param->common_param->hidden_size,10); - T* bias_qkv = (param->attn.qkv_bias) ? 
reinterpret_cast(inputs[param->common_param->in_idx++]) : nullptr; if (param->attn.padding_offset == nullptr) { invokeCrossAddFusedQKVBiasTranspose(q_buf_2, @@ -131,9 +122,6 @@ void forward_attn(T* inputs[], int in_len, T* output[], int out_len, attentionPa param->common_param->stream); } else{ - std::cout<<"param->common_param->h_token_num"<common_param->h_token_num<common_param->tgt_seq_len"<common_param->tgt_seq_len<attn.padding_offset2, param->common_param->stream); } - std::cout<<"param->common_param->h_token_num2: "<common_param->h_token_num2<(inputs[param->common_param->in_idx++]); @@ -223,31 +207,15 @@ void forward_attn(T* inputs[], int in_len, T* output[], int out_len, attentionPa param->common_param->batch_size * param->common_param->head_num, param->common_param->cublas_handle, param->common_param->algo); - printTensor("qk_buf",qk_buf,10); - std::cout<<"param->common_param->tgt_seq_len"<common_param->tgt_seq_len<attn.mask) ? reinterpret_cast(inputs[param->common_param->in_idx++]) : nullptr; - // printTensor("attention_mask",attention_mask,1*128*128); T* position_bias = (param->attn.position_bias) ? reinterpret_cast(inputs[param->common_param->in_idx++]) : nullptr; if (param->attn.padding_offset != nullptr){ invokeBuildEncoderAttentionMask( - attention_mask, (param->attn.is_cross) ? param->attn.d_sequence_length2 : param->attn.d_sequence_length, param->common_param->batch_size, param->common_param->src_seq_len, (param->attn.is_cross) ? param->common_param->tgt_seq_len : param->common_param->src_seq_len, param->common_param->stream); - if (position_bias!= nullptr) { - invokeBuildEncoderPositionBias(position_bias, - position_bias_compress, - (param->attn.is_cross) ? param->attn.d_sequence_length2 : param->attn.d_sequence_length, - param->common_param->batch_size, - param->common_param->src_seq_len, - (param->attn.is_cross) ? param->common_param->tgt_seq_len : param->common_param->src_seq_len, - (param->attn.is_cross) ? 1 : param->common_param->head_num, - param->common_param->stream); - + attention_mask, param->attn.d_sequence_length2, param->attn.d_sequence_length, param->common_param->batch_size, param->common_param->src_seq_len, param->common_param->tgt_seq_len, param->common_param->stream); } - } - if(param->attn.is_cross) printTensor("position_bias_invokeBuild",position_bias,256*128); - if(param->attn.is_cross) printTensor("mask_invokeBuild",attention_mask,256*128); invokeMixMaskedSoftMax(static_cast(qk_buf), attention_mask, - position_bias_compress, + position_bias, param->common_param->batch_size, param->common_param->src_seq_len, param->common_param->tgt_seq_len, @@ -255,9 +223,6 @@ void forward_attn(T* inputs[], int in_len, T* output[], int out_len, attentionPa (param->attn.is_cross && param->attn.position_bias) ? 
1 : int(param->common_param->head_num), (T)(param->attn.scale), param->common_param->stream); - printTensor("qk_bufMixMasked",qk_buf,10); - // std::cout<<"param->attn.scale: "<attn.scale<common_param->head_size; diff --git a/src/fastertransformer/layers/ms_layers/debug_utils.cc b/src/fastertransformer/layers/ms_layers/debug_utils.cc index 6e35f0e..4f9e5bd 100644 --- a/src/fastertransformer/layers/ms_layers/debug_utils.cc +++ b/src/fastertransformer/layers/ms_layers/debug_utils.cc @@ -29,6 +29,83 @@ void printTensor(char* str, T* input, int size) free(input_host); } +void printCommonParam(CommonParam param) +{ + std::cout<<"print common Param\n"; + std::cout<<"batch_size = "<common_param.batch_size * param->common_param.src_seq_len * param->common_param.hidden_size; size_t ffn_ws_size = @@ -35,14 +34,10 @@ size_t GetDecoderLayerWorkspaceSize(decoderParamRun* param) param->decoder.compress_buf = allocator.Malloc(compress_buffer_len * sizeof(T)); param->decoder.compress_buf2 = allocator.Malloc(compress_buffer_len * sizeof(T)); param->decoder.normed_from_tensor_buf = allocator.Malloc(attn_out_size * sizeof(T)); - // param->attn1.attn.d_sequence_length=nullptr; - // param->attn1.attn.padding_offset=nullptr; - // param->attn1.attn.d_sequence_length2=nullptr; - // param->attn1.attn.padding_offset2=nullptr; - int tmp =param->common_param.tgt_seq_len; + int tgt_seq_len =param->common_param.tgt_seq_len; param->common_param.tgt_seq_len = param->common_param.src_seq_len; param->decoder.attn_ws_buf = allocator.Malloc(GetAttnWorkspaceSize(&(param->attn1))); - param->common_param.tgt_seq_len = tmp; + param->common_param.tgt_seq_len = tgt_seq_len; param->decoder.attn_out_buf = allocator.Malloc(attn_out_size * sizeof(T)); allocator.Free(param->decoder.attn_ws_buf); if (!param->decoder.layernorm_post) @@ -50,10 +45,6 @@ size_t GetDecoderLayerWorkspaceSize(decoderParamRun* param) param->decoder.normed_attn_out_buf = allocator.Malloc(attn_out_size * sizeof(T)); if (param->decoder.layernorm_post) allocator.Free(param->decoder.normed_from_tensor_buf); - // param->attn2.attn.d_sequence_length=nullptr; - // param->attn2.attn.padding_offset=nullptr; - // param->attn2.attn.d_sequence_length2=nullptr; - // param->attn2.attn.padding_offset2=nullptr; param->decoder.attn2_ws_buf = allocator.Malloc(GetAttnWorkspaceSize(&(param->attn2))); param->decoder.attn2_out_buf = allocator.Malloc(attn_out_size * sizeof(T)); allocator.Free(param->decoder.attn2_ws_buf); @@ -102,7 +93,6 @@ void forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, dec param->common_param.in_idx = 0; size_t h_token_num = param->common_param.h_token_num = param->common_param.batch_size * param->common_param.src_seq_len; size_t h_token_num2 = param->common_param.h_token_num2 = param->common_param.batch_size * param->common_param.tgt_seq_len; - param->decoder.padding_offset = nullptr; int* d_sequence_lengths = nullptr; int* d_sequence_lengths2 = nullptr; int* padding_offset = nullptr; @@ -111,45 +101,30 @@ void forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, dec T* from_tensor = input_tensor; int idx_encoder_out = param->attn1.attn.position_bias ? 
7 : 10; T* encoder_output = reinterpret_cast(inputs[idx_encoder_out]); - T* compress_buffer; - T* compress_buffer2; int *input_ids = reinterpret_cast(inputs[in_len-1]); int *input_ids2 = reinterpret_cast(inputs[in_len-2]); - printTensor("input_ids2",(int*)(input_ids2),1*128); - - compress_buffer = reinterpret_cast(static_cast(ws) + param->decoder.compress_buf); - compress_buffer2 = reinterpret_cast(static_cast(ws) + param->decoder.compress_buf2); + T* compress_buffer = reinterpret_cast(static_cast(ws) + param->decoder.compress_buf); + T* compress_buffer2 = reinterpret_cast(static_cast(ws) + param->decoder.compress_buf2); padding_offset = reinterpret_cast(static_cast(ws) + param->decoder.padding_offset_buf); padding_offset2 = reinterpret_cast(static_cast(ws) + param->decoder.padding_offset_buf2); d_sequence_lengths = reinterpret_cast(static_cast(ws) + param->decoder.d_sequence_lengths_offset_buf); d_sequence_lengths2 = reinterpret_cast(static_cast(ws) + param->decoder.d_sequence_lengths_offset_buf2); - // param->decoder.d_sequence_length = d_sequence_lengths; - // param->decoder.d_sequence_length2 = d_sequence_lengths2; size_t* d_token_num = reinterpret_cast(static_cast(ws) + param->decoder.d_token_num_buf); size_t* d_token_num2 = reinterpret_cast(static_cast(ws) + param->decoder.d_token_num_buf2); -std::cout<<"param->common_param.src_seq_len:"<common_param.src_seq_len<common_param.head_num * param->common_param.head_size"<common_param.head_num * param->common_param.head_size<common_param.src_seq_len, param); - if (h_token_num * 2 <= param->common_param.batch_size * param->common_param.src_seq_len) { - param->common_param.h_token_num = h_token_num; - // param->decoder.padding_offset = padding_offset; - from_tensor = compress_buffer; - } + if (h_token_num * 2 <= param->common_param.batch_size * param->common_param.src_seq_len) { + param->common_param.h_token_num = h_token_num; + from_tensor = compress_buffer; + } else { + padding_offset = nullptr; + } GetCompressBuffer(compress_buffer2, encoder_output, input_ids2, padding_offset2, d_sequence_lengths2, h_token_num2, d_token_num2, param->common_param.tgt_seq_len, param); - if (h_token_num2 * 2 <= param->common_param.batch_size * param->common_param.tgt_seq_len) { - param->common_param.h_token_num2 = h_token_num2; - // ppadding_offset2 = padding_offset2; - inputs[idx_encoder_out] = compress_buffer2; - } - else{ - padding_offset =nullptr; -padding_offset2=nullptr; - } - - if(padding_offset2 != nullptr) - std::cout<<"param->decoder.padding_offset2 != nullptr\n"; - if(padding_offset2 == nullptr) - std::cout<<"param->decoder.padding_offset2 == nullptr\n"; + if (h_token_num2 * 2 <= param->common_param.batch_size * param->common_param.tgt_seq_len) { + param->common_param.h_token_num2 = h_token_num2; + inputs[idx_encoder_out] = compress_buffer2; + } else { + padding_offset2 = nullptr; + } h_token_num = param->common_param.h_token_num; h_token_num2 = param->common_param.h_token_num2; T* attn_out = reinterpret_cast(static_cast(ws) + param->decoder.attn_out_buf); @@ -167,12 +142,10 @@ padding_offset2=nullptr; T* tmp_out1 = reinterpret_cast(output[0]); T* out_buf = tmp_out; if (padding_offset != nullptr) { - tmp_out1 = compress_buffer2; + tmp_out1 = compress_buffer; } T* gamma1 = reinterpret_cast(inputs[param->common_param.in_idx++]); T* beta1 = (param->decoder.has_beta) ? 
reinterpret_cast(inputs[param->common_param.in_idx++]) : nullptr; - std::cout<<"h_token_num decoder"<(from_tensor), gamma1, @@ -181,9 +154,6 @@ padding_offset2=nullptr; param->common_param.hidden_size, param->common_param.stream, param->decoder.eps1); -printTensor("encoder_output",encoder_output,10); - printTensor("normed_from_tensor",normed_from_tensor,10); - inputs[--param->common_param.in_idx] = normed_from_tensor; int in_idx = param->common_param.in_idx; // if attention is embedded inside an decoder - fuse the bias to next layer normalization @@ -191,11 +161,13 @@ printTensor("encoder_output",encoder_output,10); param->attn1.attn.projection_bias = false; param->attn1.attn.d_sequence_length = d_sequence_lengths; param->attn1.attn.padding_offset = padding_offset; - int tmp = param->common_param.tgt_seq_len; + param->attn1.attn.d_sequence_length2 = d_sequence_lengths; + param->attn1.attn.padding_offset2 = padding_offset; + int tgt_seq_len = param->common_param.tgt_seq_len; param->common_param.tgt_seq_len = param->common_param.src_seq_len; forward_attn( reinterpret_cast(&inputs[param->common_param.in_idx]), in_len, &attn_out, 1, &(param->attn1), attn_ws); - param->common_param.tgt_seq_len = tmp; + param->common_param.tgt_seq_len = tgt_seq_len; param->attn1.attn.projection_bias = is_projection_bias; param->common_param.in_idx = param->common_param.in_idx + in_idx; T* projection_bias = @@ -213,8 +185,6 @@ printTensor("encoder_output",encoder_output,10); param->common_param.hidden_size, param->common_param.stream, param->decoder.eps2); -printTensor("normed_attn_out",normed_attn_out,10); - inputs[--param->common_param.in_idx] = normed_attn_out; in_idx = param->common_param.in_idx; is_projection_bias = param->attn2.attn.projection_bias; @@ -223,12 +193,8 @@ printTensor("normed_attn_out",normed_attn_out,10); param->attn2.attn.padding_offset = padding_offset; param->attn2.attn.d_sequence_length2 = d_sequence_lengths2; param->attn2.attn.padding_offset2 = padding_offset2; - // printTensor("inputs[param->common_param.in_idx]",(T*)(inputs[param->common_param.in_idx+1]),param->common_param.src_seq_len*param->common_param.head_num * param->common_param.head_size); forward_attn( reinterpret_cast(&inputs[param->common_param.in_idx]), in_len, &attn2_out, 1, &(param->attn2), attn2_ws); - // param->decoder.d_sequence_length = param->attn2.attn.d_sequence_length; - // param->decoder.padding_offset = param->attn2.attn.padding_offset; - // h_token_num = h_token_num2; param->attn2.attn.projection_bias = is_projection_bias; param->common_param.in_idx = param->common_param.in_idx + in_idx; T* projection_bias2 = @@ -249,7 +215,6 @@ printTensor("normed_attn_out",normed_attn_out,10); param->decoder.eps3); } else { - invokeGeneralAddBiasResidualT5PreLayerNormCast(attn2_out, reinterpret_cast(normed_attn2_out), attn_out, @@ -262,7 +227,6 @@ printTensor("normed_attn_out",normed_attn_out,10); param->decoder.eps3); } inputs[--param->common_param.in_idx] = normed_attn2_out; - printTensor("attn2_out",attn2_out,10); if (param->ffn_param.ffn_param.ffn_fp16 == false) { forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, &(param->ffn_param), ffn_ws); } @@ -302,8 +266,6 @@ printTensor("normed_attn_out",normed_attn_out,10); } out_buf = tmp_out1; } - printTensor("out_buf",out_buf,10); - if (padding_offset != nullptr) { cudaMemsetAsync(output[0], 0, @@ -312,7 +274,6 @@ printTensor("normed_attn_out",normed_attn_out,10); invokeRebuildPadding( (T*)output[0], out_buf, padding_offset, h_token_num, param->common_param.hidden_size, 
param->common_param.stream); } - // printTensor("output[0]",(T*)output[0],param->common_param.src_seq_len*param->common_param.head_num * param->common_param.head_size); return; } diff --git a/src/fastertransformer/layers/ms_layers/encoder.cc b/src/fastertransformer/layers/ms_layers/encoder.cc index 6a6fb9a..fdb167c 100644 --- a/src/fastertransformer/layers/ms_layers/encoder.cc +++ b/src/fastertransformer/layers/ms_layers/encoder.cc @@ -36,10 +36,6 @@ size_t GetEncoderLayerWorkspaceSize(encoderParamRun* param) param->encoder.normed_from_tensor_buf = (!param->encoder.layernorm_post || param->attn.attn.position_bias) ? allocator.Malloc(normed_from_tensor_len * sizeof(T)) : -1; - // param->attn.attn.d_sequence_length=nullptr; - // param->attn.attn.padding_offset=nullptr; - // param->attn.attn.d_sequence_length2=nullptr; - // param->attn.attn.padding_offset2=nullptr; param->encoder.attn_ws_buf = allocator.Malloc(GetAttnWorkspaceSize(&(param->attn))); param->encoder.attn_out_buf = allocator.Malloc(attn_out_len * sizeof(T)); allocator.Free(param->encoder.d_token_num_buf); @@ -56,8 +52,8 @@ size_t GetEncoderLayerWorkspaceSize(encoderParamRun* param) allocator.Malloc(ffn_len * sizeof(T)); param->encoder.tmp_out_buf = param->ffn_param.ffn_param.ffn_fp16 ? allocator.Malloc(tmp_out_size * sizeof(half)) : allocator.Malloc(tmp_out_size * sizeof(T)); + param->encoder.norm_out_buf = allocator.Malloc(tmp_out_size * sizeof(T)); return allocator.total_size(); - // return 0; } template void GetCompressBuffer(T* compress_buffer, T* from_tensor, int *input_ids, int* padding_offset, int* d_sequence_lengths, size_t &h_token_num, size_t* d_token_num, size_t seq_len, encoderParamRun* param) @@ -88,58 +84,23 @@ void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, enc { param->common_param.in_idx = 0; size_t h_token_num = param->common_param.batch_size * param->common_param.src_seq_len; - param->common_param.h_token_num = h_token_num; - param->encoder.padding_offset = nullptr; - int* padding_offset = nullptr; - int* d_sequence_lengths = nullptr; + param->common_param.h_token_num = param->common_param.h_token_num2 = h_token_num; T* input_tensor = reinterpret_cast(inputs[param->common_param.in_idx++]); T* from_tensor = input_tensor; - T* compress_buffer; int *input_ids = reinterpret_cast(inputs[in_len-1]); - compress_buffer = reinterpret_cast(static_cast(ws) + param->encoder.compress_buf); - padding_offset = reinterpret_cast(static_cast(ws) + param->encoder.padding_offset_buf); - d_sequence_lengths = reinterpret_cast(static_cast(ws) + param->encoder.d_sequence_lengths_offset_buf); - // param->encoder.d_sequence_length = d_sequence_lengths; + T* compress_buffer = reinterpret_cast(static_cast(ws) + param->encoder.compress_buf); + int* padding_offset = reinterpret_cast(static_cast(ws) + param->encoder.padding_offset_buf); + int* d_sequence_lengths = reinterpret_cast(static_cast(ws) + param->encoder.d_sequence_lengths_offset_buf); size_t* d_token_num = reinterpret_cast(static_cast(ws) + param->encoder.d_token_num_buf); param->common_param.eft = false; - // printTensor("from_tensor",(T*)(from_tensor),10); -// std::cout<<"param->common_param.head_num * param->common_param.head_size"<common_param.head_num * param->common_param.head_size<common_param.batch_size, d_sequence_lengths, param->common_param.src_seq_len, param->common_param.stream); - // invokeGetPaddingOffset(&h_token_num, - // d_token_num, - // padding_offset, - // d_sequence_lengths, - // param->common_param.batch_size, - // 
param->common_param.src_seq_len, - // param->common_param.stream); - // if (h_token_num * 2 <= param->common_param.batch_size * param->common_param.src_seq_len) { - // param->common_param.eft = true; - // invokeRemovePadding(compress_buffer, - // (const T*)from_tensor, - // padding_offset, - // h_token_num, - // param->common_param.head_num * param->common_param.head_size, - // param->common_param.stream); - // param->common_param.h_token_num = h_token_num; - // param->encoder.padding_offset = padding_offset; - // from_tensor = compress_buffer; - // } GetCompressBuffer(compress_buffer, from_tensor, input_ids, padding_offset, d_sequence_lengths, h_token_num, d_token_num, param->common_param.src_seq_len, param); - if (h_token_num * 2 <= param->common_param.batch_size * param->common_param.src_seq_len) { - param->common_param.h_token_num = h_token_num; - // param->encoder.padding_offset = padding_offset; - from_tensor = compress_buffer; - } - // std::cout<common_param.src_seq_len*512); - - if(padding_offset != nullptr) - std::cout<<"param->encoder.padding_offset != nullptr\n"; - if(padding_offset == nullptr) - std::cout<<"param->encoder.padding_offset == nullptr\n"; + if (h_token_num * 2 <= param->common_param.batch_size * param->common_param.src_seq_len) { + param->common_param.h_token_num = h_token_num; + from_tensor = compress_buffer; + } else { + padding_offset = nullptr; + } h_token_num = param->common_param.h_token_num; - std::cout<<"h_token_num: "<(static_cast(ws) + param->encoder.attn_out_buf); T* normed_from_tensor = reinterpret_cast(static_cast(ws) + param->encoder.normed_from_tensor_buf); T* attn_ws = reinterpret_cast(static_cast(ws) + param->encoder.attn_ws_buf); @@ -154,10 +115,10 @@ void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, enc if (padding_offset != nullptr) { tmp_out1 = compress_buffer; } + T* norm_out = reinterpret_cast(static_cast(ws) + param->encoder.norm_out_buf); if (param->encoder.layernorm_post == false || param->attn.attn.position_bias) { T* gamma1 = reinterpret_cast(inputs[param->common_param.in_idx++]); T* beta1 = (param->encoder.has_beta) ? 
reinterpret_cast(inputs[param->common_param.in_idx++]) : nullptr; - // printTensor("from_tensor",(T*)(from_tensor),512*128); invokeGeneralT5LayerNorm(normed_from_tensor, reinterpret_cast(from_tensor), gamma1, @@ -166,23 +127,19 @@ void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, enc param->common_param.hidden_size, param->common_param.stream, param->encoder.eps1); - // std::cout<<"param->encoder.eps1: "<encoder.eps1<common_param.in_idx] = normed_from_tensor; - // if attention is embedded inside an encoder - fuse the bias to next layer normalization bool is_projection_bias = param->attn.attn.projection_bias; param->attn.attn.projection_bias = false; int in_idx = param->common_param.in_idx; param->attn.attn.d_sequence_length = d_sequence_lengths; param->attn.attn.padding_offset = padding_offset; + param->attn.attn.d_sequence_length2 = d_sequence_lengths; + param->attn.attn.padding_offset2 = padding_offset; forward_attn( reinterpret_cast(&inputs[param->common_param.in_idx]), in_len, &attn_out, 1, &(param->attn), attn_ws); - // printTensor("out\n",(T*)(attn_out),10); - param->common_param.in_idx = param->attn.common_param->in_idx + in_idx; param->attn.attn.projection_bias = is_projection_bias; T* projection_bias = @@ -202,8 +159,6 @@ void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, enc param->common_param.hidden_size, param->common_param.stream, param->encoder.eps2); - // printTensor("out\n",(T*)(normed_attn_out),10); - } else { invokeGeneralAddBiasResidualT5PreLayerNormCast(attn_out, @@ -244,9 +199,6 @@ void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, enc param->encoder.eps1); } } - // std::cout<<"param->encoder.eps2: "<encoder.eps2<common_param.in_idx] = normed_attn_out; if (param->ffn_param.ffn_param.ffn_fp16 == false) { forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, ¶m->ffn_param, ffn_ws); @@ -254,7 +206,6 @@ void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, enc else { forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, ¶m->ffn_param, ffn_ws); } - // isNan("tmp_out",(T*)tmp_out,param->common_param.src_seq_len*param->common_param.src_seq_len); T* ffn_bias = (param->ffn_param.ffn_param.ffn_bias) ? 
reinterpret_cast(inputs[param->common_param.in_idx++]) : nullptr; if (param->encoder.layernorm_post == true && !param->attn.attn.position_bias) { @@ -317,7 +268,6 @@ void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, enc out_buf = tmp_out1; } } - if (padding_offset != nullptr) { cudaMemsetAsync(output[0], 0, @@ -326,11 +276,6 @@ void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, enc invokeRebuildPadding( (T*)output[0], out_buf, padding_offset, h_token_num, param->common_param.hidden_size, param->common_param.stream); } - // std::cout<encoder.padding_offset != nullptr\n"; - if(padding_offset == nullptr) - std::cout<<"param->encoder.padding_offset == nullptr\n"; return; } diff --git a/src/fastertransformer/layers/ms_layers/param.h b/src/fastertransformer/layers/ms_layers/param.h index a103db3..b69a5ff 100644 --- a/src/fastertransformer/layers/ms_layers/param.h +++ b/src/fastertransformer/layers/ms_layers/param.h @@ -60,7 +60,6 @@ typedef struct { size_t qk_buf; size_t qkv_buf_2; size_t qkv_buf_3; - size_t position_bias_buf; bool mask; int* padding_offset; int* d_sequence_length; @@ -88,10 +87,6 @@ typedef struct { size_t ffn_ws_buf; size_t normed_attn_out_buf; size_t normed_attn2_out_buf; - int* padding_offset; - int* d_sequence_length; - int* padding_offset2; - int* d_sequence_length2; size_t compress_buf; size_t d_token_num_buf; size_t padding_offset_buf; @@ -121,12 +116,11 @@ typedef struct { size_t normed_attn_out_buf; size_t ffn_ws_buf; size_t tmp_out_buf; - int* padding_offset; - int* d_sequence_length; size_t compress_buf; size_t d_token_num_buf; size_t padding_offset_buf; size_t d_sequence_lengths_offset_buf; + size_t norm_out_buf; } encoderParam; typedef struct { diff --git a/src/fastertransformer/models/bert/Bert.cc b/src/fastertransformer/models/bert/Bert.cc index e0c056e..31ef8f0 100644 --- a/src/fastertransformer/models/bert/Bert.cc +++ b/src/fastertransformer/models/bert/Bert.cc @@ -255,7 +255,7 @@ void Bert::forward(std::vector* output_tensors, switch (attention_type_) { case AttentionType::UNFUSED_MHA: { invokeBuildEncoderAttentionMask( - attention_mask_, sequence_lengths, request_batch_size, request_seq_len, request_seq_len, stream_); + attention_mask_, sequence_lengths, sequence_lengths, request_batch_size, request_seq_len, request_seq_len, stream_); sync_check_cuda_error(); invokeGetPaddingOffset(&h_token_num, token_num_, @@ -281,7 +281,7 @@ void Bert::forward(std::vector* output_tensors, } case AttentionType::UNFUSED_PADDED_MHA: { invokeBuildEncoderAttentionMask( - attention_mask_, sequence_lengths, request_batch_size, request_seq_len, request_seq_len, stream_); + attention_mask_, sequence_lengths, sequence_lengths, request_batch_size, request_seq_len, request_seq_len, stream_); sync_check_cuda_error(); h_token_num = request_batch_size * request_seq_len; bert_input_ptr = (T*)input_tensors->at(0).data; diff --git a/src/fastertransformer/models/bert_int8/BertINT8.cc b/src/fastertransformer/models/bert_int8/BertINT8.cc index 687fcb8..872b383 100644 --- a/src/fastertransformer/models/bert_int8/BertINT8.cc +++ b/src/fastertransformer/models/bert_int8/BertINT8.cc @@ -180,7 +180,7 @@ void BertINT8::forward(std::vector* output_tensors, switch (attention_type_) { case AttentionType::UNFUSED_MHA: { invokeBuildEncoderAttentionMask( - attention_mask_, sequence_lengths, request_batch_size, request_seq_len, request_seq_len, stream_); + attention_mask_, sequence_lengths, sequence_lengths, request_batch_size, request_seq_len, 
request_seq_len, stream_); sync_check_cuda_error(); invokeGetPaddingOffset(&h_token_num, token_num_, @@ -206,7 +206,7 @@ void BertINT8::forward(std::vector* output_tensors, } case AttentionType::UNFUSED_PADDED_MHA: { invokeBuildEncoderAttentionMask( - attention_mask_, sequence_lengths, request_batch_size, request_seq_len, request_seq_len, stream_); + attention_mask_, sequence_lengths, sequence_lengths, request_batch_size, request_seq_len, request_seq_len, stream_); sync_check_cuda_error(); h_token_num = request_batch_size * request_seq_len; bert_input_ptr = (T*)input_tensors->at(0).data; diff --git a/src/fastertransformer/models/vit/ViT.cc b/src/fastertransformer/models/vit/ViT.cc index 9fc6585..1595d33 100644 --- a/src/fastertransformer/models/vit/ViT.cc +++ b/src/fastertransformer/models/vit/ViT.cc @@ -415,7 +415,7 @@ bool ViTTransformer::setSeqLenVec(size_t batch_size) template void ViTTransformer::setDefaultMask(size_t batch_size) { - invokeBuildEncoderAttentionMask(mask_buf_, seq_len_vec_, batch_size, max_seq_len_, max_seq_len_, stream_); + invokeBuildEncoderAttentionMask(mask_buf_, seq_len_vec_, seq_len_vec_, batch_size, max_seq_len_, max_seq_len_, stream_); } template diff --git a/src/fastertransformer/models/vit_int8/ViTINT8.cc b/src/fastertransformer/models/vit_int8/ViTINT8.cc index d00d6d1..ed35a4e 100644 --- a/src/fastertransformer/models/vit_int8/ViTINT8.cc +++ b/src/fastertransformer/models/vit_int8/ViTINT8.cc @@ -462,7 +462,7 @@ bool ViTTransformerINT8::setSeqLenVec(size_t batch_size) template void ViTTransformerINT8::setDefaultMask(size_t batch_size) { - invokeBuildEncoderAttentionMask(mask_buf_, seq_len_vec_, batch_size, max_seq_len_, max_seq_len_, stream_); + invokeBuildEncoderAttentionMask(mask_buf_, seq_len_vec_, seq_len_vec_, batch_size, max_seq_len_, max_seq_len_, stream_); } template -- Gitee From e8822a18e749ad0936a7d2697ecc64134719073d Mon Sep 17 00:00:00 2001 From: batya kroizer Date: Wed, 8 Feb 2023 14:17:57 +0200 Subject: [PATCH 4/8] fix --- .../kernels/bert_preprocess_kernels.cu | 45 +------------------ .../kernels/bert_preprocess_kernels.h | 3 -- .../layers/ms_layers/attention.cc | 12 ++--- 3 files changed, 8 insertions(+), 52 deletions(-) diff --git a/src/fastertransformer/kernels/bert_preprocess_kernels.cu b/src/fastertransformer/kernels/bert_preprocess_kernels.cu index dbfd2f5..c8284a5 100644 --- a/src/fastertransformer/kernels/bert_preprocess_kernels.cu +++ b/src/fastertransformer/kernels/bert_preprocess_kernels.cu @@ -119,24 +119,6 @@ __global__ void buildEncoderAttentionMaskKernel(T* attention_mask, const int* q_ } } -template -__global__ void buildEncoderPositionBiasKernel(T* position_bias_src, T* position_bias_dst, const int* q_sequence_lengths, const int* kv_sequence_lengths, const int src_seq_len, const int tgt_seq_len, const int head_num) -{ - // sequence_lengths: [batch_size] - // position_bias: [batch_size, head_num, max_seq_len, max_seq_len] - position_bias_src += blockIdx.x * head_num * src_seq_len * tgt_seq_len; - const int q_length = q_sequence_lengths[blockIdx.x]; - const int kv_length = kv_sequence_lengths[blockIdx.x]; - for (int i = threadIdx.x; i < head_num * src_seq_len * tgt_seq_len; i += blockDim.x) { - int row_id = i / src_seq_len; - int col_id = i % tgt_seq_len; - // if (row_id < length && col_id < length) { - // TODO (bhsueh) check this modification is ok or not on other rmodel - if (col_id < q_length && row_id < kv_length) { - position_bias_dst[i] = position_bias_src[i]; - } - } -} template void invokeBuildEncoderAttentionMask( @@ 
-144,13 +126,7 @@ void invokeBuildEncoderAttentionMask( { buildEncoderAttentionMaskKernel<<>>(attention_mask, q_sequence_lengths, kv_sequence_lengths, src_seq_len, tgt_seq_len); } -template -void invokeBuildEncoderPositionBias( - T* position_bias_src, T* position_bias_dst, const int* q_sequence_lengths, const int* kv_sequence_lengths, const int batch_size, const int src_seq_len, const int tgt_seq_len, const int head_num, cudaStream_t stream) -{ - cudaMemsetAsync(position_bias_dst, 0, batch_size * src_seq_len * tgt_seq_len * head_num * sizeof(T), stream); - buildEncoderPositionBiasKernel<<>>(position_bias_src, position_bias_dst, q_sequence_lengths, kv_sequence_lengths, src_seq_len, tgt_seq_len, head_num); -} + template void invokeBuildEncoderAttentionMask(float* attention_mask, const int* q_sequence_lengths, @@ -166,24 +142,7 @@ template void invokeBuildEncoderAttentionMask(half* attention_mask, const int src_seq_len, const int tgt_seq_len, cudaStream_t stream); -template void invokeBuildEncoderPositionBias(float* position_bias_src, - float* position_bias_dst, - const int* q_sequence_lengths, - const int* kv_sequence_lengths, - const int batch_size, - const int src_seq_len, - const int tgt_seq_len, - const int head_num, - cudaStream_t stream); -template void invokeBuildEncoderPositionBias(half* attention_mask, - half* position_bias_dst, - const int* q_sequence_lengths, - const int* kv_sequence_lengths, - const int batch_size, - const int src_seq_len, - const int tgt_seq_len, - const int head_num, - cudaStream_t stream); + __global__ void getTrtPaddingOffsetKernel(int* trt_mha_padding_offset, const int* sequence_length, const int batch_size) { // use for get tensorrt fused mha padding offset diff --git a/src/fastertransformer/kernels/bert_preprocess_kernels.h b/src/fastertransformer/kernels/bert_preprocess_kernels.h index da1f79e..19c1b28 100644 --- a/src/fastertransformer/kernels/bert_preprocess_kernels.h +++ b/src/fastertransformer/kernels/bert_preprocess_kernels.h @@ -33,9 +33,6 @@ void invokeGetPaddingOffset(size_t* h_token_num, template void invokeBuildEncoderAttentionMask( T* attention_mask, const int* q_sequence_lengths, const int* kv_sequence_lengths, const int batch_size, const int src_seq_len, const int tgt_seq_len, cudaStream_t stream); -template -void invokeBuildEncoderPositionBias( - T* position_bias_src, T* position_bias_dst, const int* q_sequence_lengths, const int* kv_sequence_lengths, const int batch_size, const int src_seq_len, const int tgt_seq_len, const int head_num, cudaStream_t stream); template void invokeBuildSequenceLength(const T * input, int batch_size, int *sequnce_length, int max_seq_length, int hidden_size,cudaStream_t stream); diff --git a/src/fastertransformer/layers/ms_layers/attention.cc b/src/fastertransformer/layers/ms_layers/attention.cc index 1cb85a4..39ded5a 100644 --- a/src/fastertransformer/layers/ms_layers/attention.cc +++ b/src/fastertransformer/layers/ms_layers/attention.cc @@ -217,15 +217,15 @@ size_t GetAttnWorkspaceSize(attentionParamRun* param) size_t size = 0; typedef typename std::conditional::value, cutlass::half_t, float>::type Type; fusedCutlassMhaDispatch fuse; - // if (fuse.isSupport(param)) { - // size = fuse.getWorkspaceSize(param); - // param->attn.fmha_type = FmhaType_CutlassFix; - // } - // else { + if (fuse.isSupport(param)) { + size = fuse.getWorkspaceSize(param); + param->attn.fmha_type = FmhaType_CutlassFix; + } + else { unfusedMhaDispatch unfuse; size = unfuse.getWorkspaceSize(param); param->attn.fmha_type = FmhaType_UnFused; 
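        // Note: the re-enabled dispatch above records, at workspace-sizing time,
        // which attention path will run. If the CUTLASS fused MHA reports the
        // configuration as supported, its workspace size is used and fmha_type is
        // set to FmhaType_CutlassFix; otherwise the unfused path sizes the
        // workspace and fmha_type is set to FmhaType_UnFused, so forward_attn can
        // later branch on the same fmha_type value.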
- // } + } return size; } -- Gitee From 13d76b4071420248787121bc6ea31177dd1af3cd Mon Sep 17 00:00:00 2001 From: batya kroizer Date: Wed, 8 Feb 2023 15:22:54 +0200 Subject: [PATCH 5/8] fix issue --- src/fastertransformer/layers/ms_layers/encoder.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fastertransformer/layers/ms_layers/encoder.cc b/src/fastertransformer/layers/ms_layers/encoder.cc index fdb167c..0c05772 100644 --- a/src/fastertransformer/layers/ms_layers/encoder.cc +++ b/src/fastertransformer/layers/ms_layers/encoder.cc @@ -35,7 +35,7 @@ size_t GetEncoderLayerWorkspaceSize(encoderParamRun* param) param->encoder.compress_buf = allocator.Malloc(compress_buffer_len * sizeof(T)); param->encoder.normed_from_tensor_buf = (!param->encoder.layernorm_post || param->attn.attn.position_bias) ? allocator.Malloc(normed_from_tensor_len * sizeof(T)) : - -1; + 0; param->encoder.attn_ws_buf = allocator.Malloc(GetAttnWorkspaceSize(&(param->attn))); param->encoder.attn_out_buf = allocator.Malloc(attn_out_len * sizeof(T)); allocator.Free(param->encoder.d_token_num_buf); @@ -47,7 +47,7 @@ size_t GetEncoderLayerWorkspaceSize(encoderParamRun* param) ((!param->encoder.layernorm_post || param->attn.attn.position_bias) || param->ffn_param.ffn_param.ffn_fp16) ? param->ffn_param.ffn_param.ffn_fp16 ? allocator.Malloc(normed_attn_out_len * sizeof(half)) : allocator.Malloc(normed_attn_out_len * sizeof(T)) : - -1; + 0; param->encoder.ffn_ws_buf = param->ffn_param.ffn_param.ffn_fp16 ? allocator.Malloc(ffn_len * sizeof(half)) : allocator.Malloc(ffn_len * sizeof(T)); param->encoder.tmp_out_buf = param->ffn_param.ffn_param.ffn_fp16 ? allocator.Malloc(tmp_out_size * sizeof(half)) : -- Gitee From 08b942d080df2373b20860c58375bf428f881b3c Mon Sep 17 00:00:00 2001 From: batya kroizer Date: Thu, 9 Feb 2023 18:28:53 +0200 Subject: [PATCH 6/8] add encoder+layernorm in ft --- examples/cpp/ms/ms.cc | 4 +-- .../kernels/bert_preprocess_kernels.cu | 1 - .../layers/ms_layers/attention.cc | 1 + .../layers/ms_layers/decoder.cc | 25 ++++++++++++++++++- .../layers/ms_layers/encoder.cc | 17 +++++++++++++ .../layers/ms_layers/param.h | 4 +++ 6 files changed, 48 insertions(+), 4 deletions(-) diff --git a/examples/cpp/ms/ms.cc b/examples/cpp/ms/ms.cc index 1fd9e8d..d5267f3 100644 --- a/examples/cpp/ms/ms.cc +++ b/examples/cpp/ms/ms.cc @@ -263,12 +263,12 @@ static float CompareData(const T* refOutput, int size, const T* msTensorData) static int x = 0; int s = std::min(10, size); if (x == 0) { - for (int j = 0; j < s; j++) { // std::min(50, size) + for (int j = 0; j < std::min(50, size); j++) { std::cout << static_cast(msTensorData[j]) << " "; } std::cout << std::endl; std::cout << "Data of Ref output : "; - for (int j = 0; j < 160; j++) { // std::min(50, size) + for (int j = 0; j < std::min(50, size); j++) { std::cout << static_cast(refOutput[j]) << " "; } std::cout << std::endl; diff --git a/src/fastertransformer/kernels/bert_preprocess_kernels.cu b/src/fastertransformer/kernels/bert_preprocess_kernels.cu index c8284a5..a9b11a5 100644 --- a/src/fastertransformer/kernels/bert_preprocess_kernels.cu +++ b/src/fastertransformer/kernels/bert_preprocess_kernels.cu @@ -142,7 +142,6 @@ template void invokeBuildEncoderAttentionMask(half* attention_mask, const int src_seq_len, const int tgt_seq_len, cudaStream_t stream); - __global__ void getTrtPaddingOffsetKernel(int* trt_mha_padding_offset, const int* sequence_length, const int batch_size) { // use for get tensorrt fused mha padding offset diff --git 
a/src/fastertransformer/layers/ms_layers/attention.cc b/src/fastertransformer/layers/ms_layers/attention.cc index 39ded5a..20acb11 100644 --- a/src/fastertransformer/layers/ms_layers/attention.cc +++ b/src/fastertransformer/layers/ms_layers/attention.cc @@ -387,6 +387,7 @@ void forward_attn(T* inputs[], int in_len, T* output[], int out_len, attentionPa invokeBuildEncoderAttentionMask( attention_mask, param->attn.d_sequence_length2, param->attn.d_sequence_length, param->common_param->batch_size, param->common_param->src_seq_len, param->common_param->tgt_seq_len, param->common_param->stream); } + // printTensor("attention_mask",attention_mask,128*128*6); T* in[] = {q_buf_2, output1, output2, attention_mask, position_bias}; if (param->attn.fmha_type == FmhaType_CutlassFix) { fusedCutlassMhaDispatch dispatch; diff --git a/src/fastertransformer/layers/ms_layers/decoder.cc b/src/fastertransformer/layers/ms_layers/decoder.cc index 215332d..d986e66 100644 --- a/src/fastertransformer/layers/ms_layers/decoder.cc +++ b/src/fastertransformer/layers/ms_layers/decoder.cc @@ -67,6 +67,10 @@ template size_t GetDecoderLayerWorkspaceSize(decoderParamRun* param); template void GetCompressBuffer(T* compress_buffer, T* from_tensor, int *input_ids, int* padding_offset, int* d_sequence_lengths, size_t &h_token_num, size_t* d_token_num, size_t seq_len, decoderParamRun* param) { + // if(seq_len == param->common_param.src_seq_len) + // invokeBuildSequenceLength( + // input_ids, param->common_param.batch_size, d_sequence_lengths, seq_len, param->common_param.hidden_size, param->common_param.stream); + // else invokeBuildSequenceLength( input_ids, param->common_param.batch_size, d_sequence_lengths, seq_len, param->common_param.stream); invokeGetPaddingOffset(&h_token_num, @@ -90,6 +94,7 @@ void GetCompressBuffer(T* compress_buffer, T* from_tensor, int *input_ids, int* template void forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, decoderParamRun* param, void* ws) { + std::cout<<"in_len: "<common_param.in_idx = 0; size_t h_token_num = param->common_param.h_token_num = param->common_param.batch_size * param->common_param.src_seq_len; size_t h_token_num2 = param->common_param.h_token_num2 = param->common_param.batch_size * param->common_param.tgt_seq_len; @@ -124,7 +129,8 @@ void forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, dec inputs[idx_encoder_out] = compress_buffer2; } else { padding_offset2 = nullptr; - } + } + // printTensor("input_ids",input_ids,128*128*6); h_token_num = param->common_param.h_token_num; h_token_num2 = param->common_param.h_token_num2; T* attn_out = reinterpret_cast(static_cast(ws) + param->decoder.attn_out_buf); @@ -165,6 +171,9 @@ void forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, dec param->attn1.attn.padding_offset2 = padding_offset; int tgt_seq_len = param->common_param.tgt_seq_len; param->common_param.tgt_seq_len = param->common_param.src_seq_len; + printTensor("decoder d_sequence_lengths",d_sequence_lengths,6); + printTensor("d_sequence_lengths2",d_sequence_lengths2,6); + forward_attn( reinterpret_cast(&inputs[param->common_param.in_idx]), in_len, &attn_out, 1, &(param->attn1), attn_ws); param->common_param.tgt_seq_len = tgt_seq_len; @@ -266,6 +275,20 @@ void forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, dec } out_buf = tmp_out1; } + if(param->decoder.is_layernorm){ + std::cout<<"is_layernorm\n"; + T* gamma4 = reinterpret_cast(inputs[param->common_param.in_idx++]); + T* beta4 = 
(param->decoder.has_beta) ? reinterpret_cast(inputs[param->common_param.in_idx++]) : nullptr; + invokeGeneralT5LayerNorm(tmp_out1, + tmp_out, + gamma4, + beta4, + h_token_num, + param->common_param.hidden_size, + param->common_param.stream, + param->decoder.eps4); + out_buf = tmp_out1; + } if (padding_offset != nullptr) { cudaMemsetAsync(output[0], 0, diff --git a/src/fastertransformer/layers/ms_layers/encoder.cc b/src/fastertransformer/layers/ms_layers/encoder.cc index 0c05772..61633a7 100644 --- a/src/fastertransformer/layers/ms_layers/encoder.cc +++ b/src/fastertransformer/layers/ms_layers/encoder.cc @@ -138,6 +138,8 @@ void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, enc param->attn.attn.padding_offset = padding_offset; param->attn.attn.d_sequence_length2 = d_sequence_lengths; param->attn.attn.padding_offset2 = padding_offset; + // printTensor("d_sequence_lengths",d_sequence_lengths,6); + forward_attn( reinterpret_cast(&inputs[param->common_param.in_idx]), in_len, &attn_out, 1, &(param->attn), attn_ws); param->common_param.in_idx = param->attn.common_param->in_idx + in_idx; @@ -268,6 +270,20 @@ void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, enc out_buf = tmp_out1; } } + if(param->encoder.is_layernorm){ + std::cout<<"is_layernorm\n"; + T* gamma4 = reinterpret_cast(inputs[param->common_param.in_idx++]); + T* beta4 = (param->encoder.has_beta) ? reinterpret_cast(inputs[param->common_param.in_idx++]) : nullptr; + invokeGeneralT5LayerNorm(tmp_out1, + tmp_out, + gamma4, + beta4, + h_token_num, + param->common_param.hidden_size, + param->common_param.stream, + param->encoder.eps3); + out_buf = tmp_out1; + } if (padding_offset != nullptr) { cudaMemsetAsync(output[0], 0, @@ -276,6 +292,7 @@ void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, enc invokeRebuildPadding( (T*)output[0], out_buf, padding_offset, h_token_num, param->common_param.hidden_size, param->common_param.stream); } + // printTensor("output[0]",(T*)output[0],128*768*6); return; } diff --git a/src/fastertransformer/layers/ms_layers/param.h b/src/fastertransformer/layers/ms_layers/param.h index ff0a3b2..78e5e1d 100644 --- a/src/fastertransformer/layers/ms_layers/param.h +++ b/src/fastertransformer/layers/ms_layers/param.h @@ -79,6 +79,7 @@ typedef struct { float eps1; float eps2; float eps3; + float eps4; bool layernorm_post; bool has_beta; size_t normed_from_tensor_buf; @@ -98,6 +99,7 @@ typedef struct { size_t d_token_num_buf2; size_t padding_offset_buf2; size_t d_sequence_lengths_offset_buf2; + bool is_layernorm; } decoderParam; typedef struct { @@ -111,6 +113,7 @@ typedef struct { typedef struct { float eps1; float eps2; + float eps3; bool layernorm_post; bool has_beta; size_t normed_from_tensor_buf; @@ -124,6 +127,7 @@ typedef struct { size_t padding_offset_buf; size_t d_sequence_lengths_offset_buf; size_t norm_out_buf; + bool is_layernorm; } encoderParam; typedef struct { -- Gitee From 24218968ad2e62541539a609550ebcd3a9f9e5a7 Mon Sep 17 00:00:00 2001 From: batya kroizer Date: Sun, 12 Feb 2023 14:34:24 +0200 Subject: [PATCH 7/8] fix for be --- .../layers/ms_layers/attention.cc | 18 +++++++++++++++--- .../layers/ms_layers/decoder.cc | 3 ++- .../layers/ms_layers/encoder.cc | 2 +- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/src/fastertransformer/layers/ms_layers/attention.cc b/src/fastertransformer/layers/ms_layers/attention.cc index 20acb11..3a870f9 100644 --- a/src/fastertransformer/layers/ms_layers/attention.cc +++ 
b/src/fastertransformer/layers/ms_layers/attention.cc @@ -135,7 +135,7 @@ public: param->common_param->batch_size * param->common_param->head_num, param->common_param->cublas_handle, param->common_param->algo); - if (param->attn.padding_offset == nullptr) { + if (param->attn.padding_offset == nullptr) { invokeTransposeQKV(static_cast(output[0]), static_cast(qkv_buf_2), param->common_param->batch_size, @@ -190,7 +190,7 @@ public: param->attn.output2 = allocator.Malloc(attn_out_size * sizeof(T)); allocator.Free(param->attn.qkv_buf); param->attn.qk_buf = 0; // not in use - param->attn.qkv_buf_2 = 0; // not in use + param->attn.qkv_buf_2 = allocator.Malloc(qkv_buf_2_len * sizeof(T)); param->attn.qkv_buf_3 = allocator.Malloc(qkv_buf_3_len * sizeof(T)); size_t size = 0; typedef typename std::conditional::value, cutlass::half_t, float>::type Type; @@ -207,7 +207,19 @@ public: void runMha(T* inputs[], int in_len, T* output[], int out_len, attentionParamRun* param, void* ws) { typedef typename std::conditional::value, cutlass::half_t, float>::type Type; - forward_fmha(reinterpret_cast(inputs), out_len, reinterpret_cast(output), 1, param, ws); + T* qkv_buf_2 = reinterpret_cast(static_cast(ws) + param->attn.qkv_buf_2); + forward_fmha(reinterpret_cast(inputs), out_len, reinterpret_cast(&qkv_buf_2), 1, param, ws); + if(param->attn.padding_offset != nullptr) + { + invokeRemovePadding((float*)(*output), + (const float*)qkv_buf_2, + param->attn.padding_offset, + param->common_param->h_token_num, + param->common_param->head_num * param->common_param->head_size, + param->common_param->stream); + } else { + *output = qkv_buf_2; + } } }; diff --git a/src/fastertransformer/layers/ms_layers/decoder.cc b/src/fastertransformer/layers/ms_layers/decoder.cc index d986e66..1782ef7 100644 --- a/src/fastertransformer/layers/ms_layers/decoder.cc +++ b/src/fastertransformer/layers/ms_layers/decoder.cc @@ -275,8 +275,9 @@ void forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, dec } out_buf = tmp_out1; } + std::cout<<"param->decoder.is_layernorm : "<decoder.is_layernorm<decoder.is_layernorm){ - std::cout<<"is_layernorm\n"; + std::cout<<"decoder is_layernorm\n"; T* gamma4 = reinterpret_cast(inputs[param->common_param.in_idx++]); T* beta4 = (param->decoder.has_beta) ? reinterpret_cast(inputs[param->common_param.in_idx++]) : nullptr; invokeGeneralT5LayerNorm(tmp_out1, diff --git a/src/fastertransformer/layers/ms_layers/encoder.cc b/src/fastertransformer/layers/ms_layers/encoder.cc index 61633a7..2580abd 100644 --- a/src/fastertransformer/layers/ms_layers/encoder.cc +++ b/src/fastertransformer/layers/ms_layers/encoder.cc @@ -271,7 +271,7 @@ void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, enc } } if(param->encoder.is_layernorm){ - std::cout<<"is_layernorm\n"; + std::cout<<"encoder is_layernorm\n"; T* gamma4 = reinterpret_cast(inputs[param->common_param.in_idx++]); T* beta4 = (param->encoder.has_beta) ? 
reinterpret_cast(inputs[param->common_param.in_idx++]) : nullptr; invokeGeneralT5LayerNorm(tmp_out1, -- Gitee From 67eb629fd41e17962e901d5237744b3be967e6e3 Mon Sep 17 00:00:00 2001 From: batya kroizer Date: Sun, 12 Feb 2023 14:13:20 +0000 Subject: [PATCH 8/8] merge --- .../layers/ms_layers/attention.cc | 44 ++++++----- .../layers/ms_layers/debug_utils.cc | 77 +++++++++++++++++++ .../layers/ms_layers/debug_utils.h | 3 + .../layers/ms_layers/decoder.cc | 74 ++++-------------- .../layers/ms_layers/encoder.cc | 69 +++-------------- .../layers/ms_layers/param.h | 6 -- 6 files changed, 131 insertions(+), 142 deletions(-) diff --git a/src/fastertransformer/layers/ms_layers/attention.cc b/src/fastertransformer/layers/ms_layers/attention.cc index 6f29f24..2a5bdbc 100644 --- a/src/fastertransformer/layers/ms_layers/attention.cc +++ b/src/fastertransformer/layers/ms_layers/attention.cc @@ -69,12 +69,17 @@ void forward_attn(T* inputs[], int in_len, T* output[], int out_len, attentionPa gemm_data_types[1] = CUDA_R_16F; gemm_data_types[2] = CUDA_R_16F; } + printAttnParamRunParam(*param); + for (int i = 0; i < 5; i++) + { + std::cout<<"i: "<attn.padding_offset == nullptr) {std::cout<<"nullptr"<attn.padding_offset != nullptr) {std::cout<<"error"<attn.padding_offset != nullptr) - std::cout<<"param->attn.padding_offset != nullptr\n"; - if(param->attn.padding_offset == nullptr) - std::cout<<"param->attn.padding_offset == nullptr\n"; if (param->attn.is_cross) { gemm_dims[0] = param->common_param->hidden_size; gemm_dims[1] = param->common_param->h_token_num; @@ -100,8 +105,12 @@ void forward_attn(T* inputs[], int in_len, T* output[], int out_len, attentionPa gemm_dims[0] = 2 * param->common_param->hidden_size; gemm_dims[1] = param->common_param->h_token_num2; + gemm_dims[2] = param->common_param->hidden_size; + gemm_lds[0] = 2 * param->common_param->hidden_size; + gemm_lds[1] = param->common_param->hidden_size; gemm_lds[2] = 2 * param->common_param->hidden_size; + T* weight_kv = reinterpret_cast(inputs[param->common_param->in_idx++]); CublasGemmWrapper(weight_kv, encoder_output, @@ -114,8 +123,7 @@ void forward_attn(T* inputs[], int in_len, T* output[], int out_len, attentionPa &beta, param->common_param->cublas_handle, param->common_param->algo); - printTensor("qkv_buf + (param->common_param->h_token_num) * param->common_param->hidden_size",qkv_buf + (param->common_param->h_token_num) * param->common_param->hidden_size,10); - + printTensor("qkv_buf + param->common_param->h_token_num * param->common_param->hidden_size",qkv_buf + param->common_param->h_token_num * param->common_param->hidden_size,10); T* bias_qkv = (param->attn.qkv_bias) ? 
reinterpret_cast(inputs[param->common_param->in_idx++]) : nullptr; if (param->attn.padding_offset == nullptr) { invokeCrossAddFusedQKVBiasTranspose(q_buf_2, @@ -129,11 +137,11 @@ void forward_attn(T* inputs[], int in_len, T* output[], int out_len, attentionPa param->common_param->head_num, param->common_param->head_size, param->common_param->stream); + printTensor("q_buf_2",q_buf_2,10); + printTensor("output1",output1,10); + printTensor("output2",output2,10); } else{ - std::cout<<"param->common_param->h_token_num"<common_param->h_token_num<common_param->tgt_seq_len"<common_param->tgt_seq_len<attn.padding_offset2, param->common_param->stream); } - std::cout<<"param->common_param->h_token_num2: "<common_param->h_token_num2<(inputs[param->common_param->in_idx++]); @@ -168,6 +172,7 @@ void forward_attn(T* inputs[], int in_len, T* output[], int out_len, attentionPa &beta, param->common_param->cublas_handle, param->common_param->algo); + printTensor("qkv_buf",qkv_buf,10); T* bias_qkv = (param->attn.qkv_bias) ? reinterpret_cast(inputs[param->common_param->in_idx++]) : nullptr; if (param->attn.padding_offset == nullptr) { invokeAddFusedQKVBiasTranspose(static_cast(q_buf_2), @@ -181,6 +186,9 @@ void forward_attn(T* inputs[], int in_len, T* output[], int out_len, attentionPa param->common_param->head_size, 0, param->common_param->stream); + printTensor("q_buf_2",q_buf_2,10); + printTensor("output1",output1,10); + printTensor("output2",output2,10); } else { invokeAddFusedZP_QKVBiasTranspose(static_cast(q_buf_2), @@ -226,25 +234,23 @@ void forward_attn(T* inputs[], int in_len, T* output[], int out_len, attentionPa printTensor("qk_buf",qk_buf,10); std::cout<<"param->common_param->tgt_seq_len"<common_param->tgt_seq_len<attn.mask) ? reinterpret_cast(inputs[param->common_param->in_idx++]) : nullptr; - // printTensor("attention_mask",attention_mask,1*128*128); T* position_bias = (param->attn.position_bias) ? reinterpret_cast(inputs[param->common_param->in_idx++]) : nullptr; if (param->attn.padding_offset != nullptr){ invokeBuildEncoderAttentionMask( - attention_mask, (param->attn.is_cross) ? param->attn.d_sequence_length2 : param->attn.d_sequence_length, param->common_param->batch_size, param->common_param->src_seq_len, (param->attn.is_cross) ? param->common_param->tgt_seq_len : param->common_param->src_seq_len, param->common_param->stream); + attention_mask, (param->attn.is_cross) ? param->attn.d_sequence_length2 : param->attn.d_sequence_length, param->common_param->batch_size, param->common_param->src_seq_len, param->common_param->tgt_seq_len, param->common_param->stream); if (position_bias!= nullptr) { invokeBuildEncoderPositionBias(position_bias, position_bias_compress, (param->attn.is_cross) ? param->attn.d_sequence_length2 : param->attn.d_sequence_length, param->common_param->batch_size, param->common_param->src_seq_len, - (param->attn.is_cross) ? param->common_param->tgt_seq_len : param->common_param->src_seq_len, + param->common_param->tgt_seq_len, (param->attn.is_cross) ? 
1 : param->common_param->head_num, param->common_param->stream); - } + } else { + position_bias_compress = position_bias; } - if(param->attn.is_cross) printTensor("position_bias_invokeBuild",position_bias,256*128); - if(param->attn.is_cross) printTensor("mask_invokeBuild",attention_mask,256*128); invokeMixMaskedSoftMax(static_cast(qk_buf), attention_mask, position_bias_compress, diff --git a/src/fastertransformer/layers/ms_layers/debug_utils.cc b/src/fastertransformer/layers/ms_layers/debug_utils.cc index 6e35f0e..4f9e5bd 100644 --- a/src/fastertransformer/layers/ms_layers/debug_utils.cc +++ b/src/fastertransformer/layers/ms_layers/debug_utils.cc @@ -29,6 +29,83 @@ void printTensor(char* str, T* input, int size) free(input_host); } +void printCommonParam(CommonParam param) +{ + std::cout<<"print common Param\n"; + std::cout<<"batch_size = "<(static_cast(ws) + param->decoder.compress_buf); - compress_buffer2 = reinterpret_cast(static_cast(ws) + param->decoder.compress_buf2); + T* compress_buffer = reinterpret_cast(static_cast(ws) + param->decoder.compress_buf); + T* compress_buffer2 = reinterpret_cast(static_cast(ws) + param->decoder.compress_buf2); padding_offset = reinterpret_cast(static_cast(ws) + param->decoder.padding_offset_buf); padding_offset2 = reinterpret_cast(static_cast(ws) + param->decoder.padding_offset_buf2); d_sequence_lengths = reinterpret_cast(static_cast(ws) + param->decoder.d_sequence_lengths_offset_buf); d_sequence_lengths2 = reinterpret_cast(static_cast(ws) + param->decoder.d_sequence_lengths_offset_buf2); - // param->decoder.d_sequence_length = d_sequence_lengths; - // param->decoder.d_sequence_length2 = d_sequence_lengths2; size_t* d_token_num = reinterpret_cast(static_cast(ws) + param->decoder.d_token_num_buf); size_t* d_token_num2 = reinterpret_cast(static_cast(ws) + param->decoder.d_token_num_buf2); -std::cout<<"param->common_param.src_seq_len:"<common_param.src_seq_len<common_param.head_num * param->common_param.head_size"<common_param.head_num * param->common_param.head_size<common_param.src_seq_len, param); - if (h_token_num * 2 <= param->common_param.batch_size * param->common_param.src_seq_len) { - param->common_param.h_token_num = h_token_num; - // param->decoder.padding_offset = padding_offset; - from_tensor = compress_buffer; - } - GetCompressBuffer(compress_buffer2, encoder_output, input_ids2, padding_offset2, d_sequence_lengths2, h_token_num2, d_token_num2, param->common_param.tgt_seq_len, param); - if (h_token_num2 * 2 <= param->common_param.batch_size * param->common_param.tgt_seq_len) { - param->common_param.h_token_num2 = h_token_num2; - // ppadding_offset2 = padding_offset2; - inputs[idx_encoder_out] = compress_buffer2; - } - else{ - padding_offset =nullptr; -padding_offset2=nullptr; - } - - if(padding_offset2 != nullptr) - std::cout<<"param->decoder.padding_offset2 != nullptr\n"; - if(padding_offset2 == nullptr) - std::cout<<"param->decoder.padding_offset2 == nullptr\n"; + // GetCompressBuffer(compress_buffer, from_tensor, input_ids, padding_offset, d_sequence_lengths, h_token_num, d_token_num,param->common_param.src_seq_len, param); + // if (h_token_num * 2 <= param->common_param.batch_size * param->common_param.src_seq_len) { + // param->common_param.h_token_num = h_token_num; + // from_tensor = compress_buffer; + // } else{ + padding_offset = nullptr; + // } + // GetCompressBuffer(compress_buffer2, encoder_output, input_ids2, padding_offset2, d_sequence_lengths2, h_token_num2, d_token_num2, param->common_param.tgt_seq_len, param); + // if 
(h_token_num2 * 2 <= param->common_param.batch_size * param->common_param.tgt_seq_len) { + // param->common_param.h_token_num2 = h_token_num2; + // inputs[idx_encoder_out] = compress_buffer2; + // } else{ + padding_offset2 = nullptr; + // } h_token_num = param->common_param.h_token_num; h_token_num2 = param->common_param.h_token_num2; T* attn_out = reinterpret_cast(static_cast(ws) + param->decoder.attn_out_buf); @@ -167,12 +143,10 @@ padding_offset2=nullptr; T* tmp_out1 = reinterpret_cast(output[0]); T* out_buf = tmp_out; if (padding_offset != nullptr) { - tmp_out1 = compress_buffer2; + tmp_out1 = compress_buffer; } T* gamma1 = reinterpret_cast(inputs[param->common_param.in_idx++]); T* beta1 = (param->decoder.has_beta) ? reinterpret_cast(inputs[param->common_param.in_idx++]) : nullptr; - std::cout<<"h_token_num decoder"<(from_tensor), gamma1, @@ -181,9 +155,6 @@ padding_offset2=nullptr; param->common_param.hidden_size, param->common_param.stream, param->decoder.eps1); -printTensor("encoder_output",encoder_output,10); - printTensor("normed_from_tensor",normed_from_tensor,10); - inputs[--param->common_param.in_idx] = normed_from_tensor; int in_idx = param->common_param.in_idx; // if attention is embedded inside an decoder - fuse the bias to next layer normalization @@ -213,8 +184,6 @@ printTensor("encoder_output",encoder_output,10); param->common_param.hidden_size, param->common_param.stream, param->decoder.eps2); -printTensor("normed_attn_out",normed_attn_out,10); - inputs[--param->common_param.in_idx] = normed_attn_out; in_idx = param->common_param.in_idx; is_projection_bias = param->attn2.attn.projection_bias; @@ -223,12 +192,8 @@ printTensor("normed_attn_out",normed_attn_out,10); param->attn2.attn.padding_offset = padding_offset; param->attn2.attn.d_sequence_length2 = d_sequence_lengths2; param->attn2.attn.padding_offset2 = padding_offset2; - // printTensor("inputs[param->common_param.in_idx]",(T*)(inputs[param->common_param.in_idx+1]),param->common_param.src_seq_len*param->common_param.head_num * param->common_param.head_size); forward_attn( reinterpret_cast(&inputs[param->common_param.in_idx]), in_len, &attn2_out, 1, &(param->attn2), attn2_ws); - // param->decoder.d_sequence_length = param->attn2.attn.d_sequence_length; - // param->decoder.padding_offset = param->attn2.attn.padding_offset; - // h_token_num = h_token_num2; param->attn2.attn.projection_bias = is_projection_bias; param->common_param.in_idx = param->common_param.in_idx + in_idx; T* projection_bias2 = @@ -249,7 +214,6 @@ printTensor("normed_attn_out",normed_attn_out,10); param->decoder.eps3); } else { - invokeGeneralAddBiasResidualT5PreLayerNormCast(attn2_out, reinterpret_cast(normed_attn2_out), attn_out, @@ -262,7 +226,6 @@ printTensor("normed_attn_out",normed_attn_out,10); param->decoder.eps3); } inputs[--param->common_param.in_idx] = normed_attn2_out; - printTensor("attn2_out",attn2_out,10); if (param->ffn_param.ffn_param.ffn_fp16 == false) { forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, &(param->ffn_param), ffn_ws); } @@ -302,8 +265,6 @@ printTensor("normed_attn_out",normed_attn_out,10); } out_buf = tmp_out1; } - printTensor("out_buf",out_buf,10); - if (padding_offset != nullptr) { cudaMemsetAsync(output[0], 0, @@ -312,7 +273,6 @@ printTensor("normed_attn_out",normed_attn_out,10); invokeRebuildPadding( (T*)output[0], out_buf, padding_offset, h_token_num, param->common_param.hidden_size, param->common_param.stream); } - // 
printTensor("output[0]",(T*)output[0],param->common_param.src_seq_len*param->common_param.head_num * param->common_param.head_size); return; } diff --git a/src/fastertransformer/layers/ms_layers/encoder.cc b/src/fastertransformer/layers/ms_layers/encoder.cc index 6a6fb9a..11e50e2 100644 --- a/src/fastertransformer/layers/ms_layers/encoder.cc +++ b/src/fastertransformer/layers/ms_layers/encoder.cc @@ -89,57 +89,24 @@ void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, enc param->common_param.in_idx = 0; size_t h_token_num = param->common_param.batch_size * param->common_param.src_seq_len; param->common_param.h_token_num = h_token_num; - param->encoder.padding_offset = nullptr; - int* padding_offset = nullptr; - int* d_sequence_lengths = nullptr; T* input_tensor = reinterpret_cast(inputs[param->common_param.in_idx++]); T* from_tensor = input_tensor; - T* compress_buffer; int *input_ids = reinterpret_cast(inputs[in_len-1]); - compress_buffer = reinterpret_cast(static_cast(ws) + param->encoder.compress_buf); - padding_offset = reinterpret_cast(static_cast(ws) + param->encoder.padding_offset_buf); - d_sequence_lengths = reinterpret_cast(static_cast(ws) + param->encoder.d_sequence_lengths_offset_buf); - // param->encoder.d_sequence_length = d_sequence_lengths; + T* compress_buffer = reinterpret_cast(static_cast(ws) + param->encoder.compress_buf); + int* padding_offset = reinterpret_cast(static_cast(ws) + param->encoder.padding_offset_buf); + int* d_sequence_lengths = reinterpret_cast(static_cast(ws) + param->encoder.d_sequence_lengths_offset_buf); size_t* d_token_num = reinterpret_cast(static_cast(ws) + param->encoder.d_token_num_buf); param->common_param.eft = false; - // printTensor("from_tensor",(T*)(from_tensor),10); -// std::cout<<"param->common_param.head_num * param->common_param.head_size"<common_param.head_num * param->common_param.head_size<common_param.batch_size, d_sequence_lengths, param->common_param.src_seq_len, param->common_param.stream); - // invokeGetPaddingOffset(&h_token_num, - // d_token_num, - // padding_offset, - // d_sequence_lengths, - // param->common_param.batch_size, - // param->common_param.src_seq_len, - // param->common_param.stream); + // GetCompressBuffer(compress_buffer, from_tensor, input_ids, padding_offset, d_sequence_lengths, h_token_num, d_token_num, param->common_param.src_seq_len, param); // if (h_token_num * 2 <= param->common_param.batch_size * param->common_param.src_seq_len) { - // param->common_param.eft = true; - // invokeRemovePadding(compress_buffer, - // (const T*)from_tensor, - // padding_offset, - // h_token_num, - // param->common_param.head_num * param->common_param.head_size, - // param->common_param.stream); // param->common_param.h_token_num = h_token_num; - // param->encoder.padding_offset = padding_offset; // from_tensor = compress_buffer; - // } - GetCompressBuffer(compress_buffer, from_tensor, input_ids, padding_offset, d_sequence_lengths, h_token_num, d_token_num, param->common_param.src_seq_len, param); - if (h_token_num * 2 <= param->common_param.batch_size * param->common_param.src_seq_len) { - param->common_param.h_token_num = h_token_num; - // param->encoder.padding_offset = padding_offset; - from_tensor = compress_buffer; - } - // std::cout<common_param.src_seq_len*512); - - if(padding_offset != nullptr) - std::cout<<"param->encoder.padding_offset != nullptr\n"; - if(padding_offset == nullptr) - std::cout<<"param->encoder.padding_offset == nullptr\n"; + // } else{ + padding_offset = nullptr; + // } + 
std::cout<<"param->encoder.layernorm_post: "<encoder.layernorm_post<<"param->attn.attn.position_bias: "<attn.attn.position_bias<<" param->attn.attn.projection_bias : "<< param->attn.attn.projection_bias<<"(param->encoder.has_beta)"<encoder.has_beta<< + "(param->ffn_param.ffn_param.ffn_bias)"<ffn_param.ffn_param.ffn_bias<<"param->ffn_param.ffn_param.ffn_fp16"<ffn_param.ffn_param.ffn_fp16<common_param.h_token_num; - std::cout<<"h_token_num: "<(static_cast(ws) + param->encoder.attn_out_buf); T* normed_from_tensor = reinterpret_cast(static_cast(ws) + param->encoder.normed_from_tensor_buf); T* attn_ws = reinterpret_cast(static_cast(ws) + param->encoder.attn_ws_buf); @@ -157,7 +124,6 @@ void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, enc if (param->encoder.layernorm_post == false || param->attn.attn.position_bias) { T* gamma1 = reinterpret_cast(inputs[param->common_param.in_idx++]); T* beta1 = (param->encoder.has_beta) ? reinterpret_cast(inputs[param->common_param.in_idx++]) : nullptr; - // printTensor("from_tensor",(T*)(from_tensor),512*128); invokeGeneralT5LayerNorm(normed_from_tensor, reinterpret_cast(from_tensor), gamma1, @@ -166,14 +132,10 @@ void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, enc param->common_param.hidden_size, param->common_param.stream, param->encoder.eps1); - // std::cout<<"param->encoder.eps1: "<encoder.eps1<common_param.in_idx] = normed_from_tensor; - // if attention is embedded inside an encoder - fuse the bias to next layer normalization bool is_projection_bias = param->attn.attn.projection_bias; param->attn.attn.projection_bias = false; int in_idx = param->common_param.in_idx; @@ -181,8 +143,6 @@ void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, enc param->attn.attn.padding_offset = padding_offset; forward_attn( reinterpret_cast(&inputs[param->common_param.in_idx]), in_len, &attn_out, 1, &(param->attn), attn_ws); - // printTensor("out\n",(T*)(attn_out),10); - param->common_param.in_idx = param->attn.common_param->in_idx + in_idx; param->attn.attn.projection_bias = is_projection_bias; T* projection_bias = @@ -202,8 +162,6 @@ void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, enc param->common_param.hidden_size, param->common_param.stream, param->encoder.eps2); - // printTensor("out\n",(T*)(normed_attn_out),10); - } else { invokeGeneralAddBiasResidualT5PreLayerNormCast(attn_out, @@ -244,9 +202,6 @@ void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, enc param->encoder.eps1); } } - // std::cout<<"param->encoder.eps2: "<encoder.eps2<common_param.in_idx] = normed_attn_out; if (param->ffn_param.ffn_param.ffn_fp16 == false) { forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, ¶m->ffn_param, ffn_ws); @@ -254,7 +209,6 @@ void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, enc else { forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, ¶m->ffn_param, ffn_ws); } - // isNan("tmp_out",(T*)tmp_out,param->common_param.src_seq_len*param->common_param.src_seq_len); T* ffn_bias = (param->ffn_param.ffn_param.ffn_bias) ? 
        reinterpret_cast(inputs[param->common_param.in_idx++]) : nullptr;
     if (param->encoder.layernorm_post == true && !param->attn.attn.position_bias) {
@@ -326,11 +280,6 @@ void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, enc
         invokeRebuildPadding(
             (T*)output[0], out_buf, padding_offset, h_token_num, param->common_param.hidden_size, param->common_param.stream);
     }
-    // std::cout<encoder.padding_offset != nullptr\n";
-    if(padding_offset == nullptr)
-        std::cout<<"param->encoder.padding_offset == nullptr\n";
     return;
 }
diff --git a/src/fastertransformer/layers/ms_layers/param.h b/src/fastertransformer/layers/ms_layers/param.h
index a103db3..b5cd6c4 100644
--- a/src/fastertransformer/layers/ms_layers/param.h
+++ b/src/fastertransformer/layers/ms_layers/param.h
@@ -88,10 +88,6 @@ typedef struct {
     size_t ffn_ws_buf;
     size_t normed_attn_out_buf;
     size_t normed_attn2_out_buf;
-    int* padding_offset;
-    int* d_sequence_length;
-    int* padding_offset2;
-    int* d_sequence_length2;
     size_t compress_buf;
     size_t d_token_num_buf;
     size_t padding_offset_buf;
@@ -121,8 +117,6 @@ typedef struct {
     size_t normed_attn_out_buf;
     size_t ffn_ws_buf;
     size_t tmp_out_buf;
-    int* padding_offset;
-    int* d_sequence_length;
     size_t compress_buf;
     size_t d_token_num_buf;
     size_t padding_offset_buf;
-- 
Gitee
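
The param.h hunk above drops the cached int* padding_offset / d_sequence_length pointers and keeps only the size_t *_buf members, matching how the encoder and decoder forwards now derive every scratch pointer from the single workspace ws at an aligned byte offset (reinterpret_cast over static_cast<char*>(ws) + offset). A minimal sketch of that convention follows; planOffsets, atOffset and VslOffsets are hypothetical names for illustration, not FasterTransformer API.

    #include <cstddef>

    // Hypothetical illustration of the "byte offsets into one workspace" convention.
    // Offsets are planned once (and aligned); pointers are derived on every forward
    // call instead of being cached inside the param struct.
    constexpr std::size_t kAlign = 16;

    constexpr std::size_t alignUp(std::size_t bytes) {
        return (bytes + kAlign - 1) / kAlign * kAlign;
    }

    struct VslOffsets {                 // mirrors the size_t *_buf members in param.h
        std::size_t compress_buf;
        std::size_t padding_offset_buf;
        std::size_t d_sequence_lengths_buf;
        std::size_t d_token_num_buf;
        std::size_t total;
    };

    VslOffsets planOffsets(std::size_t batch, std::size_t seq_len, std::size_t hidden, std::size_t elem_size) {
        VslOffsets o{};
        std::size_t cur = 0;
        o.compress_buf           = cur; cur += alignUp(batch * seq_len * hidden * elem_size);
        o.padding_offset_buf     = cur; cur += alignUp(batch * seq_len * sizeof(int));
        o.d_sequence_lengths_buf = cur; cur += alignUp(batch * sizeof(int));
        o.d_token_num_buf        = cur; cur += alignUp(sizeof(std::size_t));
        o.total = cur;                  // total workspace bytes to request
        return o;
    }

    template <typename T>
    T* atOffset(void* ws, std::size_t byte_offset) {
        // same cast chain as the diff: char* arithmetic, then reinterpret to the element type
        return reinterpret_cast<T*>(static_cast<char*>(ws) + byte_offset);
    }

Keeping offsets instead of raw device pointers means the same param struct works wherever the workspace happens to be allocated, and recomputing the pointers per call costs nothing.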
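
Both forwards gate variable-sequence-length (VSL) compression on the same test, h_token_num * 2 <= batch_size * seq_len, so padding is only stripped when at least half of the positions are padding; otherwise padding_offset stays nullptr and the dense path runs. The host-side sketch below shows the offset convention this code appears to rely on, assuming invokeGetPaddingOffset records, for each kept token, how many padded positions precede it; this is an illustration of the idea, not the CUDA kernel.

    #include <cstddef>
    #include <vector>

    // For sequence lengths {3, 2} with max_seq_len = 4, the kept tokens sit at
    // dense positions {0,1,2, 4,5} and the offsets are {0,0,0, 1,1}: packed token i
    // lives at dense index i + padding_offset[i].
    std::size_t getPaddingOffsetHost(const std::vector<int>& seq_lengths, int max_seq_len,
                                     std::vector<int>& padding_offset) {
        padding_offset.clear();
        int skipped = 0;                       // padded positions seen so far
        for (int len : seq_lengths) {
            for (int i = 0; i < len; ++i)
                padding_offset.push_back(skipped);
            skipped += max_seq_len - len;
        }
        return padding_offset.size();          // h_token_num
    }

    bool useVsl(std::size_t h_token_num, std::size_t batch_size, std::size_t seq_len) {
        return h_token_num * 2 <= batch_size * seq_len;   // same test as the diff
    }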
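
With a non-null padding_offset, the activations are packed from [batch, seq_len, hidden] down to [h_token_num, hidden] before the layer runs, and at the end the output is zero-filled with cudaMemsetAsync and scattered back by invokeRebuildPadding, as in both forward paths above. Below is a simplified CUDA gather/scatter pair under the offset convention from the previous snippet; the real invokeRemovePadding / invokeRebuildPadding kernels may differ in layout and vectorization.

    #include <cuda_runtime.h>

    // Pack: packed[i, :] = dense[i + padding_offset[i], :]
    template <typename T>
    __global__ void removePaddingSketch(T* packed, const T* dense, const int* padding_offset,
                                        int token_num, int hidden) {
        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < token_num * hidden;
             idx += gridDim.x * blockDim.x) {
            const int token = idx / hidden;
            const int dim   = idx % hidden;
            packed[idx] = dense[(token + padding_offset[token]) * hidden + dim];
        }
    }

    // Unpack: dense[i + padding_offset[i], :] = packed[i, :]; padded rows stay zero
    // because the caller memsets the destination first.
    template <typename T>
    __global__ void rebuildPaddingSketch(T* dense, const T* packed, const int* padding_offset,
                                         int token_num, int hidden) {
        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < token_num * hidden;
             idx += gridDim.x * blockDim.x) {
            const int token = idx / hidden;
            const int dim   = idx % hidden;
            dense[(token + padding_offset[token]) * hidden + dim] = packed[idx];
        }
    }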
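
In attn.cc the VSL branch rebuilds the attention mask on the device from the per-batch sequence lengths (d_sequence_length for self-attention, d_sequence_length2 for cross-attention) and now always passes tgt_seq_len as the key dimension. The sketch below shows one plausible [batch, q_len, k_len] mask layout such a call could produce; the layout and semantics are assumptions for illustration, not taken from the invokeBuildEncoderAttentionMask source.

    #include <cuda_runtime.h>

    // One block per batch entry; mask is 1.0 where both the query and the key
    // position fall inside the valid (unpadded) part of the sequence, else 0.0.
    __global__ void buildMaskSketch(float* mask, const int* q_lengths, const int* k_lengths,
                                    int q_len, int k_len) {
        const int b       = blockIdx.x;
        const int valid_q = q_lengths[b];
        const int valid_k = (k_lengths != nullptr) ? k_lengths[b] : valid_q;  // cross vs. self attention
        float* m = mask + (long long)b * q_len * k_len;
        for (int idx = threadIdx.x; idx < q_len * k_len; idx += blockDim.x) {
            const int qi = idx / k_len;
            const int ki = idx % k_len;
            m[idx] = (qi < valid_q && ki < valid_k) ? 1.0f : 0.0f;
        }
    }
    // launched roughly as one block per batch entry, e.g.
    // buildMaskSketch<<<batch_size, 256, 0, stream>>>(mask, d_seq_len, d_seq_len2, src_seq_len, tgt_seq_len);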
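
Both layers follow the T5 pre-layernorm pattern: normalize the input, run attention with its projection bias deferred (projection_bias is flipped to false around forward_attn), then apply bias + residual + the next layer norm in one fused call (invokeGeneralAddBiasResidualT5PreLayerNorm, or the Cast variant when the FFN runs in fp16). T5-style layer norm is normally RMS normalization without mean subtraction; the host-side reference below is written under that assumption and ignores the beta handling and fp16 casting of the real kernels.

    #include <cmath>
    #include <vector>

    // residual <- input + attn_out + bias            (the residual stream)
    // normed   <- rms_norm(residual) * gamma         (input to the next sublayer)
    void addBiasResidualT5PreLayerNormRef(std::vector<float>& residual,   // in: layer input, out: input + attn_out + bias
                                          std::vector<float>& normed,
                                          const std::vector<float>& attn_out,
                                          const std::vector<float>& bias,  // per-hidden-dim projection bias
                                          const std::vector<float>& gamma,
                                          int tokens, int hidden, float eps) {
        for (int t = 0; t < tokens; ++t) {
            float sum_sq = 0.f;
            for (int h = 0; h < hidden; ++h) {
                const float v = residual[t * hidden + h] + attn_out[t * hidden + h] + bias[h];
                residual[t * hidden + h] = v;
                sum_sq += v * v;
            }
            const float inv_rms = 1.f / std::sqrt(sum_sq / hidden + eps);
            for (int h = 0; h < hidden; ++h)
                normed[t * hidden + h] = residual[t * hidden + h] * inv_rms * gamma[h];
        }
    }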