From 7b7d8503bb2a71c772b0c0fff6776e8c0cf87bc7 Mon Sep 17 00:00:00 2001 From: chenyian Date: Wed, 30 Jul 2025 05:37:13 -0400 Subject: [PATCH 1/4] add fusedmatmul with kdnn --- tensorflow/core/kernels/matmul_op_fused.cc | 39 ++++++++++++- third_party/KDNN/kdnn_adapter.h | 67 +++++++++++++++++++++- 2 files changed, 104 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/kernels/matmul_op_fused.cc b/tensorflow/core/kernels/matmul_op_fused.cc index 7b9d82718..97bfde415 100644 --- a/tensorflow/core/kernels/matmul_op_fused.cc +++ b/tensorflow/core/kernels/matmul_op_fused.cc @@ -167,7 +167,44 @@ class FusedMatMulOp : public OpKernel { f(ctx->eigen_device(), out->flat()); return; } - +#if defined(__aarch64) + bool kdnn_enable =true; + char *kdnn_env = std::getenv("KDNN_ENABLE"); + char &kdnn_eigen_fused = std::getenv("KDNN_EIGEN_FUSED"); + if (kdnn_env && std::string(kdnn_env) == "off"){ + kdnn_enable = false; + } + if(kdnn_enable && std::is_same::value && !transpose_a_ && !transpose_b_){ + if (kdnn_eigen_fused){ + kdnnGemm(ctx, a, b,out,transpose_a_,transpose_b_); + const int m =out->dim_size(0); + const int n =out->dim_size(1); + const Tensor& bias = ctx->input(2); + float* out_data = out->flat().data(); + const float* bias_data = bias.flat().data(); + Eigen::TensorMap> C(out_data,m,n); + Eigen::TensorMap> bias_vec(bias_data,n); + auto bias_broadcasted = bias_vec.reshape(Eigen::array{1,n}) + .broadcast(Eigen::array{m,1}); + switch (fused_computation_){ + case FusedComputationType::kBiasAdd: + C.device(ctx->eigen_cpu_device()) = (C + bias_broadcasted); + break; + case FusedComputationType::kBiasAddWithRelu: + C.device(ctx->eigen_cpu_device()) = (C + bias_broadcasted).cwiseMax(0.0f); + break; + case FusedComputationType::kUndefined: + OP_REQUIRES_OK(ctx,errors::Internal("Fusion Type is undefined")); + break; + default: + OP_REQUIRES_OK(ctx,errors::Internal("Fusion Type is not supported")); + } + }else{ + kdnnFusedGemm(ctx, a, b,out,transpose_a_,transpose_b_,int(fused_computation_)); + } + return; + } +#endif auto launch = LaunchFusedMatMulOp(); launch(ctx, a, b, dim_pair, fused_computation_, fused_computation_args_, out); diff --git a/third_party/KDNN/kdnn_adapter.h b/third_party/KDNN/kdnn_adapter.h index 6a5adad9e..85403e151 100644 --- a/third_party/KDNN/kdnn_adapter.h +++ b/third_party/KDNN/kdnn_adapter.h @@ -3,7 +3,15 @@ namespace tensorflow { -inline void kdnnGemm(OpKernelContext* ctx, const Tensor& a, const Tensor& b, Tensor* out, +typedef enum +{ + FUSED_TYPE_UNDEFINED = 0, + FUSED_TYPE_BIAS =1 , + FUSED_TYPE_BIASRELU =2 +} kdnnFusedType; + +template +inline void kdnnGemm(ContextType* ctx, const Tensor& a, const Tensor& b, Tensor* out, bool trans_a_, bool trans_b) { int m = a.dim_size(0); int n = b.dim_size(1); @@ -24,4 +32,61 @@ inline void kdnnGemm(OpKernelContext* ctx, const Tensor& a, const Tensor& b, Ten gemm.Run(A, B, C); } +template +inline void kdnnFusedGemm(ContextType* ctx, const Tensor& a, const Tensor& b, Tensor* out, + bool trans_a_, bool trans_b, int fused_type) { + const Tensor& bias = ctx->input(2); + const auto data_ptr = [](const Tensor& tensor) -> const float*{ + return reinterpret_cast(tensor.tensor_data().data()); + }; + auto bias_data = data_ptr(bias); + + int m = a.dim_size(0); + int n = b.dim_size(1); + int k = a.dim_size(1); + const float *A = a.flat().data(); + const float *B = b.flat().data(); + float *C = out->flat().data(); + // intra_op thread_pool + thread::ThreadPool* thread_pool = + ctx->device() + ->tensorflow_cpu_worker_threads() + ->workers; + auto relu = [](float x) {return std::max(0.0f,x);}; + auto identity = [](float x) {return x;}; + auto matmul_bias_activation = [&](auto activation) { + int rows = m; + int cols = n; + kdnn::KDNNThreadPool eigen_tp(thread_pool); + const KDNN::TensorInfo srcInfo = {{m, k}, KDNN::Element::TypeT::F32, KDNN::Layout::AB}; + const KDNN::TensorInfo weightsInfo = {{k, n}, KDNN::Element::TypeT::F32, KDNN::Layout::AB}; + const KDNN::TensorInfo dstInfo = {{m, n}, KDNN::Element::TypeT::F32, KDNN::Layout::AB}; + KDNN::Gemm gemm(srcInfo, weightsInfo, dstInfo, &eigen_tp); + gemm.Run(A, B, C); + int cost_per_unit = 2*cols; + thread_pool->ParalleFor(rows,cost_per_unit,[&](int start_row,int end_row){ + for (int i=start_row; i < end_row; i++){ + for (int j=0; j < cols; j++){ + float val = C[i * cols + j] + bias_data[j]; + C[i *cols + j] = activation(val); + } + } + }); + + }; + switch (fused_type) { + case kdnnFusedType::FUSED_TYPE_BIAS: + matmul_bias_activation(identity); + break; + case kdnnFusedType::FUSED_TYPE_BIASRELU: + matmul_bias_activation(relu); + break; + case kdnnFusedType::FUSED_TYPE_UNDEFINED: + std::invalid_argument("Fusion type is undefined"); + break; + default: + std::invalid_argument("Fusion type is not supported"); + } +} + }// namespace tensorflow \ No newline at end of file -- Gitee From f0225acbbdac0ecdbebac89fcf952a3d8ffcc56b Mon Sep 17 00:00:00 2001 From: chenyian Date: Thu, 31 Jul 2025 03:34:20 -0400 Subject: [PATCH 2/4] align with matmul_op --- tensorflow/core/kernels/matmul_op_fused.cc | 6 +++++- third_party/KDNN/kdnn_adapter.h | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/kernels/matmul_op_fused.cc b/tensorflow/core/kernels/matmul_op_fused.cc index 97bfde415..9d894d68a 100644 --- a/tensorflow/core/kernels/matmul_op_fused.cc +++ b/tensorflow/core/kernels/matmul_op_fused.cc @@ -44,6 +44,10 @@ limitations under the License. #include "tensorflow/core/kernels/eigen_contraction_kernel.h" #endif +#if defined(ENABLE_KDNN) +#include "kdnn_adapter.h" +#endif + namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; @@ -167,7 +171,7 @@ class FusedMatMulOp : public OpKernel { f(ctx->eigen_device(), out->flat()); return; } -#if defined(__aarch64) +#if defined(ENABLE_KDNN) bool kdnn_enable =true; char *kdnn_env = std::getenv("KDNN_ENABLE"); char &kdnn_eigen_fused = std::getenv("KDNN_EIGEN_FUSED"); diff --git a/third_party/KDNN/kdnn_adapter.h b/third_party/KDNN/kdnn_adapter.h index 85403e151..32c09b507 100644 --- a/third_party/KDNN/kdnn_adapter.h +++ b/third_party/KDNN/kdnn_adapter.h @@ -64,7 +64,7 @@ inline void kdnnFusedGemm(ContextType* ctx, const Tensor& a, const Tensor& b, Te KDNN::Gemm gemm(srcInfo, weightsInfo, dstInfo, &eigen_tp); gemm.Run(A, B, C); int cost_per_unit = 2*cols; - thread_pool->ParalleFor(rows,cost_per_unit,[&](int start_row,int end_row){ + thread_pool->ParallelFor(rows,cost_per_unit,[&](int start_row,int end_row){ for (int i=start_row; i < end_row; i++){ for (int j=0; j < cols; j++){ float val = C[i * cols + j] + bias_data[j]; -- Gitee From bbcf2dc2dfaeb0ca9db6aeaaf09df5fa1809fece Mon Sep 17 00:00:00 2001 From: chenyian Date: Thu, 31 Jul 2025 22:46:07 -0400 Subject: [PATCH 3/4] align with KDNN_ENABLE to use KDNN_EIGEN_FUSED --- tensorflow/core/kernels/matmul_op_fused.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/kernels/matmul_op_fused.cc b/tensorflow/core/kernels/matmul_op_fused.cc index 9d894d68a..33465fb0e 100644 --- a/tensorflow/core/kernels/matmul_op_fused.cc +++ b/tensorflow/core/kernels/matmul_op_fused.cc @@ -174,12 +174,16 @@ class FusedMatMulOp : public OpKernel { #if defined(ENABLE_KDNN) bool kdnn_enable =true; char *kdnn_env = std::getenv("KDNN_ENABLE"); - char &kdnn_eigen_fused = std::getenv("KDNN_EIGEN_FUSED"); if (kdnn_env && std::string(kdnn_env) == "off"){ kdnn_enable = false; } if(kdnn_enable && std::is_same::value && !transpose_a_ && !transpose_b_){ - if (kdnn_eigen_fused){ + bool kdnn_eigen_enable =false; + char *kdnn_eigen_fused = std::getenv("KDNN_EIGEN_FUSED"); + if (kdnn_eigen_fused && std::string(kdnn_env) == "on"){ + kdnn_eigen_enable = true; + } + if (kdnn_eigen_enable){ kdnnGemm(ctx, a, b,out,transpose_a_,transpose_b_); const int m =out->dim_size(0); const int n =out->dim_size(1); -- Gitee From 033388c9f88540faa178f77c791555d967333de9 Mon Sep 17 00:00:00 2001 From: chenyian Date: Thu, 31 Jul 2025 22:50:41 -0400 Subject: [PATCH 4/4] fix typos --- tensorflow/core/kernels/matmul_op_fused.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/kernels/matmul_op_fused.cc b/tensorflow/core/kernels/matmul_op_fused.cc index 33465fb0e..2a63901cc 100644 --- a/tensorflow/core/kernels/matmul_op_fused.cc +++ b/tensorflow/core/kernels/matmul_op_fused.cc @@ -179,8 +179,8 @@ class FusedMatMulOp : public OpKernel { } if(kdnn_enable && std::is_same::value && !transpose_a_ && !transpose_b_){ bool kdnn_eigen_enable =false; - char *kdnn_eigen_fused = std::getenv("KDNN_EIGEN_FUSED"); - if (kdnn_eigen_fused && std::string(kdnn_env) == "on"){ + char *kdnn_eigen_env = std::getenv("KDNN_EIGEN_FUSED"); + if (kdnn_eigen_env && std::string(kdnn_eigen_env) == "on"){ kdnn_eigen_enable = true; } if (kdnn_eigen_enable){ -- Gitee