From 4784f2ffb35917e94e21b62ed0a6ee2cc67b0606 Mon Sep 17 00:00:00 2001
From: lingchongfeng
Date: Thu, 7 Aug 2025 19:32:30 +0800
Subject: [PATCH] link ppmatmul with canndev: tmp

---
 src/kernels/include/asdops/params/matmul.h    |  3 +-
 .../quant_batch_matmul/quant_batch_matmul.cpp | 36 +++++++++++++++++++
 .../matmul/tiling/matmul_nd_tiling.cpp        | 34 ++++++++++++++++++
 .../kernels/matmul/tiling/matmul_nd_tiling.h  |  2 ++
 src/ops_infer/linear/linear_ops_runner.cpp    | 33 +++++++++++++++++
 src/ops_infer/linear/linear_ops_runner.h      |  1 +
 6 files changed, 108 insertions(+), 1 deletion(-)
 create mode 100644 src/kernels/kernels/matmul/quant_batch_matmul/quant_batch_matmul.cpp

diff --git a/src/kernels/include/asdops/params/matmul.h b/src/kernels/include/asdops/params/matmul.h
index bf93c3f9..be9c1c52 100644
--- a/src/kernels/include/asdops/params/matmul.h
+++ b/src/kernels/include/asdops/params/matmul.h
@@ -23,7 +23,8 @@ struct MatMul {
         MATMUL_DEQUANT,       //
         MATMUL_ACCUM_ATOMIC,  // C += op(A) * op(B)
         MATMUL_WITH_BIAS,     // C = op(A) * op(B) + Bias, where Bias is a vector.
-        MATMUL_EIN_SUM
+        MATMUL_EIN_SUM,
+        MATMUL_DEQUANT_FALLBACK // dequant matmul routed to canndev's QuantBatchMatmulV3 kernel
     };
     enum class QuantMode : uint32_t {
         PER_CHANNEL_SYMM = 0,
diff --git a/src/kernels/kernels/matmul/quant_batch_matmul/quant_batch_matmul.cpp b/src/kernels/kernels/matmul/quant_batch_matmul/quant_batch_matmul.cpp
new file mode 100644
index 00000000..1eea2fe4
--- /dev/null
+++ b/src/kernels/kernels/matmul/quant_batch_matmul/quant_batch_matmul.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2025 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+#include  // FIXME(review): angle-bracket header names were stripped in transfer — restore originals
+#include
+#include
+#include "asdops/params/params.h"
+#include "kernels/matmul/tiling/matmul_nd_tiling.h"
+#include "kernels/matmul/tiling/matmul_nz_tiling.h"
+
+namespace AsdOps {
+class QuantBatchMatmulI8Kernel : public KernelBase { // routes int8 quant batch matmul onto canndev QuantBatchMatmulV3 tiling
+public:
+    explicit QuantBatchMatmulI8Kernel(const std::string &kernelName, const BinHandle *handle) noexcept
+        : KernelBase(kernelName, handle)
+    {
+    }
+
+    bool CanSupport(const LaunchParam &launchParam) const override
+    {
+        return true; // tmp bring-up: accepts everything — TODO(review): add real dtype/shape checks
+    }
+
+    Status InitImpl(const LaunchParam &launchParam) override
+    {
+        return QuantBatchMatmulNdTiling(GetName(), launchParam, kernelInfo_, *GetBinHandle());
+    }
+};
+REG_KERNEL_BASE(QuantBatchMatmulI8Kernel);
+} // namespace AsdOps
diff --git a/src/kernels/kernels/matmul/tiling/matmul_nd_tiling.cpp b/src/kernels/kernels/matmul/tiling/matmul_nd_tiling.cpp
index f7bc35f4..3bf74990 100644
--- a/src/kernels/kernels/matmul/tiling/matmul_nd_tiling.cpp
+++ b/src/kernels/kernels/matmul/tiling/matmul_nd_tiling.cpp
@@ -12,6 +12,7 @@
 #include
 #include
 #include
+#include  // FIXME(review): header name stripped in transfer — restore
 #include "asdops/params/params.h"
 #include "tbe_tiling_runner.h"
 #include "kernels/matmul/tiling/tiling_data.h"
@@ -63,6 +64,39 @@ Status BatchMatMulNdTiling(const std::string &kernelName, const LaunchParam &lau
     return GetTilingFromRunner(kernelInfo, runner, binHandle);
 }
 
+Status QuantBatchMatmulNdTiling(const std::string &kernelName, const LaunchParam &launchParam, KernelInfo &kernelInfo,
+                                const BinHandle &binHandle)
+{
+    // Tiling for MATMUL_DEQUANT_FALLBACK: forwards the six op inputs plus the output desc to canndev QuantBatchMatmulV3.
+    auto opParam = AnyCast<OpParam::MatMul>(launchParam.GetParam()); // template argument restored (stripped in transfer)
+    const TensorDesc &tensorDescA = launchParam.GetInTensor(0).desc;
+    const TensorDesc &tensorDescB = launchParam.GetInTensor(1).desc;
+    const TensorDesc &tensorDescScale = launchParam.GetInTensor(2).desc;
+    const TensorDesc &tensorDescOffset = launchParam.GetInTensor(3).desc;
+    const TensorDesc &tensorDescBias = launchParam.GetInTensor(4).desc;
+    const TensorDesc &tensorDescPertoken = launchParam.GetInTensor(5).desc;
+    const TensorDesc &tensorDescOut = launchParam.GetOutTensor(0).desc;
+
+    auto runner = AsdOpsGeRt::TbeTilingRunner()
+                      .SetName("QuantBatchMatmulV3")
+                      .SetKernelName(kernelName)
+                      .AddInput(tensorDescA.dtype, tensorDescA.format, tensorDescA.dims)
+                      .AddInput(tensorDescB.dtype, tensorDescB.format, tensorDescB.dims)
+                      .AddInput(tensorDescScale.dtype, tensorDescScale.format, tensorDescScale.dims)
+                      .AddInput(tensorDescOffset.dtype, tensorDescOffset.format, tensorDescOffset.dims)
+                      .AddInput(tensorDescBias.dtype, tensorDescBias.format, tensorDescBias.dims)
+                      .AddInput(tensorDescPertoken.dtype, tensorDescPertoken.format, tensorDescPertoken.dims)
+                      .AddInput(tensorDescOut.dtype, tensorDescOut.format, tensorDescOut.dims) // NOTE(review): output desc passed via AddInput — confirm TbeTilingRunner API (AddOutput?)
+                      .AddAttrInt(static_cast<int64_t>(opParam.outDtype)) // cast target restored — TODO(review): confirm original type
+                      .AddAttrBool(opParam.transposeA)
+                      .AddAttrBool(opParam.transposeB)
+                      .AddAttrBool(false); // was AddAttrBool(0) — use a bool literal
+
+    return GetTilingFromRunner(kernelInfo, runner, binHandle);
+
+}
+
+
 Status MatMulNdGemvTiling(const LaunchParam &launchParam, KernelInfo &kernelInfo)
 {
     auto inTensorsBDim = launchParam.GetInTensor(1).desc.dims;
diff --git a/src/kernels/kernels/matmul/tiling/matmul_nd_tiling.h b/src/kernels/kernels/matmul/tiling/matmul_nd_tiling.h
index 0e8c6896..7d43ff6f 100644
--- a/src/kernels/kernels/matmul/tiling/matmul_nd_tiling.h
+++ b/src/kernels/kernels/matmul/tiling/matmul_nd_tiling.h
@@ -21,6 +21,8 @@ Status MatMulNdTiling(const std::string &kernelName, const LaunchParam &launchPa
                       const BinHandle &binHandle);
 Status BatchMatMulNdTiling(const std::string &kernelName, const LaunchParam &launchParam, KernelInfo &kernelInfo,
                            const BinHandle &binHandle);
+Status QuantBatchMatmulNdTiling(const std::string &kernelName, const LaunchParam &launchParam, KernelInfo &kernelInfo,
+                                const BinHandle &binHandle);
 Status MatMulNdGemvTiling(const LaunchParam &launchParam, KernelInfo &kernelInfo);
 
 } // namespace AsdOps
diff --git a/src/ops_infer/linear/linear_ops_runner.cpp b/src/ops_infer/linear/linear_ops_runner.cpp
index dac57202..9fc8f8a5 100644
--- a/src/ops_infer/linear/linear_ops_runner.cpp
+++ b/src/ops_infer/linear/linear_ops_runner.cpp
@@ -18,9 +18,11 @@ static constexpr size_t SIZE_2 = 2;
 static constexpr size_t SIZE_3 = 3;
 static constexpr size_t SIZE_4 = 4;
 static constexpr size_t SIZE_5 = 5;
+static constexpr size_t SIZE_6 = 6;
 static constexpr size_t DIM_2 = 2;
 static constexpr size_t DIM_3 = 3;
 static constexpr int64_t DEFAULT_ALIGN = 16;
+static constexpr int64_t ALIGNMENT_4 = 4;
 static constexpr int64_t INT8_ALIGN = 32;
 static constexpr int64_t MATMUL_TRANSPOSE_THRESHOLD = 65535;
 
@@ -92,10 +94,15 @@ Status LinearOpsRunner::SetupKernelGraph(const OpsTensorPack &opsTensorPack)
     size_t inTensorId = 0;
     Mki::Tensor &xTensor = kernelGraph_.inTensors.at(inTensorId++);
     Mki::Tensor &weightTensor = kernelGraph_.inTensors.at(inTensorId++);
+    const Mki::TensorDesc &weightTensorDesc = weightTensor.desc; // reference, not a copy of the descriptor
     isWeightNz_ = weightTensor.desc.format == Mki::TENSOR_FORMAT_FRACTAL_NZ;
     SetupNeedMergeAxis(xTensor, weightTensor);
     SetupMatmulOriShape(xTensor, weightTensor);
     transdataNzToNdParam_.outCrops = {matmulParam_.oriShape.at(0), matmulParam_.oriShape.at(2)};
+    const int64_t nModResult = weightTensorDesc.dims[DIM_2] % DEFAULT_ALIGN; // TODO(review): dims[DIM_2] presumes >= 3-D weight dims — confirm for 2-D ND weights
+    if (0 < nModResult && nModResult < ALIGNMENT_4) {
+        return SetupKernelGraphQuantBatchMatmul910B(); // fixed: was a member-function pointer, not a call
+    }
     if (matmulParam_.enDequant) {
         if (GetSingleton().Is910B() && param_.quantMode == infer::LinearParam::PER_TOKEN) {
             return SetupKernelGraphMatmulDequantPerToken910B();
@@ -163,6 +170,32 @@ Status LinearOpsRunner::SetupKernelGraphMatmul910B()
     return NO_ERROR;
 }
 
+Status LinearOpsRunner::SetupKernelGraphQuantBatchMatmul910B()
+{
+    ATB_LOG(INFO) << GetLogPrefix() << "LinearOpsRunner::SetupKernelGraphQuantBatchMatmul910B";
+
+    InitKernelGraph(SIZE_6, 1, 0, 1); // NOTE(review): 6 in-tensors declared but only 4 consumed below — confirm layout
+
+    size_t inTensorId = 0;
+    Mki::Tensor &x1Tensor = kernelGraph_.inTensors.at(inTensorId++); // fixed: index was never advanced, all refs aliased tensor 0
+    Mki::Tensor &x2Tensor = kernelGraph_.inTensors.at(inTensorId++);
+    Mki::Tensor &scaleTensor = kernelGraph_.inTensors.at(inTensorId++);
+    Mki::Tensor &offsetTensor = nullTensor_; // optional input not supplied — bound to the shared null tensor
+    Mki::Tensor &biasTensor = kernelGraph_.inTensors.at(inTensorId++);
+    Mki::Tensor &pertokenTensor = nullTensor_; // optional input not supplied
+
+    Mki::Tensor &outTensor = kernelGraph_.outTensors.at(0);
+
+    KernelGraphNode &matmulNode = kernelGraph_.nodes.at(0);
+
+    matmulParam_.matmulType = AsdOps::OpParam::MatMul::MatMulType::MATMUL_DEQUANT_FALLBACK;
+    matmulNode.opDesc = {0, "MatMulOperation", matmulParam_};
+    matmulNode.inTensors = {&x1Tensor, &x2Tensor, &scaleTensor, &offsetTensor, &biasTensor, &pertokenTensor};
+    matmulNode.outTensors = {&outTensor};
+
+    return NO_ERROR;
+}
+
 Status LinearOpsRunner::SetupKernelGraphMatmulWeightNdNot910B()
 {
     ATB_LOG(INFO) << GetLogPrefix() << "LinearOpsRunner::SetupKernelGraphMatmulWeightNdNotA2";
diff --git a/src/ops_infer/linear/linear_ops_runner.h b/src/ops_infer/linear/linear_ops_runner.h
index 2e64dadd..ce3323f8 100644
--- a/src/ops_infer/linear/linear_ops_runner.h
+++ b/src/ops_infer/linear/linear_ops_runner.h
@@ -23,6 +23,7 @@ protected:
     Status SetupKernelGraph(const OpsTensorPack &opsTensorPack) override;
 
 private:
+    Status SetupKernelGraphQuantBatchMatmul910B();
     Status SetupKernelGraphMatmul910B();
     Status SetupKernelGraphMatmulWeightNdNot910B();
     Status SetupKernelGraphMatmulWeightNzNot910B();
-- 
Gitee