diff --git a/impl/matmul/matmul_impl.h b/impl/matmul/matmul_impl.h
index 78db7293716c941633750d64aaca0064fbb10f64..780de9f7d9ee657427e0a41c7175905a52ab81f3 100644
--- a/impl/matmul/matmul_impl.h
+++ b/impl/matmul/matmul_impl.h
@@ -10127,10 +10127,11 @@ template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const Matmu
 __aicore__ inline void MatmulImpl<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::CopyCo22GMNZ2NDOnTheFly(
     const GlobalTensor<DstT>& gmC, const LocalTensor<DstT>& src, bool enSequentialWrite)
 {
+    uint32_t dimN = (Kc_ != 0) ? Kc_ : N_;
     const int blockCount = sizeof(DstT) == B32_BYTE_SIZE ? BLOCK_CUBE : ONE_BLK_SIZE / sizeof(DstT);
     const int oneBlockCount = ONE_BLK_SIZE / sizeof(DstT);
     int calcWidth = var.baseUseN_ / blockCount;
-    int dstOffset = var.curM_ * var.tiling_->baseM * N_ + var.curN_ * var.tiling_->baseN;
+    int dstOffset = var.curM_ * var.tiling_->baseM * dimN + var.curN_ * var.tiling_->baseN;
     int blockLen = blockCount * sizeof(DstT) / ONE_BLK_SIZE;
     int srcRepeatGap = (var.blockUseM_ * BLOCK_CUBE * blockCount - blockCount) * sizeof(DstT) / ONE_BLK_SIZE;
     int tail = var.baseUseN_ % blockCount;
@@ -10142,7 +10143,7 @@ __aicore__ inline void MatmulImpl<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_
     }
     trans.SetSize(blockCount);
 
-    int offset = N_;
+    int offset = dimN;
     if (enSequentialWrite) {
         dstOffset = 0;
         offset = var.baseUseN_;
@@ -10406,6 +10407,7 @@ template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const Matmu
 __aicore__ inline void MatmulImpl<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::CopyCo22GMNZ2ND(
     const GlobalTensor<DstT>& gmC, LocalTensor<DstT>& src, bool enSequentialWrite)
 {
+    uint32_t dimN = (Kc_ != 0) ? Kc_ : N_;
     const int blockCount = sizeof(DstT) == B32_BYTE_SIZE ? BLOCK_CUBE : ONE_BLK_SIZE / sizeof(DstT);
     int width = var.blockUseN_ * blockCount;
     if constexpr (IsSameType<DstT, int8_t>::value || IsSameType<DstT, uint8_t>::value) {
@@ -10427,15 +10429,15 @@ __aicore__ inline void MatmulImpl<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_
     }
     trans.SetSize(transSize);
     bool isTragetAligned = (originalWidth % blockCount) == 0;
-    bool isGmAligned = ((N_ % blockCount) == 0 && (var.singleCoreN_ % blockCount) == 0);
+    bool isGmAligned = ((dimN % blockCount) == 0 && (var.singleCoreN_ % blockCount) == 0);
     if constexpr (C_TYPE::format == CubeFormat::ND_ALIGN) {
         isGmAligned = 1;
     }
-    ASCENDC_ASSERT((N_ >= width),
-                   { KERNEL_LOG(KERNEL_ERROR, "N_ is %d, width is %d, N_ should be no less than width", N_, width); });
-    int dstStride = (N_ - width) * sizeof(DstT) / ONE_BLK_SIZE;
-    int dstOffset = var.curM_ * var.tiling_->baseM * N_ + var.curN_ * var.tiling_->baseN;
-    int offset = N_;
+    ASCENDC_ASSERT((dimN >= width),
+                   { KERNEL_LOG(KERNEL_ERROR, "dimN is %d, width is %d, dimN should be no less than width", dimN, width); });
+    int dstStride = (dimN - width) * sizeof(DstT) / ONE_BLK_SIZE;
+    int dstOffset = var.curM_ * var.tiling_->baseM * dimN + var.curN_ * var.tiling_->baseN;
+    int offset = dimN;
     if (enSequentialWrite) {
         isGmAligned = (var.baseUseN_ % blockCount) == 0;
         dstStride = 0;
@@ -10451,7 +10453,7 @@ __aicore__ inline void MatmulImpl<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_
                 isOdd = true;
             }
         }
-        bool isSingleCore = M_ <= var.singleCoreM_ && N_ <= var.singleCoreN_;
+        bool isSingleCore = M_ <= var.singleCoreM_ && dimN <= var.singleCoreN_;
         bool isMutiCoreNeedPad = !isSingleCore && !isComputeLineByLine;
         if (!isTragetAligned && (isSingleCore || isMutiCoreNeedPad) && !isOdd) {
             int32_t alignedSize = BLOCK_CUBE;
@@ -10494,7 +10496,7 @@ __aicore__ inline void MatmulImpl<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_
         blocklen = Ceil(blocklen, 2);
     }
     if (isComputeLineByLine) {
-        bool needPipe = N_ < BLOCK_CUBE;
+        bool needPipe = dimN < BLOCK_CUBE;
         if constexpr (IsSameType<SrcT, int8_t>::value) {
             CopyToGMForNotAligned(gmC, trans, blocklen, enSequentialWrite, isTragetAligned);
         } else {
@@ -10522,9 +10524,10 @@ template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const Matmu
 __aicore__ inline void MatmulImpl<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::CopyCo22UBNZ2ND(
     const LocalTensor<DstT>& dst, const LocalTensor<DstT>& src, bool enSequentialWrite)
 {
+    uint32_t dimN = (Kc_ != 0) ? Kc_ : N_;
     const int blockCount = sizeof(DstT) == B32_BYTE_SIZE ? BLOCK_CUBE : ONE_BLK_SIZE / sizeof(DstT);
-    int dstOffset = var.curM_ * var.tiling_->baseM * N_ + var.curN_ * var.tiling_->baseN;
-    int offset = Ceil(N_, blockCount) * blockCount;
+    int dstOffset = var.curM_ * var.tiling_->baseM * dimN + var.curN_ * var.tiling_->baseN;
+    int offset = Ceil(dimN, blockCount) * blockCount;
     if (enSequentialWrite) {
         dstOffset = 0;
         offset = var.tiling_->baseN;
@@ -10924,8 +10927,8 @@ __aicore__ inline MatmulImpl<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, M
     int32_t alignN;
     int32_t alignK;
     if (var.isTransposeB_) {
-        alignN = Ceil(var.singleCoreN_, c0Size_) * c0Size_;
-        alignK = Ceil(var.singleCoreK_, BLOCK_CUBE) * BLOCK_CUBE;
+        alignN = Ceil(var.singleCoreN_, BLOCK_CUBE) * BLOCK_CUBE;
+        alignK = Ceil(var.singleCoreK_, c0Size_) * c0Size_;
     } else {
         alignN = Ceil(var.singleCoreN_, c0Size_) * c0Size_;
         alignK = Ceil(var.singleCoreK_, BLOCK_CUBE) * BLOCK_CUBE;
diff --git a/impl/matmul/matmul_tiling_algorithm.cpp b/impl/matmul/matmul_tiling_algorithm.cpp
index 65b0e5e33102ee07b7843f9288b645a11a20c933..f0ffbd45933ca7c2916c80619111e7e04543fc1f 100644
--- a/impl/matmul/matmul_tiling_algorithm.cpp
+++ b/impl/matmul/matmul_tiling_algorithm.cpp
@@ -1529,7 +1529,9 @@ void MatmulTilingAlgorithm::CalcLoadSize(const DimFactor& blockDims, const CoreS
     constexpr int32_t minKSize = 64;
     constexpr int32_t minTotalSize = 128;
     const int32_t n0 = min(minMNSize, coreStatus.n); // need check m,n > 16 or m,n<16
-    const int32_t m0 = min(minMNSize, min(coreStatus.m, minTotalSize / n0));
-    const int32_t k0 = min(min(minKSize / m0, minKSize / n0), coreStatus.k);
+    // n0 can be 0 (e.g. coreStatus.n is 0): skip the division and let m0 collapse to 0.
+    const int32_t m0 = min(minMNSize, ((n0 == 0) ? 0 : min(coreStatus.m, minTotalSize / n0)));
+    // m0 and n0 are both divisors below and either may be 0 here; yield k0 = 0 instead of dividing by zero.
+    const int32_t k0 = ((m0 == 0) || (n0 == 0)) ? 0 : min(min(minKSize / m0, minKSize / n0), coreStatus.k);
     const int32_t dbBuffer = 2;
 
diff --git a/version.info b/version.info
index c1fd1fdd851ff93873b70743661fd3accf13efdf..41cbf7525f4d5a79085f5431fe26e3c83b0f58f6 100644
--- a/version.info
+++ b/version.info
@@ -1 +1 @@
-Version=7.5.T6.0
+Version=7.5.T11.0