diff --git a/codegen/autograd/aclnn_derivatives.yaml b/codegen/autograd/aclnn_derivatives.yaml
deleted file mode 100644
index 2a2240682b98ec9c594e20b75a402e3a09dc0a37..0000000000000000000000000000000000000000
--- a/codegen/autograd/aclnn_derivatives.yaml
+++ /dev/null
@@ -1,7 +0,0 @@
-# Defines derivative formulas and Python signatures of methods on Variable
-#
-#If you need any guidance, please refer to the comments in derivatives.yaml in PyTorch.
-
-- name: matmul(Tensor self, Tensor other) -> Tensor
-  self, other: matmul_backward(grad, self, other, grad_input_mask)
-
diff --git a/codegen/autograd/gen_autograd.py b/codegen/autograd/gen_autograd.py
index ebcc24e0ea1bbaf582ebef0d1d29c07692f5718d..3a95edcb3a6ceadaa4897de37f0654a048d7ecc1 100644
--- a/codegen/autograd/gen_autograd.py
+++ b/codegen/autograd/gen_autograd.py
@@ -41,7 +41,6 @@ torch_npu/csrc/aten/
 import argparse
 import os
 from typing import List, Sequence
-
 from codegen.api.autograd import (
     match_differentiability_info, NativeFunctionWithDifferentiabilityInfo,
     DifferentiabilityInfo
@@ -57,7 +56,6 @@ from .gen_inplace_or_view_type import gen_inplace_or_view_type
 from .gen_variable_factories import gen_variable_factories
 from .load_derivatives import load_derivatives
 
-
 def gen_autograd(
     native_functions_path: str,
     out: str,
@@ -66,24 +64,11 @@
 ) -> None:
     differentiability_infos = load_derivatives(
         os.path.join(autograd_dir, 'derivatives.yaml'), native_functions_path, npu_native_functions_path)
-
-    aclnn_differentiability_infos = load_derivatives(
-        os.path.join(autograd_dir, 'aclnn_derivatives.yaml'), native_functions_path, npu_native_functions_path)
-
     template_path = os.path.join(autograd_dir, 'templates')
-
     native_funcs = parse_native_and_custom_yaml(native_functions_path, npu_native_functions_path).native_functions
     funcs = filte_out_native_autograd_function(native_funcs, differentiability_infos)
-    aclnn_funcs = filte_out_aclnn_function(native_funcs, aclnn_differentiability_infos)
-
     funcs_with_diff_infos: List[NativeFunctionWithDifferentiabilityInfo] = []
-    aclnn_funcs_with_diff_infos: List[NativeFunctionWithDifferentiabilityInfo] = []
-
     funcs_with_diff_infos = match_differentiability_info(funcs, differentiability_infos)
-    aclnn_funcs_with_diff_infos = match_differentiability_info(aclnn_funcs, aclnn_differentiability_infos)
-
-    differentiability_infos = differentiability_infos + aclnn_differentiability_infos
-    funcs_with_diff_infos.extend(aclnn_funcs_with_diff_infos)
-
     torch_funcs_with_diff_infos: List[NativeFunctionWithDifferentiabilityInfo] = []
     npu_funcs_with_diff_infos: List[NativeFunctionWithDifferentiabilityInfo] = []
     for func in funcs_with_diff_infos:
@@ -109,41 +94,21 @@
     # Generate variable_factories.h
     gen_variable_factories(out, native_functions_path, npu_native_functions_path, template_path)
 
-
 def filte_out_native_autograd_function(
     native_funcs: List[NativeFunction],
     differentiability_infos: Sequence[DifferentiabilityInfo],
 ):
     result: List[NativeFunction] = []
     derivatives_name_list: List[str] = []
-
     for info in differentiability_infos:
         derivatives_name_list.append(str(info.func.func.name))
     for funcs in native_funcs:
         func_name = str(funcs.func.name)
        func_base_name = str(funcs.func.name.name.base)
         if (func_name in derivatives_name_list) or (func_base_name in derivatives_name_list):
-           result.append(funcs)
+            result.append(funcs)
     return result
 
-def filte_out_aclnn_function(
-    native_funcs: List[NativeFunction],
-    aclnn_differentiability_infos: Sequence[DifferentiabilityInfo],
-):
-    result: List[NativeFunction] = []
-    derivatives_name_list: List[str] = []
-
-    for info in aclnn_differentiability_infos:
-        derivatives_name_list.append(str(info.func.func.name))
-    for funcs in native_funcs:
-        func_name = str(funcs.func.name)
-        func_base_name = str(funcs.func.name.name.base)
-        if (func_name in derivatives_name_list) or (func_base_name in derivatives_name_list):
-            if funcs.op_api is True:
-                result.append(funcs)
-    return result
-
-
 def main() -> None:
     parser = argparse.ArgumentParser(
         description='Generate autograd C++ files')
diff --git a/torch_npu/csrc/aten/npu_native_functions.yaml b/torch_npu/csrc/aten/npu_native_functions.yaml
index 4361f40953de7511633558042ac0ebedd18d98b7..524c2d9e5cca9e2fe9e9acc508d493cf08b84341 100644
--- a/torch_npu/csrc/aten/npu_native_functions.yaml
+++ b/torch_npu/csrc/aten/npu_native_functions.yaml
@@ -750,10 +750,6 @@ supported:
     op_api: True
   - func: masked_fill_.Tensor
     op_api: True
-  - func: matmul
-    op_api: False
-  - func: matmul.out
-    op_api: False
   - func: masked_scatter_
     op_api: False
   - func: masked_select
@@ -1359,6 +1355,10 @@ supported:
   - zeros_like
 
 autograd:
+  - func: matmul
+    op_api: True
+  - func: matmul.out
+    op_api: True
   - celu
   - celu_
   - func: elu.out
diff --git a/torch_npu/csrc/aten/ops/op_api/MatmulKernelNpuOpApi.cpp b/torch_npu/csrc/aten/ops/op_api/MatmulKernelNpuOpApi.cpp
index c1263dee14a3eb35a9a51878a223833cd688406f..3d7fd0adb8b0d1c2c634b3b435234395b51e8fbb 100644
--- a/torch_npu/csrc/aten/ops/op_api/MatmulKernelNpuOpApi.cpp
+++ b/torch_npu/csrc/aten/ops/op_api/MatmulKernelNpuOpApi.cpp
@@ -14,13 +14,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include
+
 #include "torch_npu/csrc/aten/NPUNativeOpApiFunctions.h"
-#include "torch_npu/csrc/aten/common/InnerNpuNativeFunction.h"
 #include "torch_npu/csrc/aten/ops/op_api/op_api_common.h"
-#include "torch_npu/csrc/core/npu/register/OptionsManager.h"
-#include "torch_npu/csrc/framework/interface/EnvVariables.h"
 #include "torch_npu/csrc/framework/utils/OpAdapter.h"
-#include
 
 namespace at_npu {
 namespace native {
@@ -28,14 +26,19 @@
 const int8_t ALLOW_FP32_DOWN_PRECISION = 1;
 const int8_t KEEP_DTYPE = 0;
 
+using torch::autograd::AutogradContext;
+using torch::autograd::Function;
+using tensor_list = std::vector<at::Tensor>;
+
 static c10::SmallVector<int64_t, SIZE> get_npu_output_size(const at::Tensor &tensor1,
                                                            const at::Tensor &tensor2) {
   c10::SmallVector<int64_t, SIZE> output_size;
   auto dim_tensor1 = tensor1.dim();
   auto dim_tensor2 = tensor2.dim();
 
-  TORCH_CHECK(dim_tensor1 > 0 && dim_tensor2 > 0, "matmul got error dimentions: ", "(", dim_tensor1,
-              ", ", dim_tensor2, ")");
+  TORCH_CHECK(dim_tensor1 > 0 && dim_tensor2 > 0,
+              "matmul got error dimentions: ", "(", dim_tensor1, ", ",
+              dim_tensor2, ")");
 
   if (dim_tensor1 == 1 && dim_tensor2 == 1) {
     output_size = {};
@@ -84,15 +87,16 @@
 
     output_size = output_expand_size;
   } else {
-    TORCH_CHECK(false, "matmul got error sizes: ", "(", dim_tensor1, ", ", dim_tensor2, ")");
+    TORCH_CHECK(false, "matmul got error sizes: ", "(", dim_tensor1, ", ",
+                dim_tensor2, ")");
   }
 
   return output_size;
 }
 
-static at::Tensor matmul_opt_npu(c10::optional<at::Tensor> out_opt,
-                                 const at::Tensor &tensor1,
-                                 const at::Tensor &tensor2) {
+static at::Tensor matmul_forward_npu(c10::optional<at::Tensor> out_opt,
+                                     const at::Tensor &tensor1,
+                                     const at::Tensor &tensor2) {
   at::NoNamesGuard guard;
   at::Tensor out;
 
@@ -101,7 +105,8 @@
     out = out_opt.value();
     OpPreparation::CheckOut({tensor1, tensor2}, out, tensor1, output_size);
   } else {
-    out = OpPreparation::ApplyTensorWithoutFormat(output_size, tensor1.options());
+    out =
+        OpPreparation::ApplyTensorWithoutFormat(output_size, tensor1.options());
   }
 
   // allow dicrease precision
@@ -110,11 +115,77 @@
   return out;
 }
 
+std::tuple<at::Tensor, at::Tensor> matmul_backward_npu(const at::Tensor &grad,
+                                                       const at::Tensor &self,
+                                                       const at::Tensor &other) {
+  if (!grad.defined()) {
+    return std::make_tuple(at::Tensor(), at::Tensor());
+  }
+
+  auto mat1 = self;
+  auto mat2 = other;
+  auto dim_tensor1 = self.dim();
+  auto dim_tensor2 = other.dim();
+  std::vector<int64_t> new_shape(grad.sizes().begin(), grad.sizes().end());
+  // unsqueeze mat1 & mat2
+  if (dim_tensor1 == 1 && dim_tensor2 == 1) {
+    mat1 = self.view({1, self.size(0)});
+    mat2 = other.view({other.size(0), 1});
+    new_shape = {1, 1};
+  }
+  // unsqueeze mat1
+  if (dim_tensor1 == 1 && dim_tensor2 > 1) {
+    mat1 = self.view({1, self.size(0)});
+    new_shape.insert(new_shape.begin() + new_shape.size() - 1, 1);
+  }
+  // unsqueeze mat2
+  if (dim_tensor1 > 1 && dim_tensor2 == 1) {
+    mat2 = other.view({other.size(0), 1});
+    new_shape.insert(new_shape.end(), 1);
+  }
+
+  // backward input 0
+  auto grad_self = matmul_forward_npu(c10::nullopt, grad.view(new_shape),
+                                      mat2.transpose(-2, -1));
+  // backward input 1
+  auto grad_other = matmul_forward_npu(c10::nullopt, mat1.transpose(-2, -1),
+                                       grad.view(new_shape));
+  if (grad_other.size(-1) == 1 && other.dim() == 1) {
+    grad_other = grad_other.squeeze(-1);
+  }
+
+  return std::make_tuple(grad_self, grad_other);
+}
+
+class NPUMatmulOpApiFunction
+    : public torch::autograd::Function<NPUMatmulOpApiFunction> {
+public:
+  static at::Tensor forward(AutogradContext *ctx, const at::Tensor &self,
+                            const at::Tensor &other) {
+    at::AutoNonVariableTypeMode g;
+    ctx->save_for_backward({self, other});
+    auto result = matmul_forward_npu(c10::nullopt, self, other);
+    return result;
+  }
+
+  static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs) {
+    auto saved = ctx->get_saved_variables();
+    auto self = saved[0];
+    auto other = saved[1];
+
+    auto grads = matmul_backward_npu(grad_outputs[0], self, other);
+    tensor_list output = {std::get<0>(grads), std::get<1>(grads)};
+
+    return output;
+  }
+};
+
 at::Tensor NPUNativeOpApiFunctions::matmul(const at::Tensor &tensor1,
                                            const at::Tensor &tensor2) {
   DO_COMPATIBILITY(aclnnMatmul, NPUNativeFunctions::matmul(tensor1, tensor2));
-  auto maybe_outnames = at::namedinference::compute_matmul_outnames(tensor1, tensor2);
-  auto result = matmul_opt_npu(c10::nullopt, tensor1, tensor2);
+  auto maybe_outnames =
+      at::namedinference::compute_matmul_outnames(tensor1, tensor2);
+  auto result = NPUMatmulOpApiFunction::apply(tensor1, tensor2);
   at::namedinference::propagate_names_if_nonempty(result, maybe_outnames);
   return result;
 }
@@ -122,9 +193,12 @@
 at::Tensor &NPUNativeOpApiFunctions::matmul_out(const at::Tensor &tensor1,
                                                 const at::Tensor &tensor2,
                                                 at::Tensor &result) {
-  DO_COMPATIBILITY(aclnnMatmul, NPUNativeFunctions::matmul_out(tensor1, tensor2, result));
-  auto maybe_outnames = at::namedinference::compute_matmul_outnames(tensor1, tensor2);
-  matmul_opt_npu(c10::optional<at::Tensor>(result), tensor1, tensor2);
+  DO_COMPATIBILITY(aclnnMatmul,
+                   NPUNativeFunctions::matmul_out(tensor1, tensor2, result));
+  auto maybe_outnames =
+      at::namedinference::compute_matmul_outnames(tensor1, tensor2);
+  // matmul_out does not support autograd
+  matmul_forward_npu(c10::optional<at::Tensor>(result), tensor1, tensor2);
   at::namedinference::propagate_names_if_nonempty(result, maybe_outnames);
   return result;
 }
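Note: `NPUMatmulOpApiFunction` above follows the standard `torch::autograd::Function` custom-function pattern (forward saves the operands and runs the kernel, backward recomputes matmul against the transposed operands). The sketch below is only a minimal, CPU-only illustration of that pattern against stock libtorch of the same era as this patch: `SketchMatmulFunction` and the use of plain `at::matmul` are illustrative stand-ins for `NPUMatmulOpApiFunction` and `matmul_forward_npu`, and only inputs with at least two dimensions are handled.

```cpp
// Minimal sketch of the custom autograd-function pattern used above,
// written against stock ATen so it can be built and run on CPU.
// SketchMatmulFunction / at::matmul are stand-ins for the NPU pieces;
// only >= 2-D inputs are handled (no 1-D reshaping).
#include <torch/torch.h>

#include <iostream>
#include <vector>

using torch::autograd::AutogradContext;
using tensor_list = std::vector<at::Tensor>;

struct SketchMatmulFunction : public torch::autograd::Function<SketchMatmulFunction> {
  static at::Tensor forward(AutogradContext *ctx, const at::Tensor &self,
                            const at::Tensor &other) {
    // Same guard as the patch: run the underlying kernel outside autograd
    // tracking and stash the inputs that backward will need.
    at::AutoNonVariableTypeMode g;
    ctx->save_for_backward({self, other});
    return at::matmul(self, other);
  }

  static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs) {
    auto saved = ctx->get_saved_variables();
    const auto &self = saved[0];
    const auto &other = saved[1];
    const auto &grad = grad_outputs[0];
    // d(self @ other)/dself = grad @ other^T,  d(self @ other)/dother = self^T @ grad
    auto grad_self = at::matmul(grad, other.transpose(-2, -1));
    auto grad_other = at::matmul(self.transpose(-2, -1), grad);
    return {grad_self, grad_other};
  }
};

int main() {
  auto a = torch::randn({4, 3}, torch::requires_grad());
  auto b = torch::randn({3, 5}, torch::requires_grad());
  auto out = SketchMatmulFunction::apply(a, b);  // participates in autograd like any op
  out.sum().backward();
  std::cout << a.grad().sizes() << " " << b.grad().sizes() << std::endl;
  return 0;
}
```

Registering the op under `autograd:` in npu_native_functions.yaml (previous file) is what routes `matmul` through this apply() path instead of the plain forward kernel.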
diff --git a/torch_npu/csrc/framework/autograd/FunctionsManual.cpp b/torch_npu/csrc/framework/autograd/FunctionsManual.cpp
index 710b4c88cb667c1a61ead4e94ad7386a228bdb00..579056b2487d4b45e1707988f67ff682872aaa2d 100644
--- a/torch_npu/csrc/framework/autograd/FunctionsManual.cpp
+++ b/torch_npu/csrc/framework/autograd/FunctionsManual.cpp
@@ -44,7 +44,6 @@
 #include
 
 #include "torch_npu/csrc/aten/NPUNativeFunctions.h"
-#include "torch_npu/csrc/aten/NPUNativeOpApiFunctions.h"
 #include "FunctionsManual.h"
 
 // Helper functions for autogenerated code
@@ -128,57 +127,6 @@ Tensor fast_gelu_backward(const Tensor& grad, const Tensor& self) {
   return at_npu::native::NPUNativeFunctions::npu_fast_gelu_backward(grad, self);
 }
 
-std::tuple<Tensor, Tensor> matmul_backward(const Tensor &grad,
-    const Tensor &self, const Tensor &other,
-    std::array<bool, 2> grad_input_mask) {
-  if (!grad.defined()) {
-    return std::make_tuple(at::Tensor(), at::Tensor());
-  }
-
-  at::Tensor grad_self, grad_other;
-  if (!grad_input_mask[0] && !grad_input_mask[1]) {
-    return std::make_tuple(grad_self, grad_other);
-  }
-
-  auto mat1 = self;
-  auto mat2 = other;
-  auto dim_tensor1 = self.dim();
-  auto dim_tensor2 = other.dim();
-  std::vector<int64_t> new_shape(grad.sizes().begin(), grad.sizes().end());
-  // unsqueese mat1 & mat2
-  if (dim_tensor1 == 1 && dim_tensor2 == 1) {
-    mat1 = self.view({1, self.size(0)});
-    mat2 = other.view({other.size(0), 1});
-    new_shape = {1, 1};
-  }
-  // unsqueese mat1
-  if (dim_tensor1 == 1 && dim_tensor2 > 1) {
-    mat1 = self.view({1, self.size(0)});
-    new_shape.insert(new_shape.begin() + new_shape.size() - 1, 1);
-  }
-  // unsqueese mat2
-  if (dim_tensor1 > 1 && dim_tensor2 == 1) {
-    mat2 = other.view({other.size(0), 1});
-    new_shape.insert(new_shape.end(), 1);
-  }
-
-  // backward input 0
-  if (grad_input_mask[0]) {
-    grad_self = at_npu::native::NPUNativeOpApiFunctions::matmul(
-        grad.view(new_shape), mat2.transpose(-2, -1));
-  }
-  // backward input 1
-  if (grad_input_mask[1]) {
-    grad_other = at_npu::native::NPUNativeOpApiFunctions::matmul(
-        mat1.transpose(-2, -1), grad.view(new_shape));
-    if (grad_other.size(-1) == 1 && other.dim() == 1) {
-      grad_other = grad_other.squeeze(-1);
-    }
-  }
-
-  return std::make_tuple(grad_self, grad_other);
-}
-
 } // namespace details
 } // namespace generated
 } // namespace autograd
diff --git a/torch_npu/csrc/framework/autograd/FunctionsManual.h b/torch_npu/csrc/framework/autograd/FunctionsManual.h
index ff3b88f1216553240876194d1ebe2d25fd9125fc..53aa52b22bac8bd964881222143e27cee18d9de9 100644
--- a/torch_npu/csrc/framework/autograd/FunctionsManual.h
+++ b/torch_npu/csrc/framework/autograd/FunctionsManual.h
@@ -59,11 +59,6 @@ std::vector<Tensor> not_implemented_list(const char* name, const char* reason=""
 
 Tensor fast_gelu_backward(const Tensor& grad, const Tensor& self);
 
-std::tuple<Tensor, Tensor> matmul_backward(const Tensor &grad,
-                                           const Tensor &self,
-                                           const Tensor &other,
-                                           std::array<bool, 2> grad_input_mask);
-
 } // namespace details
 } // namespace generated
 } // namespace autograd
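Note: the reshape handling in `matmul_backward_npu` (1-D operands are viewed as matrices, the incoming gradient is viewed with a matching unit dimension inserted or appended, and a trailing unit dimension is squeezed off `grad_other` afterwards) can be cross-checked against stock autograd on CPU. The program below is only a sketch of such a check, assuming plain libtorch: `matmul_backward_ref` is an illustrative name, it uses `at::matmul` rather than the NPU kernels, and it covers just the matrix-vector case.

```cpp
// Cross-checks the backward formulas used by matmul_backward_npu
// (grad_self = grad @ other^T, grad_other = self^T @ grad, with 1-D
// operands temporarily viewed as matrices) against PyTorch's own autograd.
// CPU-only sketch; matmul_backward_ref is an illustrative name.
#include <torch/torch.h>

#include <iostream>
#include <tuple>
#include <vector>

// Reference for the matrix-vector case (self.dim() > 1, other.dim() == 1).
std::tuple<at::Tensor, at::Tensor> matmul_backward_ref(const at::Tensor &grad,
                                                       const at::Tensor &self,
                                                       const at::Tensor &other) {
  // other: (k) -> (k, 1); grad: (..., n) -> (..., n, 1), as in the patch.
  auto mat2 = other.view({other.size(0), 1});
  std::vector<int64_t> new_shape(grad.sizes().begin(), grad.sizes().end());
  new_shape.push_back(1);
  auto grad_mat = grad.view(new_shape);
  auto grad_self = at::matmul(grad_mat, mat2.transpose(-2, -1));
  auto grad_other = at::matmul(self.transpose(-2, -1), grad_mat).squeeze(-1);
  return std::make_tuple(grad_self, grad_other);
}

int main() {
  auto self = torch::randn({4, 3}, torch::requires_grad());
  auto other = torch::randn({3}, torch::requires_grad());

  // Gradients from stock autograd.
  auto out = at::matmul(self, other);  // shape (4)
  out.sum().backward();

  // Gradients from the patch's formulas.
  auto ref = matmul_backward_ref(torch::ones_like(out), self.detach(), other.detach());

  std::cout << "grad_self matches:  "
            << at::allclose(self.grad(), std::get<0>(ref)) << std::endl;
  std::cout << "grad_other matches: "
            << at::allclose(other.grad(), std::get<1>(ref)) << std::endl;
  return 0;
}
```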