From 0b2916cac3e1a7f1ce1ccab7f4fa2315358be0d2 Mon Sep 17 00:00:00 2001
From: suyafeng s00639171
Date: Mon, 4 Sep 2023 12:56:45 +0800
Subject: [PATCH] [feature]matmul backward auto binding

---
 codegen/autograd/aclnn_derivatives.yaml       |  2 +
 torch_npu/csrc/aten/npu_native_functions.yaml | 12 +--
 torch_npu/csrc/aten/ops/MatmulKernelNpu.cpp   |  8 ++
 .../aten/ops/op_api/MatmulKernelNpuOpApi.cpp  | 78 +++++++++----------
 .../framework/autograd/FunctionsManual.cpp    | 10 ++-
 .../csrc/framework/autograd/FunctionsManual.h |  4 +
 6 files changed, 69 insertions(+), 45 deletions(-)

diff --git a/codegen/autograd/aclnn_derivatives.yaml b/codegen/autograd/aclnn_derivatives.yaml
index 89e061624f2..d2a3c3c11d8 100644
--- a/codegen/autograd/aclnn_derivatives.yaml
+++ b/codegen/autograd/aclnn_derivatives.yaml
@@ -1,3 +1,5 @@
 # Defines derivative formulas and Python signatures of methods on Variable
 #
 #If you need any guidance, please refer to the comments in derivatives.yaml in PyTorch.
+- name: matmul(Tensor self, Tensor other) -> Tensor
+  self, other: matmul_backward(grad, self, other, grad_input_mask)
diff --git a/torch_npu/csrc/aten/npu_native_functions.yaml b/torch_npu/csrc/aten/npu_native_functions.yaml
index cd502a07f77..0d76bfdb800 100644
--- a/torch_npu/csrc/aten/npu_native_functions.yaml
+++ b/torch_npu/csrc/aten/npu_native_functions.yaml
@@ -753,7 +753,7 @@ supported:
     op_api: True
   - is_pinned
   - func: is_set_to
-    op_api: False
+    op_api: False
   - func: isclose
     op_api: True
   - func: isfinite
@@ -1672,12 +1672,12 @@ supported:
     op_api: False
  - func: silu_backward
    op_api: False
+  - func: matmul
+    op_api: True
+  - func: matmul.out
+    op_api: True
 
 autograd:
-  - func: matmul
-    op_api: False
-  - func: matmul.out
-    op_api: False
   - celu
   - celu_
   - func: elu.out
@@ -1977,6 +1977,8 @@ unsupported:
   - special_zeta.other_scalar_out
 
 custom:
+  - func: matmul_backward(Tensor grad, Tensor self, Tensor other, bool[2] grad_input_mask) -> (Tensor, Tensor)
+    op_api: True
   - func: _npu_storage_resize(Tensor self, int size) -> Tensor
   - func: npu_change_data_ptr(Tensor dst, Tensor src, int index) -> int
   - func: npu_transpose(Tensor self, int[] perm, bool require_contiguous=True) -> Tensor
diff --git a/torch_npu/csrc/aten/ops/MatmulKernelNpu.cpp b/torch_npu/csrc/aten/ops/MatmulKernelNpu.cpp
index 59b1c376e59..9e878ecdf83 100644
--- a/torch_npu/csrc/aten/ops/MatmulKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/MatmulKernelNpu.cpp
@@ -155,5 +155,13 @@ at::Tensor& NPUNativeFunctions::matmul_out(const at::Tensor & tensor1, const at:
   at::namedinference::propagate_names_if_nonempty(result, maybe_outnames);
   return result;
 }
+
+std::tuple<at::Tensor, at::Tensor> NPUNativeFunctions::matmul_backward(
+    const at::Tensor &grad, const at::Tensor &self, const at::Tensor &other,
+    std::array<bool, 2> grad_input_mask) {
+  AT_ERROR("!!!!!Caution temp codes!!!!!!!!!!! ");
+  return std::make_tuple(at::Tensor(), at::Tensor());
+}
+
 } // namespace native
 } // namespace at_npu
diff --git a/torch_npu/csrc/aten/ops/op_api/MatmulKernelNpuOpApi.cpp b/torch_npu/csrc/aten/ops/op_api/MatmulKernelNpuOpApi.cpp
index 97cb5376808..42b01e13a5e 100644
--- a/torch_npu/csrc/aten/ops/op_api/MatmulKernelNpuOpApi.cpp
+++ b/torch_npu/csrc/aten/ops/op_api/MatmulKernelNpuOpApi.cpp
@@ -14,10 +14,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include
 #include "torch_npu/csrc/aten/NPUNativeOpApiFunctions.h"
 #include "torch_npu/csrc/aten/ops/op_api/op_api_common.h"
 #include "torch_npu/csrc/framework/utils/OpAdapter.h"
+#include
 
 namespace at_npu {
 namespace native {
@@ -29,14 +29,15 @@
 using torch::autograd::AutogradContext;
 using torch::autograd::Function;
 using tensor_list = std::vector<at::Tensor>;
 
-static c10::SmallVector<int64_t, SIZE> get_output_size(const at::Tensor &tensor1,
-                                                        const at::Tensor &tensor2) {
+static c10::SmallVector<int64_t, SIZE>
+get_output_size(const at::Tensor &tensor1, const at::Tensor &tensor2) {
   c10::SmallVector<int64_t, SIZE> output_size;
   auto dim_tensor1 = tensor1.dim();
   auto dim_tensor2 = tensor2.dim();
   TORCH_CHECK(dim_tensor1 > 0 && dim_tensor2 > 0,
-              "matmul got error dimentions: ", "(", dim_tensor1, ", ", dim_tensor2, ")");
+              "matmul got error dimentions: ", "(", dim_tensor1, ", ",
+              dim_tensor2, ")");
 
   if (dim_tensor1 == 1 && dim_tensor2 == 1) {
     output_size = {};
@@ -76,13 +77,15 @@ static c10::SmallVector<int64_t, SIZE> get_output_size(const at::Tensor &tensor1
     at::IntArrayRef batch_tensor1(tensor1.sizes().data(), dim_tensor1 - 2);
     int64_t p = tensor2.size(-1);
     at::IntArrayRef batch_tensor2(tensor2.sizes().data(), dim_tensor2 - 2);
-    std::vector<int64_t> expand_batch_portion = at::infer_size(batch_tensor1, batch_tensor2);
+    std::vector<int64_t> expand_batch_portion =
+        at::infer_size(batch_tensor1, batch_tensor2);
 
     c10::SmallVector<int64_t, SIZE> output_expand_size(expand_batch_portion);
     output_expand_size.insert(output_expand_size.end(), {n, p});
     output_size = output_expand_size;
   } else {
-    TORCH_CHECK(false, "matmul got error sizes: ", "(", dim_tensor1, ", ", dim_tensor2, ")");
+    TORCH_CHECK(false, "matmul got error sizes: ", "(", dim_tensor1, ", ",
+                dim_tensor2, ")");
   }
 
   return output_size;
@@ -120,7 +123,8 @@ at::Tensor matmul_mat1_backward(const at::Tensor self, const at::Tensor other,
 
   at::Tensor output;
   if (mat1.dim() == 2 && mat2.dim() > 2) { // mm
-    output = OpPreparation::ApplyTensorWithoutFormat(mat1.sizes(), grad.options());
+    output =
+        OpPreparation::ApplyTensorWithoutFormat(mat1.sizes(), grad.options());
     mat2 = mat2.transpose(-2, -1);
     mat2 = mat2.reshape({-1, mat2.size(-1)});
     grad = grad.view({grad.size(-2), -1});
@@ -130,7 +134,8 @@ at::Tensor matmul_mat1_backward(const at::Tensor self, const at::Tensor other,
   } else { // bmm
     mat2 = mat2.transpose(-2, -1);
     auto expend_sizes = get_output_size(grad, mat2);
-    output = OpPreparation::ApplyTensorWithoutFormat(expend_sizes, grad.options());
+    output =
+        OpPreparation::ApplyTensorWithoutFormat(expend_sizes, grad.options());
     matmul_implement_npu(output, grad, mat2);
   }
 
@@ -160,7 +165,8 @@ at::Tensor matmul_mat2_backward(const at::Tensor self, const at::Tensor other,
 
   at::Tensor output;
   if (mat2.dim() == 2 && mat1.dim() > 2) { // mm
-    output = OpPreparation::ApplyTensorWithoutFormat(mat2.sizes(), mat1.options());
+    output =
+        OpPreparation::ApplyTensorWithoutFormat(mat2.sizes(), mat1.options());
     mat1 = mat1.reshape({-1, mat1.size(-1)});
     grad = grad.reshape({-1, grad.size(-1)});
     mat1 = mat1.transpose(-2, -1);
@@ -169,22 +175,31 @@ at::Tensor matmul_mat2_backward(const at::Tensor self, const at::Tensor other,
   } else { // bmm
     mat1 = mat1.transpose(-2, -1);
     auto expend_sizes = get_output_size(mat1, grad);
-    output = OpPreparation::ApplyTensorWithoutFormat(expend_sizes, mat1.options());
+    output =
+        OpPreparation::ApplyTensorWithoutFormat(expend_sizes, mat1.options());
     matmul_implement_npu(output, mat1, grad);
   }
 
   return output;
 }
 
-std::tuple<at::Tensor, at::Tensor> matmul_backward(const at::Tensor &grad,
-                                                   const at::Tensor &self,
-                                                   const at::Tensor &other) {
+std::tuple<at::Tensor, at::Tensor> NPUNativeOpApiFunctions::matmul_backward(
+    const at::Tensor &grad, const at::Tensor &self,
+    const at::Tensor &other, std::array<bool, 2> grad_input_mask) {
   if (!grad.defined()) {
     return std::make_tuple(at::Tensor(), at::Tensor());
   }
+
   // backward mat1 and mat2 separately
-  auto self_grad = matmul_mat1_backward(self, other, grad);
-  auto other_grad = matmul_mat2_backward(self, other, grad);
+  at::Tensor self_grad;
+  at::Tensor other_grad;
+  if (grad_input_mask[1]) {
+    other_grad = matmul_mat2_backward(self, other, grad);
+  }
+
+  if (grad_input_mask[0]) {
+    self_grad = matmul_mat1_backward(self, other, grad);
+  }
 
   // strip added dim: (5,1)->(5)
   if (other.dim() == 1 && other_grad.size(-1) == 1) {
@@ -198,35 +213,18 @@ at::Tensor matmul_forward(const at::Tensor &self, const at::Tensor &mat2) {
   at::NoNamesGuard guard;
   auto output_size = get_output_size(self, mat2);
-  auto out = OpPreparation::ApplyTensorWithoutFormat(output_size, self.options());
+  auto out =
+      OpPreparation::ApplyTensorWithoutFormat(output_size, self.options());
   matmul_implement_npu(out, self, mat2);
   return out;
 }
 
-class NPUMatmulOpApiFunction: public torch::autograd::Function<NPUMatmulOpApiFunction> {
-public:
-  static at::Tensor forward(AutogradContext *ctx, const at::Tensor &self,
-                            const at::Tensor &other) {
-    at::AutoNonVariableTypeMode g;
-    ctx->save_for_backward({self, other});
-    auto result = matmul_forward(self, other);
-    return result;
-  }
-  static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs) {
-    auto saved = ctx->get_saved_variables();
-    auto self = saved[0];
-    auto other = saved[1];
-    auto grads = matmul_backward(grad_outputs[0], self, other);
-    tensor_list output = {std::get<0>(grads), std::get<1>(grads)};
-    return output;
-  }
-};
-
 at::Tensor NPUNativeOpApiFunctions::matmul(const at::Tensor &tensor1,
                                            const at::Tensor &tensor2) {
   DO_COMPATIBILITY(aclnnMatmul, NPUNativeFunctions::matmul(tensor1, tensor2));
-  auto maybe_outnames = at::namedinference::compute_matmul_outnames(tensor1, tensor2);
-  auto result = NPUMatmulOpApiFunction::apply(tensor1, tensor2);
+  auto maybe_outnames =
+      at::namedinference::compute_matmul_outnames(tensor1, tensor2);
+  auto result = matmul_forward(tensor1, tensor2);
   at::namedinference::propagate_names_if_nonempty(result, maybe_outnames);
   return result;
 }
 
@@ -234,8 +232,10 @@ at::Tensor NPUNativeOpApiFunctions::matmul(const at::Tensor &tensor1,
 at::Tensor &NPUNativeOpApiFunctions::matmul_out(const at::Tensor &tensor1,
                                                 const at::Tensor &tensor2,
                                                 at::Tensor &result) {
-  DO_COMPATIBILITY(aclnnMatmul, NPUNativeFunctions::matmul_out(tensor1, tensor2, result));
-  auto maybe_outnames = at::namedinference::compute_matmul_outnames(tensor1, tensor2);
+  DO_COMPATIBILITY(aclnnMatmul,
+                   NPUNativeFunctions::matmul_out(tensor1, tensor2, result));
+  auto maybe_outnames =
+      at::namedinference::compute_matmul_outnames(tensor1, tensor2);
   // matmul_out don't support backward
   auto output_size = get_output_size(tensor1, tensor2);
   OpPreparation::CheckOut({tensor1, tensor2}, result, tensor1, output_size);
diff --git a/torch_npu/csrc/framework/autograd/FunctionsManual.cpp b/torch_npu/csrc/framework/autograd/FunctionsManual.cpp
index 6645f9ff3e0..a30a5821cad 100644
--- a/torch_npu/csrc/framework/autograd/FunctionsManual.cpp
+++ b/torch_npu/csrc/framework/autograd/FunctionsManual.cpp
@@ -43,7 +43,7 @@
 #include
 #include
 
-#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+#include "torch_npu/csrc/aten/NPUNativeOpApiFunctions.h"
 #include "FunctionsManual.h"
 
 // Helper functions for autogenerated code
@@ -127,6 +127,14 @@ Tensor fast_gelu_backward(const Tensor& grad, const Tensor& self) {
   return at_npu::native::NPUNativeFunctions::npu_fast_gelu_backward(grad, self);
 }
 
+std::tuple<at::Tensor, at::Tensor>
+matmul_backward(const at::Tensor &grad, const at::Tensor &self,
+                const at::Tensor &other, std::array<bool, 2> grad_input_mask) {
+
+  return at_npu::native::NPUNativeOpApiFunctions::matmul_backward(
+      grad, self, other, grad_input_mask);
+}
+
 std::tuple<Tensor, Tensor, Tensor> rotary_mul_backward(
     const Tensor& grad,
     const Tensor& self,
diff --git a/torch_npu/csrc/framework/autograd/FunctionsManual.h b/torch_npu/csrc/framework/autograd/FunctionsManual.h
index a406a3d1fec..69180b8735a 100644
--- a/torch_npu/csrc/framework/autograd/FunctionsManual.h
+++ b/torch_npu/csrc/framework/autograd/FunctionsManual.h
@@ -59,6 +59,10 @@ std::vector<Tensor> not_implemented_list(const char* name, const char* reason=""
 
 Tensor fast_gelu_backward(const Tensor& grad, const Tensor& self);
 
+std::tuple<at::Tensor, at::Tensor>
+matmul_backward(const at::Tensor &grad, const at::Tensor &self,
+                const at::Tensor &other, std::array<bool, 2> grad_input_mask);
+
 std::tuple<Tensor, Tensor, Tensor> rotary_mul_backward(
     const Tensor& grad,
     const Tensor& self,
--
Gitee