From 0b2916cac3e1a7f1ce1ccab7f4fa2315358be0d2 Mon Sep 17 00:00:00 2001
From: suyafeng s00639171
Date: Mon, 4 Sep 2023 12:56:45 +0800
Subject: [PATCH] [feature]matmul backward auto binding

---
 codegen/autograd/aclnn_derivatives.yaml       |  2 +
 torch_npu/csrc/aten/npu_native_functions.yaml | 12 +--
 torch_npu/csrc/aten/ops/MatmulKernelNpu.cpp   |  8 ++
 .../aten/ops/op_api/MatmulKernelNpuOpApi.cpp  | 78 +++++++++----------
 .../framework/autograd/FunctionsManual.cpp    | 10 ++-
 .../csrc/framework/autograd/FunctionsManual.h |  4 +
 6 files changed, 69 insertions(+), 45 deletions(-)

diff --git a/codegen/autograd/aclnn_derivatives.yaml b/codegen/autograd/aclnn_derivatives.yaml
index 89e061624f2..d2a3c3c11d8 100644
--- a/codegen/autograd/aclnn_derivatives.yaml
+++ b/codegen/autograd/aclnn_derivatives.yaml
@@ -1,3 +1,5 @@
 # Defines derivative formulas and Python signatures of methods on Variable
 #
 #If you need any guidance, please refer to the comments in derivatives.yaml in PyTorch.
+- name: matmul(Tensor self, Tensor other) -> Tensor
+  self, other: matmul_backward(grad, self, other, grad_input_mask)
diff --git a/torch_npu/csrc/aten/npu_native_functions.yaml b/torch_npu/csrc/aten/npu_native_functions.yaml
index cd502a07f77..0d76bfdb800 100644
--- a/torch_npu/csrc/aten/npu_native_functions.yaml
+++ b/torch_npu/csrc/aten/npu_native_functions.yaml
@@ -753,7 +753,7 @@ supported:
     op_api: True
   - is_pinned
   - func: is_set_to
-    op_api: False
+    op_api: False
   - func: isclose
     op_api: True
   - func: isfinite
@@ -1672,12 +1672,12 @@ supported:
     op_api: False
  - func: silu_backward
    op_api: False
+  - func: matmul
+    op_api: True
+  - func: matmul.out
+    op_api: True
 
 autograd:
-  - func: matmul
-    op_api: False
-  - func: matmul.out
-    op_api: False
   - celu
   - celu_
   - func: elu.out
@@ -1977,6 +1977,8 @@ unsupported:
   - special_zeta.other_scalar_out
 
 custom:
+  - func: matmul_backward(Tensor grad, Tensor self, Tensor other, bool[2] grad_input_mask) -> (Tensor, Tensor)
+    op_api: True
   - func: _npu_storage_resize(Tensor self, int size) -> Tensor
   - func: npu_change_data_ptr(Tensor dst, Tensor src, int index) -> int
   - func: npu_transpose(Tensor self, int[] perm, bool require_contiguous=True) -> Tensor
diff --git a/torch_npu/csrc/aten/ops/MatmulKernelNpu.cpp b/torch_npu/csrc/aten/ops/MatmulKernelNpu.cpp
index 59b1c376e59..9e878ecdf83 100644
--- a/torch_npu/csrc/aten/ops/MatmulKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/MatmulKernelNpu.cpp
@@ -155,5 +155,13 @@ at::Tensor& NPUNativeFunctions::matmul_out(const at::Tensor & tensor1, const at:
   at::namedinference::propagate_names_if_nonempty(result, maybe_outnames);
   return result;
 }
+
+std::tuple<at::Tensor, at::Tensor> NPUNativeFunctions::matmul_backward(
+    const at::Tensor &grad, const at::Tensor &self, const at::Tensor &other,
+    std::array<bool, 2> grad_input_mask) {
+  AT_ERROR("!!!!!Caution temp codes!!!!!!!!!!! ");
+  return std::make_tuple(at::Tensor(), at::Tensor());
+}
+
 } // namespace native
 } // namespace at_npu
diff --git a/torch_npu/csrc/aten/ops/op_api/MatmulKernelNpuOpApi.cpp b/torch_npu/csrc/aten/ops/op_api/MatmulKernelNpuOpApi.cpp
index 97cb5376808..42b01e13a5e 100644
--- a/torch_npu/csrc/aten/ops/op_api/MatmulKernelNpuOpApi.cpp
+++ b/torch_npu/csrc/aten/ops/op_api/MatmulKernelNpuOpApi.cpp
@@ -14,10 +14,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include
 #include "torch_npu/csrc/aten/NPUNativeOpApiFunctions.h"
 #include "torch_npu/csrc/aten/ops/op_api/op_api_common.h"
 #include "torch_npu/csrc/framework/utils/OpAdapter.h"
+#include
 
 namespace at_npu {
 namespace native {
@@ -29,14 +29,15 @@
 using torch::autograd::AutogradContext;
 using torch::autograd::Function;
 using tensor_list = std::vector<at::Tensor>;
 
-static c10::SmallVector<int64_t, SIZE> get_output_size(const at::Tensor &tensor1,
-                                                        const at::Tensor &tensor2) {
+static c10::SmallVector<int64_t, SIZE>
+get_output_size(const at::Tensor &tensor1, const at::Tensor &tensor2) {
   c10::SmallVector<int64_t, SIZE> output_size;
   auto dim_tensor1 = tensor1.dim();
   auto dim_tensor2 = tensor2.dim();
   TORCH_CHECK(dim_tensor1 > 0 && dim_tensor2 > 0,
-              "matmul got error dimentions: ", "(", dim_tensor1, ", ", dim_tensor2, ")");
+              "matmul got error dimentions: ", "(", dim_tensor1, ", ",
+              dim_tensor2, ")");
 
   if (dim_tensor1 == 1 && dim_tensor2 == 1) {
     output_size = {};
@@ -76,13 +77,15 @@ static c10::SmallVector<int64_t, SIZE> get_output_size(const at::Tensor &tensor1
     at::IntArrayRef batch_tensor1(tensor1.sizes().data(), dim_tensor1 - 2);
     int64_t p = tensor2.size(-1);
     at::IntArrayRef batch_tensor2(tensor2.sizes().data(), dim_tensor2 - 2);
-    std::vector<int64_t> expand_batch_portion = at::infer_size(batch_tensor1, batch_tensor2);
+    std::vector<int64_t> expand_batch_portion =
+        at::infer_size(batch_tensor1, batch_tensor2);
 
     c10::SmallVector<int64_t, SIZE> output_expand_size(expand_batch_portion);
     output_expand_size.insert(output_expand_size.end(), {n, p});
     output_size = output_expand_size;
   } else {
-    TORCH_CHECK(false, "matmul got error sizes: ", "(", dim_tensor1, ", ", dim_tensor2, ")");
+    TORCH_CHECK(false, "matmul got error sizes: ", "(", dim_tensor1, ", ",
+                dim_tensor2, ")");
   }
 
   return output_size;
@@ -120,7 +123,8 @@ at::Tensor matmul_mat1_backward(const at::Tensor self, const at::Tensor other,
 
   at::Tensor output;
   if (mat1.dim() == 2 && mat2.dim() > 2) { // mm
-    output = OpPreparation::ApplyTensorWithoutFormat(mat1.sizes(), grad.options());
+    output =
+        OpPreparation::ApplyTensorWithoutFormat(mat1.sizes(), grad.options());
     mat2 = mat2.transpose(-2, -1);
     mat2 = mat2.reshape({-1, mat2.size(-1)});
     grad = grad.view({grad.size(-2), -1});
@@ -130,7 +134,8 @@ at::Tensor matmul_mat1_backward(const at::Tensor self, const at::Tensor other,
   } else { // bmm
     mat2 = mat2.transpose(-2, -1);
     auto expend_sizes = get_output_size(grad, mat2);
-    output = OpPreparation::ApplyTensorWithoutFormat(expend_sizes, grad.options());
+    output =
+        OpPreparation::ApplyTensorWithoutFormat(expend_sizes, grad.options());
     matmul_implement_npu(output, grad, mat2);
   }
 
@@ -160,7 +165,8 @@ at::Tensor matmul_mat2_backward(const at::Tensor self, const at::Tensor other,
 
   at::Tensor output;
   if (mat2.dim() == 2 && mat1.dim() > 2) { // mm
-    output = OpPreparation::ApplyTensorWithoutFormat(mat2.sizes(), mat1.options());
+    output =
+        OpPreparation::ApplyTensorWithoutFormat(mat2.sizes(), mat1.options());
     mat1 = mat1.reshape({-1, mat1.size(-1)});
     grad = grad.reshape({-1, grad.size(-1)});
     mat1 = mat1.transpose(-2, -1);
@@ -169,22 +175,31 @@ at::Tensor matmul_mat2_backward(const at::Tensor self, const at::Tensor other,
   } else { // bmm
     mat1 = mat1.transpose(-2, -1);
     auto expend_sizes = get_output_size(mat1, grad);
-    output = OpPreparation::ApplyTensorWithoutFormat(expend_sizes, mat1.options());
+    output =
+        OpPreparation::ApplyTensorWithoutFormat(expend_sizes, mat1.options());
     matmul_implement_npu(output, mat1, grad);
   }
 
   return output;
 }
 
-std::tuple<at::Tensor, at::Tensor> matmul_backward(const at::Tensor &grad,
-                                                   const at::Tensor &self,
-                                                   const at::Tensor &other) {
+std::tuple<at::Tensor, at::Tensor> NPUNativeOpApiFunctions::matmul_backward(
+    const at::Tensor &grad, const at::Tensor &self,
+    const at::Tensor &other, std::array<bool, 2> grad_input_mask) {
   if (!grad.defined()) {
     return std::make_tuple(at::Tensor(), at::Tensor());
   }
+
   // backward mat1 and mat2 separately
-  auto self_grad = matmul_mat1_backward(self, other, grad);
-  auto other_grad = matmul_mat2_backward(self, other, grad);
+  at::Tensor self_grad;
+  at::Tensor other_grad;
+  if (grad_input_mask[1]) {
+    other_grad = matmul_mat2_backward(self, other, grad);
+  }
+
+  if (grad_input_mask[0]) {
+    self_grad = matmul_mat1_backward(self, other, grad);
+  }
 
   // strip added dim: (5,1)->(5)
   if (other.dim() == 1 && other_grad.size(-1) == 1) {
@@ -198,35 +213,18 @@ at::Tensor matmul_forward(const at::Tensor &self, const at::Tensor &mat2) {
   at::NoNamesGuard guard;
   auto output_size = get_output_size(self, mat2);
-  auto out = OpPreparation::ApplyTensorWithoutFormat(output_size, self.options());
+  auto out =
+      OpPreparation::ApplyTensorWithoutFormat(output_size, self.options());
   matmul_implement_npu(out, self, mat2);
   return out;
 }
 
-class NPUMatmulOpApiFunction: public torch::autograd::Function<NPUMatmulOpApiFunction> {
-public:
-  static at::Tensor forward(AutogradContext *ctx, const at::Tensor &self,
-                            const at::Tensor &other) {
-    at::AutoNonVariableTypeMode g;
-    ctx->save_for_backward({self, other});
-    auto result = matmul_forward(self, other);
-    return result;
-  }
-  static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs) {
-    auto saved = ctx->get_saved_variables();
-    auto self = saved[0];
-    auto other = saved[1];
-    auto grads = matmul_backward(grad_outputs[0], self, other);
-    tensor_list output = {std::get<0>(grads), std::get<1>(grads)};
-    return output;
-  }
-};
-
 at::Tensor NPUNativeOpApiFunctions::matmul(const at::Tensor &tensor1,
                                            const at::Tensor &tensor2) {
   DO_COMPATIBILITY(aclnnMatmul, NPUNativeFunctions::matmul(tensor1, tensor2));
-  auto maybe_outnames = at::namedinference::compute_matmul_outnames(tensor1, tensor2);
-  auto result = NPUMatmulOpApiFunction::apply(tensor1, tensor2);
+  auto maybe_outnames =
+      at::namedinference::compute_matmul_outnames(tensor1, tensor2);
+  auto result = matmul_forward(tensor1, tensor2);
   at::namedinference::propagate_names_if_nonempty(result, maybe_outnames);
   return result;
 }
 
@@ -234,8 +232,10 @@ at::Tensor NPUNativeOpApiFunctions::matmul(const at::Tensor &tensor1,
 at::Tensor &NPUNativeOpApiFunctions::matmul_out(const at::Tensor &tensor1,
                                                 const at::Tensor &tensor2,
                                                 at::Tensor &result) {
-  DO_COMPATIBILITY(aclnnMatmul, NPUNativeFunctions::matmul_out(tensor1, tensor2, result));
-  auto maybe_outnames = at::namedinference::compute_matmul_outnames(tensor1, tensor2);
+  DO_COMPATIBILITY(aclnnMatmul,
+                   NPUNativeFunctions::matmul_out(tensor1, tensor2, result));
+  auto maybe_outnames =
+      at::namedinference::compute_matmul_outnames(tensor1, tensor2);
   // matmul_out don't support backward
   auto output_size = get_output_size(tensor1, tensor2);
   OpPreparation::CheckOut({tensor1, tensor2}, result, tensor1, output_size);
diff --git a/torch_npu/csrc/framework/autograd/FunctionsManual.cpp b/torch_npu/csrc/framework/autograd/FunctionsManual.cpp
index 6645f9ff3e0..a30a5821cad 100644
--- a/torch_npu/csrc/framework/autograd/FunctionsManual.cpp
+++ b/torch_npu/csrc/framework/autograd/FunctionsManual.cpp
@@ -43,7 +43,7 @@
 #include
 #include
 
-#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+#include "torch_npu/csrc/aten/NPUNativeOpApiFunctions.h"
 #include "FunctionsManual.h"
 
 // Helper functions for autogenerated code
@@ -127,6 +127,14 @@ Tensor fast_gelu_backward(const Tensor& grad, const Tensor& self) {
   return at_npu::native::NPUNativeFunctions::npu_fast_gelu_backward(grad, self);
 }
 
+std::tuple<at::Tensor, at::Tensor>
+matmul_backward(const at::Tensor &grad, const at::Tensor &self,
+                const at::Tensor &other, std::array<bool, 2> grad_input_mask) {
+
+  return at_npu::native::NPUNativeOpApiFunctions::matmul_backward(
+      grad, self, other, grad_input_mask);
+}
+
 std::tuple<Tensor, Tensor, Tensor> rotary_mul_backward(
     const Tensor& grad,
     const Tensor& self,
diff --git a/torch_npu/csrc/framework/autograd/FunctionsManual.h b/torch_npu/csrc/framework/autograd/FunctionsManual.h
index a406a3d1fec..69180b8735a 100644
--- a/torch_npu/csrc/framework/autograd/FunctionsManual.h
+++ b/torch_npu/csrc/framework/autograd/FunctionsManual.h
@@ -59,6 +59,10 @@ std::vector<Tensor> not_implemented_list(const char* name, const char* reason=""
 
 Tensor fast_gelu_backward(const Tensor& grad, const Tensor& self);
 
+std::tuple<at::Tensor, at::Tensor>
+matmul_backward(const at::Tensor &grad, const at::Tensor &self,
+                const at::Tensor &other, std::array<bool, 2> grad_input_mask);
+
 std::tuple<Tensor, Tensor, Tensor> rotary_mul_backward(
     const Tensor& grad,
     const Tensor& self,
--
Gitee