From 413e33e6086f19b6ef576e3d092d709227ebebb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=A4=E5=AE=89=E5=8D=87?= Date: Fri, 11 Feb 2022 11:50:55 +0800 Subject: [PATCH 01/12] Add License for version file. --- torch_npu/version.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/torch_npu/version.py b/torch_npu/version.py index 32fb6cf7bf..8790208db4 100644 --- a/torch_npu/version.py +++ b/torch_npu/version.py @@ -1 +1,16 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + __version__ = "1.8.1rc1" -- Gitee From 83bb19062c26ed22fce1cebc77ad2b89fd142a00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=A4=E5=AE=89=E5=8D=87?= Date: Sat, 12 Feb 2022 11:12:53 +0800 Subject: [PATCH 02/12] Fix calling custom ops. 
--- .../PyTorch Operator Development Guide.md | 6 +++--- ...45\274\200\345\217\221\346\214\207\345\215\227.md" | 6 +++--- torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp | 4 ++-- torch_npu/csrc/aten/common/TensorFactories.cpp | 11 ++++++----- torch_npu/csrc/aten/ops/AddKernelNpu.cpp | 6 +++--- torch_npu/csrc/aten/ops/AnyKernelNpu.cpp | 4 ++-- torch_npu/csrc/aten/ops/BmmKernelNpu.cpp | 4 ++-- torch_npu/csrc/aten/ops/BroadcastKernelNpu.cpp | 2 +- torch_npu/csrc/aten/ops/DivKernelNpu.cpp | 4 ++-- torch_npu/csrc/aten/ops/EqKernelNpu.cpp | 8 ++++---- torch_npu/csrc/aten/ops/GtKernelNpu.cpp | 8 ++++---- torch_npu/csrc/aten/ops/LtKernelNpu.cpp | 8 ++++---- torch_npu/csrc/aten/ops/MeanKernelNpu.cpp | 2 +- torch_npu/csrc/aten/ops/MmKernelNpu.cpp | 4 ++-- torch_npu/csrc/aten/ops/MulKernelNpu.cpp | 4 ++-- torch_npu/csrc/aten/ops/NegKernelNpu.cpp | 2 +- torch_npu/csrc/aten/ops/NormalKernelNpu.cpp | 2 +- torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp | 2 +- torch_npu/csrc/aten/ops/ReluKernelNpu.cpp | 2 +- torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp | 2 +- torch_npu/csrc/aten/ops/SubKernelNpu.cpp | 4 ++-- torch_npu/csrc/aten/ops/SumKernelNpu.cpp | 2 +- .../csrc/aten/ops/ThresholdBackwardKernelNpu.cpp | 2 +- torch_npu/csrc/aten/ops/TopKKernelNpu.cpp | 8 ++++---- torch_npu/csrc/aten/ops/TransposeKernelNpu.cpp | 2 +- torch_npu/csrc/aten/ops/ZerosLikeKernelNpu.cpp | 9 +++------ .../aten/ops/convolution/Conv2dBackwardKernelNpu.cpp | 8 ++++---- .../aten/ops/convolution/Conv3dBackwardKernelNpu.cpp | 6 +++--- .../aten/ops/convolution/ConvTranspose2dKernelNpu.cpp | 2 +- .../csrc/aten/ops/loss/NLLLossBackwardKernelNpu.cpp | 2 +- torch_npu/csrc/aten/ops/loss/NLLLossKernelNpu.cpp | 4 ++-- torch_npu/csrc/distributed/reducer.cpp | 7 ++++--- torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp | 2 +- torch_npu/csrc/framework/contiguous/combined_opt.cpp | 4 +++- torch_npu/csrc/framework/utils/NpuUtils.cpp | 6 +++--- torch_npu/csrc/framework/utils/OpPreparation.cpp | 6 ++++-- 36 files 
changed, 84 insertions(+), 81 deletions(-) diff --git a/docs/en/PyTorch Operator Development Guide/PyTorch Operator Development Guide.md b/docs/en/PyTorch Operator Development Guide/PyTorch Operator Development Guide.md index 48ddd58898..698b2ac1c7 100644 --- a/docs/en/PyTorch Operator Development Guide/PyTorch Operator Development Guide.md +++ b/docs/en/PyTorch Operator Development Guide/PyTorch Operator Development Guide.md @@ -515,7 +515,7 @@ The following uses the torch.add\(\) operator as an example to describe how to a Scalar other_c1_offset( other.storage_offset() / (other.size(2) * other.size(3) * c0_len)); Scalar stride_len(self.size(1) / c0_len); - Tensor result = at::npu_stride_add( + Tensor result = NPUNativeFunctions::npu_stride_add( self_use, other_use, self_c1_offset, other_c1_offset, stride_len); return result; } @@ -524,7 +524,7 @@ The following uses the torch.add\(\) operator as an example to describe how to a auto outputSize = broadcast_ops_npu_output_size(self, other); // construct the output tensor of the NPU - Tensor result = at::empty_with_format( + Tensor result = NPUNativeFunctions::empty_with_format( outputSize, outputTensor.options(), CalcuOpUtil::get_tensor_npu_format(outputTensor)); @@ -541,7 +541,7 @@ The following uses the torch.add\(\) operator as an example to describe how to a // calculate the output size auto outputSize = input_same_output_size(self); // construct the output tensor of the NPU - Tensor result = at::empty_with_format( + Tensor result = NPUNativeFunctions::empty_with_format( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU diff --git "a/docs/zh/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227.md" 
"b/docs/zh/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227.md" index f48ac35865..d38aa6efd3 100644 --- "a/docs/zh/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227.md" +++ "b/docs/zh/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227.md" @@ -515,7 +515,7 @@ PyTorch算子开发包含TBE算子开发和PyTorch框架下的算子适配。 Scalar other_c1_offset( other.storage_offset() / (other.size(2) * other.size(3) * c0_len)); Scalar stride_len(self.size(1) / c0_len); - Tensor result = at::npu_stride_add( + Tensor result = NPUNativeFunctions::npu_stride_add( self_use, other_use, self_c1_offset, other_c1_offset, stride_len); return result; } @@ -524,7 +524,7 @@ PyTorch算子开发包含TBE算子开发和PyTorch框架下的算子适配。 auto outputSize = broadcast_ops_npu_output_size(self, other); // construct the output tensor of the NPU - Tensor result = at::empty_with_format( + Tensor result = NPUNativeFunctions::empty_with_format( outputSize, outputTensor.options(), CalcuOpUtil::get_tensor_npu_format(outputTensor)); @@ -541,7 +541,7 @@ PyTorch算子开发包含TBE算子开发和PyTorch框架下的算子适配。 // calculate the output size auto outputSize = input_same_output_size(self); // construct the output tensor of the NPU - Tensor result = at::empty_with_format( + Tensor result = NPUNativeFunctions::empty_with_format( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp b/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp index 2139cfdb4a..506686f3c6 100644 --- a/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp +++ b/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp @@ -76,7 +76,7 @@ at::Tensor 
NPUNativeFunctions::npu_format_cast( TORCH_CHECK(src.scalar_type() == at::ScalarType::Float || src.scalar_type() == at::ScalarType::Half, "can not cast format when src is not float32 or float16"); - at::Tensor dst = at::empty_with_format( + at::Tensor dst = NPUNativeFunctions::empty_with_format( src_desc.base_sizes_, src.options(), acl_format); // calculate the output result of the NPU @@ -105,7 +105,7 @@ at::Tensor& NPUNativeFunctions::npu_format_cast_( TORCH_CHECK(src.scalar_type() == at::ScalarType::Float || src.scalar_type() == at::ScalarType::Half, "can not cast format when src is not float32 or float16"); - at::Tensor dst = at::empty_with_format( + at::Tensor dst = NPUNativeFunctions::empty_with_format( src_desc.base_sizes_, src.options(), acl_format); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/common/TensorFactories.cpp b/torch_npu/csrc/aten/common/TensorFactories.cpp index 3d95b195a1..fdb4634709 100644 --- a/torch_npu/csrc/aten/common/TensorFactories.cpp +++ b/torch_npu/csrc/aten/common/TensorFactories.cpp @@ -30,11 +30,12 @@ #include #include #include +#include + #include "torch_npu/csrc/aten/common/ResizeNpu.h" #include "torch_npu/csrc/framework/StorageDescHelper.h" #include "torch_npu/csrc/framework/InferFormat.h" #include "torch_npu/csrc/aten/common/InnerNpuNativeFunction.h" -#include #include "torch_npu/csrc/framework/utils/OpAdapter.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" #include "torch_npu/csrc/core/tensor_impl.h" @@ -230,7 +231,7 @@ namespace at_npu { auto npu_format = self.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_; - result = at::empty_with_format(self.sizes(), self.options(), npu_format); + result = NPUNativeFunctions::empty_with_format(self.sizes(), self.options(), npu_format); } } @@ -347,7 +348,7 @@ namespace at_npu options.layout(layout_opt); options.pinned_memory(pin_memory_opt); at::Tensor result = - at::empty_with_format(size, options, dst_format); + 
NPUNativeFunctions::empty_with_format(size, options, dst_format); if (names.has_value()) { internal_set_names_inplace(result, names); @@ -361,7 +362,7 @@ namespace at_npu int64_t dst_format) { at::Tensor result = - at::empty_with_format(size, options, dst_format); + NPUNativeFunctions::empty_with_format(size, options, dst_format); if (names.has_value()) { internal_set_names_inplace(result, names); @@ -376,7 +377,7 @@ namespace at_npu int64_t dst_format) { at::Tensor result = - at::empty_with_format(size, options, dst_format); + NPUNativeFunctions::empty_with_format(size, options, dst_format); if (names.has_value()) { internal_set_names_inplace(result, names); diff --git a/torch_npu/csrc/aten/ops/AddKernelNpu.cpp b/torch_npu/csrc/aten/ops/AddKernelNpu.cpp index 363599e38a..08793a06e7 100644 --- a/torch_npu/csrc/aten/ops/AddKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/AddKernelNpu.cpp @@ -170,7 +170,7 @@ namespace at_npu else { c10::NPUStorageDesc src_desc = src.storage().unsafeGetStorageImpl()->npu_desc_; - at::Tensor src_new = at::empty_with_format( + at::Tensor src_new = NPUNativeFunctions::empty_with_format( src_desc.base_sizes_, src.options(), ACL_FORMAT_NC1HWC0); src_new.set_( src.storage(), @@ -206,7 +206,7 @@ namespace at_npu auto outputSize = broadcast_ops_npu_output_size(self, other); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = NPUNativeFunctions::empty_with_format( outputSize, outputTensor.options(), CalcuOpUtil::get_tensor_npu_format(outputTensor)); @@ -223,7 +223,7 @@ namespace at_npu // calculate the output size auto outputSize = input_same_output_size(self); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = NPUNativeFunctions::empty_with_format( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/AnyKernelNpu.cpp 
b/torch_npu/csrc/aten/ops/AnyKernelNpu.cpp index 7cdac57d78..63b1271f93 100644 --- a/torch_npu/csrc/aten/ops/AnyKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/AnyKernelNpu.cpp @@ -70,7 +70,7 @@ at::Tensor NPUNativeFunctions::any(const at::Tensor& self, int64_t dim, bool kee auto outputSize = reduce_ops_npu_output_size(self, dims, keepdim); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = NPUNativeFunctions::empty_with_format( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU @@ -88,7 +88,7 @@ at::Tensor NPUNativeFunctions::any(const at::Tensor& self) { // when self's dim = 0, convert [1] tensor and reduce it if (self.dim() == 0) { at::Tensor self_tmp = self; - self_tmp = at::empty_with_format( + self_tmp = NPUNativeFunctions::empty_with_format( {1}, self.options().dtype(at::ScalarType::Float), CalcuOpUtil::get_tensor_npu_format(self)) diff --git a/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp b/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp index 38aceb87fd..c93fbe3cc8 100644 --- a/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp @@ -73,9 +73,9 @@ at::Tensor NPUNativeFunctions::bmm(const at::Tensor& self, const at::Tensor& mat // 检查是否指定mm输出为NCHW。待NLP模型总体策略制定后删去 if ((self.scalar_type() == at::ScalarType::Float || self.scalar_type() == at::ScalarType::Half) && !c10::npu::OptionsManager::CheckSwitchMMOutputEnable()) { - result = at::empty_with_format(outputSize, self.options(), ACL_FORMAT_FRACTAL_NZ); + result = NPUNativeFunctions::empty_with_format(outputSize, self.options(), ACL_FORMAT_FRACTAL_NZ); } else { - result = at::empty_with_format(outputSize, self.options(), ACL_FORMAT_ND); + result = NPUNativeFunctions::empty_with_format(outputSize, self.options(), ACL_FORMAT_ND); } // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/BroadcastKernelNpu.cpp 
b/torch_npu/csrc/aten/ops/BroadcastKernelNpu.cpp index 79c6fdae29..5903959bab 100644 --- a/torch_npu/csrc/aten/ops/BroadcastKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/BroadcastKernelNpu.cpp @@ -53,7 +53,7 @@ namespace at_npu input = input.to(at::kInt); } - at::Tensor result = at::empty_with_format( + at::Tensor result = NPUNativeFunctions::empty_with_format( size, input.options(), CalcuOpUtil::get_tensor_npu_format(self)); diff --git a/torch_npu/csrc/aten/ops/DivKernelNpu.cpp b/torch_npu/csrc/aten/ops/DivKernelNpu.cpp index a033c8c511..76865efccb 100644 --- a/torch_npu/csrc/aten/ops/DivKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/DivKernelNpu.cpp @@ -85,7 +85,7 @@ namespace at_npu auto outputSize = broadcast_ops_npu_output_size(self, other); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = NPUNativeFunctions::empty_with_format( outputSize, outputTensor.options(), CalcuOpUtil::get_tensor_npu_format(outputTensor)); @@ -102,7 +102,7 @@ namespace at_npu auto outputSize = input_same_output_size(self); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = NPUNativeFunctions::empty_with_format( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); diff --git a/torch_npu/csrc/aten/ops/EqKernelNpu.cpp b/torch_npu/csrc/aten/ops/EqKernelNpu.cpp index 3dff364228..90b74193dc 100644 --- a/torch_npu/csrc/aten/ops/EqKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/EqKernelNpu.cpp @@ -95,7 +95,7 @@ namespace at_npu auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = NPUNativeFunctions::empty_with_format( outputSize, formatCastOfSelf.options().dtype(at::kBool), ACL_FORMAT_ND); @@ -113,7 +113,7 @@ namespace at_npu auto outputSize = input_same_output_size(formatCastOfSelf); // construct the output tensor of the NPU - 
at::Tensor result = at::empty_with_format( + at::Tensor result = NPUNativeFunctions::empty_with_format( outputSize, formatCastOfSelf.options().dtype(at::kBool), ACL_FORMAT_ND); @@ -131,7 +131,7 @@ namespace at_npu c10::SmallVector outputs = {self}; CalcuOpUtil::check_memory_over_laps(inputs, outputs); - at::Tensor result = at::empty_with_format( + at::Tensor result = NPUNativeFunctions::empty_with_format( self.sizes(), self.options().dtype(c10::ScalarType::Byte), CalcuOpUtil::get_tensor_npu_format(self)); @@ -159,7 +159,7 @@ namespace at_npu c10::SmallVector outputs = {self}; CalcuOpUtil::check_memory_over_laps(inputs, outputs); - at::Tensor result = at::empty_with_format( + at::Tensor result = NPUNativeFunctions::empty_with_format( self.sizes(), self.options().dtype(c10::ScalarType::Byte), CalcuOpUtil::get_tensor_npu_format(self)); diff --git a/torch_npu/csrc/aten/ops/GtKernelNpu.cpp b/torch_npu/csrc/aten/ops/GtKernelNpu.cpp index 50d4c3fbcd..d27161e162 100644 --- a/torch_npu/csrc/aten/ops/GtKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/GtKernelNpu.cpp @@ -104,7 +104,7 @@ namespace at_npu auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = NPUNativeFunctions::empty_with_format( outputSize, formatCastOfSelf.options().dtype(at::kBool), ACL_FORMAT_ND); @@ -121,7 +121,7 @@ namespace at_npu auto outputSize = input_same_output_size(formatCastOfSelf); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = NPUNativeFunctions::empty_with_format( outputSize, formatCastOfSelf.options().dtype(at::kBool), ACL_FORMAT_ND); @@ -139,7 +139,7 @@ namespace at_npu c10::SmallVector outputs = {self}; CalcuOpUtil::check_memory_over_laps(inputs, outputs); - at::Tensor result = at::empty_with_format( + at::Tensor result = NPUNativeFunctions::empty_with_format( self.sizes(), 
self.options().dtype(at::ScalarType::Byte), CalcuOpUtil::get_tensor_npu_format(self)); @@ -167,7 +167,7 @@ namespace at_npu c10::SmallVector outputs = {self}; CalcuOpUtil::check_memory_over_laps(inputs, outputs); - at::Tensor result = at::empty_with_format( + at::Tensor result = NPUNativeFunctions::empty_with_format( self.sizes(), self.options().dtype(at::ScalarType::Byte), CalcuOpUtil::get_tensor_npu_format(self)); diff --git a/torch_npu/csrc/aten/ops/LtKernelNpu.cpp b/torch_npu/csrc/aten/ops/LtKernelNpu.cpp index 29cf56e55e..7ccd7c64a9 100644 --- a/torch_npu/csrc/aten/ops/LtKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/LtKernelNpu.cpp @@ -100,7 +100,7 @@ namespace at_npu auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = NPUNativeFunctions::empty_with_format( outputSize, formatCastOfSelf.options().dtype(at::kBool)); @@ -116,7 +116,7 @@ namespace at_npu auto outputSize = input_same_output_size(formatCastOfSelf); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = NPUNativeFunctions::empty_with_format( outputSize, formatCastOfSelf.options().dtype(at::kBool)); @@ -133,7 +133,7 @@ namespace at_npu c10::SmallVector outputs = {self}; CalcuOpUtil::check_memory_over_laps(inputs, outputs); - at::Tensor result = at::empty_with_format( + at::Tensor result = NPUNativeFunctions::empty_with_format( self.sizes(), self.options().dtype(at::ScalarType::Byte), CalcuOpUtil::get_tensor_npu_format(self)); @@ -161,7 +161,7 @@ namespace at_npu c10::SmallVector outputs = {self}; CalcuOpUtil::check_memory_over_laps(inputs, outputs); - at::Tensor result = at::empty_with_format( + at::Tensor result = NPUNativeFunctions::empty_with_format( self.sizes(), self.options().dtype(at::ScalarType::Byte), CalcuOpUtil::get_tensor_npu_format(self)); diff --git a/torch_npu/csrc/aten/ops/MeanKernelNpu.cpp 
b/torch_npu/csrc/aten/ops/MeanKernelNpu.cpp index 0f6e2b6875..54a5768348 100644 --- a/torch_npu/csrc/aten/ops/MeanKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/MeanKernelNpu.cpp @@ -142,7 +142,7 @@ namespace at_npu } // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = NPUNativeFunctions::empty_with_format( outputSize, self.options().dtype(dstType), npu_format); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/MmKernelNpu.cpp b/torch_npu/csrc/aten/ops/MmKernelNpu.cpp index e836bee17c..19aa9ccd83 100644 --- a/torch_npu/csrc/aten/ops/MmKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/MmKernelNpu.cpp @@ -183,12 +183,12 @@ Return: if ((self.scalar_type() == at::ScalarType::Half) && !c10::npu::OptionsManager::CheckSwitchMMOutputEnable()) { - result = at::empty_with_format( + result = NPUNativeFunctions::empty_with_format( outputSize, self.options(), ACL_FORMAT_FRACTAL_NZ); } else { - result = at::empty_with_format(outputSize, self.options()); + result = NPUNativeFunctions::empty_with_format(outputSize, self.options()); } // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/MulKernelNpu.cpp b/torch_npu/csrc/aten/ops/MulKernelNpu.cpp index 7e6403e1fc..c945f876a4 100644 --- a/torch_npu/csrc/aten/ops/MulKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/MulKernelNpu.cpp @@ -114,7 +114,7 @@ namespace at_npu auto outputSize = broadcast_ops_npu_output_size(selfCast, otherCast); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = NPUNativeFunctions::empty_with_format( outputSize, outputTensor.options(), CalcuOpUtil::get_tensor_npu_format(outputTensor)); @@ -136,7 +136,7 @@ namespace at_npu auto outputSize = input_same_output_size(self); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = NPUNativeFunctions::empty_with_format( outputSize, self.options(), 
CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/NegKernelNpu.cpp b/torch_npu/csrc/aten/ops/NegKernelNpu.cpp index a04ed1ff90..204a62f337 100644 --- a/torch_npu/csrc/aten/ops/NegKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/NegKernelNpu.cpp @@ -50,7 +50,7 @@ namespace at_npu at::Tensor NPUNativeFunctions::neg(const at::Tensor &self) { // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = NPUNativeFunctions::empty_with_format( self.sizes(), self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/NormalKernelNpu.cpp b/torch_npu/csrc/aten/ops/NormalKernelNpu.cpp index 5135cd4a27..65063a67b8 100644 --- a/torch_npu/csrc/aten/ops/NormalKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/NormalKernelNpu.cpp @@ -182,7 +182,7 @@ namespace at_npu c10::optional pin_memory_opt) { // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = NPUNativeFunctions::empty_with_format( size, dtype_opt, layout_opt, device_opt, pin_memory_opt, ACL_FORMAT_ND); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp b/torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp index 12dd1e7542..a4edb6cad0 100644 --- a/torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp @@ -45,7 +45,7 @@ namespace at_npu auto outputSize = input_same_output_size(self); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format(outputSize, + at::Tensor result = NPUNativeFunctions::empty_with_format(outputSize, dtype_opt, layout_opt, device_opt, diff --git a/torch_npu/csrc/aten/ops/ReluKernelNpu.cpp b/torch_npu/csrc/aten/ops/ReluKernelNpu.cpp index d8ab551733..410f091e60 100644 --- a/torch_npu/csrc/aten/ops/ReluKernelNpu.cpp +++ 
b/torch_npu/csrc/aten/ops/ReluKernelNpu.cpp @@ -65,7 +65,7 @@ namespace at_npu auto outputSize = input_same_output_size(self); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = NPUNativeFunctions::empty_with_format( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp b/torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp index 79783b2f4c..ca6f03c9c7 100644 --- a/torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp @@ -119,7 +119,7 @@ namespace at_npu outputSize[1] = c1_len.toInt() * 16; // construct the output at::Tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = NPUNativeFunctions::empty_with_format( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/SubKernelNpu.cpp b/torch_npu/csrc/aten/ops/SubKernelNpu.cpp index fe90315466..914664cd96 100644 --- a/torch_npu/csrc/aten/ops/SubKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/SubKernelNpu.cpp @@ -103,7 +103,7 @@ namespace at_npu auto outputSize = broadcast_ops_npu_output_size(self, other); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = NPUNativeFunctions::empty_with_format( outputSize, outputTensor.options(), CalcuOpUtil::get_tensor_npu_format(outputTensor)); @@ -120,7 +120,7 @@ namespace at_npu auto outputSize = input_same_output_size(self); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = NPUNativeFunctions::empty_with_format( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/SumKernelNpu.cpp b/torch_npu/csrc/aten/ops/SumKernelNpu.cpp index 
48129f2a6b..92012d2da5 100644 --- a/torch_npu/csrc/aten/ops/SumKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/SumKernelNpu.cpp @@ -197,7 +197,7 @@ namespace at_npu } // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = NPUNativeFunctions::empty_with_format( outputSize, self.options().dtype(dstType), npu_format); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp index 537bcc2444..3a4bdabee3 100644 --- a/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp @@ -62,7 +62,7 @@ namespace at_npu auto outputSize = input_same_output_size(self); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = NPUNativeFunctions::empty_with_format( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // use 5HD in Relu diff --git a/torch_npu/csrc/aten/ops/TopKKernelNpu.cpp b/torch_npu/csrc/aten/ops/TopKKernelNpu.cpp index 0bddf27eae..3f34261d8d 100644 --- a/torch_npu/csrc/aten/ops/TopKKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/TopKKernelNpu.cpp @@ -219,11 +219,11 @@ namespace at_npu // construct the output tensor of the NPU at::Tensor transposeSelf = NPUNativeFunctions::npu_transpose(self, perm); auto outputSize = transpose_npu_output_size(values, perm); - at::Tensor transposeValue = at::empty_with_format( + at::Tensor transposeValue = NPUNativeFunctions::empty_with_format( outputSize, values.options(), CalcuOpUtil::get_tensor_npu_format(values)); - at::Tensor transposeIndices = at::empty_with_format( + at::Tensor transposeIndices = NPUNativeFunctions::empty_with_format( outputSize, indices.options(), CalcuOpUtil::get_tensor_npu_format(indices)); @@ -290,9 +290,9 @@ namespace at_npu // calculate the output size auto outputSize = topk_npu_output_size(selfCp, k, dim, largest, sorted); // 
construct the output tensor of the NPU - at::Tensor values = at::empty_with_format( + at::Tensor values = NPUNativeFunctions::empty_with_format( outputSize, selfCp.options(), CalcuOpUtil::get_tensor_npu_format(selfCp)); - at::Tensor indices = at::empty_with_format( + at::Tensor indices = NPUNativeFunctions::empty_with_format( outputSize, selfCp.options().dtype(at::kInt), ACL_FORMAT_ND); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/TransposeKernelNpu.cpp b/torch_npu/csrc/aten/ops/TransposeKernelNpu.cpp index fcda11d5bc..085c49f0fe 100644 --- a/torch_npu/csrc/aten/ops/TransposeKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/TransposeKernelNpu.cpp @@ -76,7 +76,7 @@ namespace at_npu { RECORD_FUNCTION("transpose_to_contiguous", vector({self})); int64_t self_format = CalcuOpUtil::get_tensor_npu_format(self); - at::Tensor result = at::empty_with_format(self.sizes(), self.options(), self_format); + at::Tensor result = NPUNativeFunctions::empty_with_format(self.sizes(), self.options(), self_format); // obtain the transpose axises at::IntArrayRef dim; diff --git a/torch_npu/csrc/aten/ops/ZerosLikeKernelNpu.cpp b/torch_npu/csrc/aten/ops/ZerosLikeKernelNpu.cpp index 18ec3038d1..2f4775751d 100644 --- a/torch_npu/csrc/aten/ops/ZerosLikeKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/ZerosLikeKernelNpu.cpp @@ -56,12 +56,9 @@ namespace at_npu auto outputSize = input_same_output_size(self); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format(outputSize, - dtype_opt, - layout_opt, - device_opt, - pin_memory_opt, - CalcuOpUtil::get_tensor_npu_format(self)); + at::Tensor result = NPUNativeFunctions::empty_with_format( + outputSize, dtype_opt, layout_opt, device_opt, pin_memory_opt, + CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU return result.zero_(); diff --git a/torch_npu/csrc/aten/ops/convolution/Conv2dBackwardKernelNpu.cpp 
b/torch_npu/csrc/aten/ops/convolution/Conv2dBackwardKernelNpu.cpp index dcddfa5d65..f6eb56eca5 100644 --- a/torch_npu/csrc/aten/ops/convolution/Conv2dBackwardKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/convolution/Conv2dBackwardKernelNpu.cpp @@ -213,7 +213,7 @@ tuple NPUNativeFunctions::npu_conv2d_backwar at::Tensor gradBias; // construct the output tensor of the NPU if (grad_input_mask[0]) { - gradInput = at::empty_with_format( + gradInput = NPUNativeFunctions::empty_with_format( std::get<0>(outputSizes), input.options(), ACL_FORMAT_NC1HWC0); } @@ -221,12 +221,12 @@ tuple NPUNativeFunctions::npu_conv2d_backwar // For group conv2d: keep consistent with weight to avoid allreduce accuracy problem. // For more info: https://gitee.com/ascend/pytorch-develop/pulls/2255 if (groups > 1) { - gradWeight = at::empty_with_format( + gradWeight = NPUNativeFunctions::empty_with_format( std::get<1>(outputSizes), weight.options().dtype(at::kFloat), ACL_FORMAT_NCHW); } else { - gradWeight = at::empty_with_format( + gradWeight = NPUNativeFunctions::empty_with_format( std::get<1>(outputSizes), weight.options().dtype(at::kFloat), ACL_FORMAT_FRACTAL_Z); @@ -234,7 +234,7 @@ tuple NPUNativeFunctions::npu_conv2d_backwar } if (grad_input_mask[2]) { - gradBias = at::empty_with_format( + gradBias = NPUNativeFunctions::empty_with_format( std::get<2>(outputSizes), grad.options(), ACL_FORMAT_NCHW); } diff --git a/torch_npu/csrc/aten/ops/convolution/Conv3dBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/convolution/Conv3dBackwardKernelNpu.cpp index 48be4e0d61..f9ade9488c 100644 --- a/torch_npu/csrc/aten/ops/convolution/Conv3dBackwardKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/convolution/Conv3dBackwardKernelNpu.cpp @@ -107,7 +107,7 @@ tuple NPUNativeFunctions::npu_conv3d_backwar if (grad_input_mask[0]) { // format should be NDC1HWC0 - gradInput = at::empty_with_format( + gradInput = NPUNativeFunctions::empty_with_format( input.sizes(), input.options(), ACL_FORMAT_NDC1HWC0); 
conv3d_backward_inputmask( @@ -116,7 +116,7 @@ tuple NPUNativeFunctions::npu_conv3d_backwar if (grad_input_mask[1]) { // format should be FRACTAL_Z_3D - gradWeight = at::empty_with_format( + gradWeight = NPUNativeFunctions::empty_with_format( weight.sizes(), weight.options().dtype(at::kFloat), ACL_FRACTAL_Z_3D); conv3d_backward_weightmask( @@ -125,7 +125,7 @@ tuple NPUNativeFunctions::npu_conv3d_backwar if (grad_input_mask[2]) { // format should be NCHW, gradias.size = grad.size(1) - gradBias = at::empty_with_format( + gradBias = NPUNativeFunctions::empty_with_format( {grad.size(1)}, grad.options(), ACL_FORMAT_NCHW); conv3d_backward_biasmask( diff --git a/torch_npu/csrc/aten/ops/convolution/ConvTranspose2dKernelNpu.cpp b/torch_npu/csrc/aten/ops/convolution/ConvTranspose2dKernelNpu.cpp index c6e2f78378..8e60437207 100644 --- a/torch_npu/csrc/aten/ops/convolution/ConvTranspose2dKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/convolution/ConvTranspose2dKernelNpu.cpp @@ -81,7 +81,7 @@ at::Tensor NPUNativeFunctions::npu_conv_transpose2d( // construct the output tensor of the NPU at::Tensor result = - at::empty_with_format(outputSize, input.options(), ACL_FORMAT_NC1HWC0); + NPUNativeFunctions::empty_with_format(outputSize, input.options(), ACL_FORMAT_NC1HWC0); // calculate the output result of the NPU conv_transpose2d_out_npu( diff --git a/torch_npu/csrc/aten/ops/loss/NLLLossBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/loss/NLLLossBackwardKernelNpu.cpp index 5e31d05824..e49f4ba62e 100644 --- a/torch_npu/csrc/aten/ops/loss/NLLLossBackwardKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/loss/NLLLossBackwardKernelNpu.cpp @@ -91,7 +91,7 @@ at::Tensor NPUNativeFunctions::nll_loss_backward( auto outputSize = input_same_output_size(self); // construct the output tensor of the NPU - at::Tensor grad_input = at::empty_with_format( + at::Tensor grad_input = NPUNativeFunctions::empty_with_format( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // calculate the 
output result of the NPU diff --git a/torch_npu/csrc/aten/ops/loss/NLLLossKernelNpu.cpp b/torch_npu/csrc/aten/ops/loss/NLLLossKernelNpu.cpp index cfda03265f..418b2b296a 100644 --- a/torch_npu/csrc/aten/ops/loss/NLLLossKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/loss/NLLLossKernelNpu.cpp @@ -96,11 +96,11 @@ tuple NPUNativeFunctions::nll_loss_forward( outputSize, totalWeightSize); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = NPUNativeFunctions::empty_with_format( std::get<0>(outputSizes), self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - at::Tensor total_weight = at::empty_with_format( + at::Tensor total_weight = NPUNativeFunctions::empty_with_format( std::get<1>(outputSizes), self.options(), CalcuOpUtil::get_tensor_npu_format(self)); diff --git a/torch_npu/csrc/distributed/reducer.cpp b/torch_npu/csrc/distributed/reducer.cpp index b31bd008d0..5c2b3b4ab7 100644 --- a/torch_npu/csrc/distributed/reducer.cpp +++ b/torch_npu/csrc/distributed/reducer.cpp @@ -31,6 +31,7 @@ #include #include "torch_npu/csrc/distributed/reducer.hpp" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" namespace c10d_npu { namespace { @@ -1072,9 +1073,9 @@ void Reducer::copy_bucket_to_grad( if (!grad.defined()) { // Creates grad according to the "Gradient Layout Contract" // (see torch/csrc/grad/AccumulateGrad.h) - grad = at::empty_with_format(variable.sizes(), - bucket_view.options(), - variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_); + grad = NPUNativeFunctions::empty_with_format( + variable.sizes(), bucket_view.options(), + variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_); grad.copy_memory_(bucket_view, true); } else { grad.copy_memory_(bucket_view, true); diff --git a/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp b/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp index 0854e27c09..9f918437bb 100644 --- a/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp +++ 
b/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp @@ -109,7 +109,7 @@ namespace at_npu const at::Tensor &src, const std::vector &optimizations) { - auto self = at::empty_with_format( + auto self = NPUNativeFunctions::empty_with_format( src.sizes(), src.options(), src.storage().get_npu_desc().npu_format_); diff --git a/torch_npu/csrc/framework/contiguous/combined_opt.cpp b/torch_npu/csrc/framework/contiguous/combined_opt.cpp index cf270817f8..a0228c78ca 100644 --- a/torch_npu/csrc/framework/contiguous/combined_opt.cpp +++ b/torch_npu/csrc/framework/contiguous/combined_opt.cpp @@ -16,8 +16,10 @@ #include #include #include + #include "torch_npu/csrc/framework/contiguous/ContiguousOpt.h" #include "torch_npu/csrc/framework/utils/KernelNpuOutputSize.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" namespace at_npu { @@ -498,7 +500,7 @@ namespace at_npu { // case 2: The first tensor is discontiguous-type, // conduct the standard optimization procedure. - auto contiguous_src = at::empty_with_format( + auto contiguous_src = NPUNativeFunctions::empty_with_format( src.sizes(), src.options(), src.storage().get_npu_desc().npu_format_); diff --git a/torch_npu/csrc/framework/utils/NpuUtils.cpp b/torch_npu/csrc/framework/utils/NpuUtils.cpp index a0efe852f7..b1c3a10a77 100644 --- a/torch_npu/csrc/framework/utils/NpuUtils.cpp +++ b/torch_npu/csrc/framework/utils/NpuUtils.cpp @@ -16,7 +16,6 @@ #include #include -#include #include "torch_npu/csrc/framework/utils/NpuUtils.h" #include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" @@ -25,6 +24,7 @@ #include "torch_npu/csrc/framework/utils/KernelNpuOutputSize.h" #include "torch_npu/csrc/framework/contiguous/ContiguousOpt.h" #include "torch_npu/csrc/framework/interface/EnvVariables.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" namespace at_npu { @@ -165,7 +165,7 @@ namespace at_npu // 3. 
get output size auto outputSize = index_select_npu_output_size(src_tmp, dim, index); int64_t npu_format = CalcuOpUtil::get_tensor_npu_format(src_tmp); - at::Tensor result = at::empty_with_format(outputSize, src_tmp.options(), npu_format); + at::Tensor result = NPUNativeFunctions::empty_with_format(outputSize, src_tmp.options(), npu_format); // std::cout << "npu_format: " << npu_format << std::endl; // 4. get input and output @@ -208,7 +208,7 @@ namespace at_npu at::Tensor deal_with_5d_5d_match(const at::Tensor &src) { auto src_desc = src.storage().unsafeGetStorageImpl()->npu_desc_; - at::Tensor src_new = at::empty_with_format(src_desc.base_sizes_, src.options(), ACL_FORMAT_NC1HWC0); + at::Tensor src_new = NPUNativeFunctions::empty_with_format(src_desc.base_sizes_, src.options(), ACL_FORMAT_NC1HWC0); c10::npu::NPUStream copy_stream = c10::npu::getCurrentNPUStream(); int64_t numel = src_new.numel(); aclError error = aclrtMemcpyAsync( diff --git a/torch_npu/csrc/framework/utils/OpPreparation.cpp b/torch_npu/csrc/framework/utils/OpPreparation.cpp index bf70fb7277..be278c096f 100644 --- a/torch_npu/csrc/framework/utils/OpPreparation.cpp +++ b/torch_npu/csrc/framework/utils/OpPreparation.cpp @@ -17,6 +17,8 @@ #include "torch_npu/csrc/framework/FormatHelper.h" #include "torch_npu/csrc/framework/InferFormat.h" #include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + namespace at_npu { @@ -234,13 +236,13 @@ namespace at_npu at::Tensor OpPreparation::ApplyTensorWithFormat(c10::IntArrayRef sizes, const c10::TensorOptions &options, int64_t format) { auto fixFormat = InferFormat::GuessStorageFormat(sizes, (aclFormat)format); - return at::empty_with_format(sizes, options, fixFormat); + return NPUNativeFunctions::empty_with_format(sizes, options, fixFormat); } at::Tensor OpPreparation::ApplyTensorWithSizes(c10::IntArrayRef sizes, const c10::TensorOptions &options) { auto format = InferFormat::GuessBaseFormat(sizes); - return 
at::empty_with_format(sizes, options, format); + return NPUNativeFunctions::empty_with_format(sizes, options, format); } void OpPreparation::CheckMemory(const std::initializer_list &inputs, const std::initializer_list &outputs) -- Gitee From 6c29a7d5c7eaa021e9c223c7dc3e75d50589fcf9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=A4=E5=AE=89=E5=8D=87?= Date: Sat, 12 Feb 2022 11:37:56 +0800 Subject: [PATCH 03/12] Update npu_format_cast. --- torch_npu/csrc/aten/common/CopyKernel.cpp | 6 +++--- torch_npu/csrc/aten/common/NpuFastReshape.cpp | 2 +- torch_npu/csrc/aten/common/ResizeNpu.cpp | 2 +- torch_npu/csrc/aten/ops/NormalKernelNpu.cpp | 2 +- torch_npu/csrc/aten/ops/SoftmaxBackwardKernelNpu.cpp | 2 +- torch_npu/csrc/distributed/Init.cpp | 2 +- torch_npu/csrc/distributed/reducer.cpp | 2 +- torch_npu/csrc/framework/utils/NpuUtils.cpp | 2 +- torch_npu/csrc/framework/utils/OpPreparation.cpp | 4 ++-- torch_npu/testing/util_test.py | 5 +++-- 10 files changed, 15 insertions(+), 14 deletions(-) diff --git a/torch_npu/csrc/aten/common/CopyKernel.cpp b/torch_npu/csrc/aten/common/CopyKernel.cpp index 5025e3accd..bb983a63d3 100644 --- a/torch_npu/csrc/aten/common/CopyKernel.cpp +++ b/torch_npu/csrc/aten/common/CopyKernel.cpp @@ -188,7 +188,7 @@ void copy_d2d_dtype_format(at::Tensor& self, const at::Tensor& src, bool non_blo at::Tensor src_4D = FormatCastHelper::ApplyBaseFormatTensorBy(src); at::Tensor dst_4D = FormatCastHelper::ApplyBaseFormatTensorBy(self); copy_d2d_dtype_baseformat(dst_4D, src_4D, non_blocking); - self.npu_format_cast_(dst_4D); + NPUNativeFunctions::npu_format_cast_(self, dst_4D); return; } copy_d2d_dtype_baseformat(self, src, non_blocking); @@ -312,7 +312,7 @@ void copy_h2d(at::Tensor& self, const at::Tensor& src, bool non_blocking) { if (!FormatHelper::IsBaseFormatType(self)) { at::Tensor dst = OpPreparation::ApplyTensor(self); copy_h2d_baseformat(dst, src, non_blocking, true); - self.npu_format_cast_(dst); + NPUNativeFunctions::npu_format_cast_(dst); return; 
} copy_h2d_baseformat(self, src, non_blocking); @@ -363,7 +363,7 @@ void copy_d2d_dtype(at::Tensor& self, const at::Tensor& src, bool non_blocking) } at::Tensor dst_4D = FormatCastHelper::ApplyBaseFormatTensorBy(self); copy_d2d_dtype_baseformat(dst_4D, src_4D, non_blocking); - self.npu_format_cast_(dst_4D); + NPUNativeFunctions::npu_format_cast_(dst_4D); return; } copy_d2d_dtype_format(self, src, non_blocking); diff --git a/torch_npu/csrc/aten/common/NpuFastReshape.cpp b/torch_npu/csrc/aten/common/NpuFastReshape.cpp index e81d4f0c29..9fc817bfe3 100644 --- a/torch_npu/csrc/aten/common/NpuFastReshape.cpp +++ b/torch_npu/csrc/aten/common/NpuFastReshape.cpp @@ -45,7 +45,7 @@ void npu_fast_reshape_(at::Tensor& tensor) { // refresh matadata to input tensor StorageDescHelper::ReflushDescBySelf(tensor); auto base_format = InferFormat::GuessBaseFormat(tensor.sizes()); - tensor.npu_format_cast_(base_format); + NPUNativeFunctions::npu_format_cast_(tensor, base_format); } } // namespace native } // namespace at_npu diff --git a/torch_npu/csrc/aten/common/ResizeNpu.cpp b/torch_npu/csrc/aten/common/ResizeNpu.cpp index e05736bf2e..35faadb6f2 100644 --- a/torch_npu/csrc/aten/common/ResizeNpu.cpp +++ b/torch_npu/csrc/aten/common/ResizeNpu.cpp @@ -31,7 +31,7 @@ at::Tensor& NPUNativeFunctions::resize_( // because of resize _impl_npu_ only support at base format, so // no need to reflush NpuStorageDesc here. 
if (!FormatHelper::IsBaseFormatType(self)) { - self.npu_format_cast_(FormatHelper::GetBaseFormat(self)); + NPUNativeFunctions::npu_format_cast_(self, FormatHelper::GetBaseFormat(self)); } auto* self_ = self.unsafeGetTensorImpl(); resize_impl_npu_(self_, size, /*strides=*/c10::nullopt); diff --git a/torch_npu/csrc/aten/ops/NormalKernelNpu.cpp b/torch_npu/csrc/aten/ops/NormalKernelNpu.cpp index 65063a67b8..e8bfa33690 100644 --- a/torch_npu/csrc/aten/ops/NormalKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/NormalKernelNpu.cpp @@ -118,7 +118,7 @@ namespace at_npu TORCH_CHECK(std > 0.0, "normal_ expects std > 0.0, but found std=", std); // the op of PTNormalFloatFloat only support format of ND - at::Tensor formatCastOfResult = result.npu_format_cast(ACL_FORMAT_ND); + at::Tensor formatCastOfResult = NPUNativeFunctions::npu_format_cast(result, ACL_FORMAT_ND); if (formatCastOfResult.scalar_type() == at::ScalarType::Half) { formatCastOfResult = formatCastOfResult.to(at::ScalarType::Float); diff --git a/torch_npu/csrc/aten/ops/SoftmaxBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/SoftmaxBackwardKernelNpu.cpp index cdbd32be1d..968e2a6419 100644 --- a/torch_npu/csrc/aten/ops/SoftmaxBackwardKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/SoftmaxBackwardKernelNpu.cpp @@ -55,7 +55,7 @@ namespace at_npu // output'format must be same with grad_output if (CalcuOpUtil::get_tensor_npu_format(output) != CalcuOpUtil::get_tensor_npu_format(grad_output)) { - output.npu_format_cast_(CalcuOpUtil::get_tensor_npu_format(grad_output)); + NPUNativeFunctions::npu_format_cast_(output, CalcuOpUtil::get_tensor_npu_format(grad_output)); } // construct the output tensor of the NPU diff --git a/torch_npu/csrc/distributed/Init.cpp b/torch_npu/csrc/distributed/Init.cpp index 951ba59607..621ce5962a 100644 --- a/torch_npu/csrc/distributed/Init.cpp +++ b/torch_npu/csrc/distributed/Init.cpp @@ -48,7 +48,7 @@ class BroadcastWork { public: inline std::vector cast_tensors(at::TensorList tensors) { static auto 
cast_back_to_ori_format = [](const at::Tensor &t) { - return t.npu_format_cast(t.storage().unsafeGetStorageImpl()->npu_desc_.origin_format_); + return NPUNativeFunctions::npu_format_cast(t, t.storage().unsafeGetStorageImpl()->npu_desc_.origin_format_); }; return c10::fmap(tensors, cast_back_to_ori_format); } diff --git a/torch_npu/csrc/distributed/reducer.cpp b/torch_npu/csrc/distributed/reducer.cpp index 5c2b3b4ab7..f9fe523862 100644 --- a/torch_npu/csrc/distributed/reducer.cpp +++ b/torch_npu/csrc/distributed/reducer.cpp @@ -442,7 +442,7 @@ void Reducer::mark_variable_ready_dense(VariableIndex index) { // make sure grad has the same format as variable if (grad.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_ != variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_) { - grad = grad.npu_format_cast( + grad = NPUNativeFunctions::npu_format_cast(grad, variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_); } this->copy_grad_to_bucket(grad, bucket_view); diff --git a/torch_npu/csrc/framework/utils/NpuUtils.cpp b/torch_npu/csrc/framework/utils/NpuUtils.cpp index b1c3a10a77..f10640a1ec 100644 --- a/torch_npu/csrc/framework/utils/NpuUtils.cpp +++ b/torch_npu/csrc/framework/utils/NpuUtils.cpp @@ -252,7 +252,7 @@ namespace at_npu // a temporary tensor, which always monopolizes its own storage. 
if (numelEq && (!FormatHelper::IsBaseFormatType(src))) { - at::Tensor tempTensor = at::npu_format_cast(src, FormatHelper::GetBaseFormat(src)); + at::Tensor tempTensor = NPUNativeFunctions::npu_format_cast(src, FormatHelper::GetBaseFormat(src)); auto &temp_desc = tempTensor.storage().unsafeGetStorageImpl()->npu_desc_; temp_desc.base_sizes_ = tempTensor.sizes(); diff --git a/torch_npu/csrc/framework/utils/OpPreparation.cpp b/torch_npu/csrc/framework/utils/OpPreparation.cpp index be278c096f..1001fa0167 100644 --- a/torch_npu/csrc/framework/utils/OpPreparation.cpp +++ b/torch_npu/csrc/framework/utils/OpPreparation.cpp @@ -178,13 +178,13 @@ namespace at_npu if (output.scalar_type() == at::ScalarType::Float || output.scalar_type() == at::ScalarType::Half) { TORCH_CHECK(!is_read_write, "can not cast format when output is input"); - output.npu_format_cast_(format); + NPUNativeFunctions::npu_format_cast_(output, format); } else { TORCH_CHECK(FormatHelper::IsBaseFormatType(output) && FormatHelper::IsBaseFormatType(static_cast(format)), "can not cast format to un-base format when output has bool dtype"); - output.npu_format_cast_(format); + NPUNativeFunctions::npu_format_cast_(output, format); } } } diff --git a/torch_npu/testing/util_test.py b/torch_npu/testing/util_test.py index a460af4ae3..835814c30d 100644 --- a/torch_npu/testing/util_test.py +++ b/torch_npu/testing/util_test.py @@ -15,6 +15,7 @@ # limitations under the License. 
import torch +import torch_npu import numpy as np import os @@ -42,7 +43,7 @@ def create_common_tensor(item, minValue, maxValue): cpu_input = torch.from_numpy(input1) npu_input = torch.from_numpy(input1).to(npu_device) if npu_format != -1: - npu_input = npu_input.npu_format_cast(npu_format) + npu_input = torch_npu.npu_format_cast(npu_input, npu_format) return cpu_input, npu_input @@ -125,5 +126,5 @@ def create_dtype_tensor(shape, dtype, npu_format=-1, min_value=-5, max_value=5, cpu_input = torch.from_numpy(x) npu_input = torch.from_numpy(x).to(npu_device) if npu_format != -1 and (dtype in [torch.float, torch.half]): - npu_input = npu_input.npu_format_cast(npu_format) + npu_input = torch_npu.npu_format_cast(npu_input, npu_format) return cpu_input, npu_input \ No newline at end of file -- Gitee From 88ca0d29ab0e2f0506507e56e0076a07e1c4e0d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=A4=E5=AE=89=E5=8D=87?= Date: Sat, 12 Feb 2022 11:41:16 +0800 Subject: [PATCH 04/12] Update copy_memory_. 
--- torch_npu/csrc/aten/common/FormatCastHelper.cpp | 2 +- torch_npu/csrc/distributed/reducer.cpp | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/torch_npu/csrc/aten/common/FormatCastHelper.cpp b/torch_npu/csrc/aten/common/FormatCastHelper.cpp index aa98978e8a..2ee080d215 100644 --- a/torch_npu/csrc/aten/common/FormatCastHelper.cpp +++ b/torch_npu/csrc/aten/common/FormatCastHelper.cpp @@ -28,7 +28,7 @@ bool FormatCastHelper::IsSameGroupType(const at::Tensor& src, const at::Tensor& void FormatCastHelper::base_format_cast_nocheck(const at::Tensor& dst, const at::Tensor& src) { dst.set_(dst.storage(), src.storage_offset(), src.sizes(), src.strides()); - dst.copy_memory_(src, true); + NPUNativeFunctions::copy_memory_(dst, src, true); } void FormatCastHelper::format_cast_as_base_format(const at::Tensor& src, aclFormat format) { diff --git a/torch_npu/csrc/distributed/reducer.cpp b/torch_npu/csrc/distributed/reducer.cpp index f9fe523862..4a29c1ce45 100644 --- a/torch_npu/csrc/distributed/reducer.cpp +++ b/torch_npu/csrc/distributed/reducer.cpp @@ -407,9 +407,9 @@ void Reducer::copy_grad_to_bucket( if (comm_hook_ == nullptr) { // imitates wrapped_scalar_tensor in ATen/native/BinaryOps.cpp // Divides while copying into the bucket view. - bucket_view.copy_memory_(grad.mul(float(1.) / divFactor_), true); + NPUNativeFunctions::copy_memory_(bucket_view, grad.mul(float(1.) 
/ divFactor_), true); } else { - bucket_view.copy_memory_(grad, true); + NPUNativeFunctions::copy_memory_(bucket_view, grad, true); } } @@ -1076,9 +1076,9 @@ void Reducer::copy_bucket_to_grad( grad = NPUNativeFunctions::empty_with_format( variable.sizes(), bucket_view.options(), variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_); - grad.copy_memory_(bucket_view, true); + NPUNativeFunctions::copy_memory_(grad, bucket_view, true); } else { - grad.copy_memory_(bucket_view, true); + NPUNativeFunctions::copy_memory_(grad, bucket_view, true); } // The grad is modified and needs to be written back. return true; -- Gitee From 4e1d35f0abc69814d3537647dc6b4285e7406df6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=A4=E5=AE=89=E5=8D=87?= Date: Sat, 12 Feb 2022 15:57:44 +0800 Subject: [PATCH 05/12] Replace empty_with_format with ApplyTensor. --- .../PyTorch Operator Development Guide.md | 5 +++-- ...\274\200\345\217\221\346\214\207\345\215\227.md" | 4 ++-- torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp | 4 ++-- torch_npu/csrc/aten/common/TensorFactories.cpp | 10 +++++----- torch_npu/csrc/aten/ops/AddKernelNpu.cpp | 6 +++--- torch_npu/csrc/aten/ops/AnyKernelNpu.cpp | 4 ++-- torch_npu/csrc/aten/ops/BmmKernelNpu.cpp | 4 ++-- torch_npu/csrc/aten/ops/BroadcastKernelNpu.cpp | 2 +- torch_npu/csrc/aten/ops/DivKernelNpu.cpp | 4 ++-- torch_npu/csrc/aten/ops/EqKernelNpu.cpp | 8 ++++---- torch_npu/csrc/aten/ops/GtKernelNpu.cpp | 8 ++++---- torch_npu/csrc/aten/ops/LtKernelNpu.cpp | 8 ++++---- torch_npu/csrc/aten/ops/MeanKernelNpu.cpp | 2 +- torch_npu/csrc/aten/ops/MmKernelNpu.cpp | 4 ++-- torch_npu/csrc/aten/ops/MulKernelNpu.cpp | 4 ++-- torch_npu/csrc/aten/ops/NegKernelNpu.cpp | 2 +- torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp | 9 +++------ torch_npu/csrc/aten/ops/ReluKernelNpu.cpp | 2 +- torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp | 2 +- torch_npu/csrc/aten/ops/SubKernelNpu.cpp | 4 ++-- torch_npu/csrc/aten/ops/SumKernelNpu.cpp | 2 +- 
.../csrc/aten/ops/ThresholdBackwardKernelNpu.cpp | 2 +- torch_npu/csrc/aten/ops/TopKKernelNpu.cpp | 8 ++++---- torch_npu/csrc/aten/ops/TransposeKernelNpu.cpp | 2 +- .../ops/convolution/Conv2dBackwardKernelNpu.cpp | 8 ++++---- .../ops/convolution/Conv3dBackwardKernelNpu.cpp | 6 +++--- .../ops/convolution/ConvTranspose2dKernelNpu.cpp | 2 +- .../csrc/aten/ops/loss/NLLLossBackwardKernelNpu.cpp | 2 +- torch_npu/csrc/aten/ops/loss/NLLLossKernelNpu.cpp | 4 ++-- torch_npu/csrc/distributed/Init.cpp | 3 ++- torch_npu/csrc/distributed/reducer.cpp | 13 +++++++------ .../csrc/framework/contiguous/ContiguousOpt.cpp | 3 ++- .../csrc/framework/contiguous/combined_opt.cpp | 6 +++--- torch_npu/csrc/framework/utils/NpuUtils.cpp | 5 +++-- torch_npu/csrc/framework/utils/OpPreparation.cpp | 8 ++++++-- 35 files changed, 88 insertions(+), 82 deletions(-) diff --git a/docs/en/PyTorch Operator Development Guide/PyTorch Operator Development Guide.md b/docs/en/PyTorch Operator Development Guide/PyTorch Operator Development Guide.md index 698b2ac1c7..9ca63313af 100644 --- a/docs/en/PyTorch Operator Development Guide/PyTorch Operator Development Guide.md +++ b/docs/en/PyTorch Operator Development Guide/PyTorch Operator Development Guide.md @@ -524,7 +524,8 @@ The following uses the torch.add\(\) operator as an example to describe how to a auto outputSize = broadcast_ops_npu_output_size(self, other); // construct the output tensor of the NPU - Tensor result = NPUNativeFunctions::empty_with_format( + at::Tensor result = (self, outputSize, npu_format); + Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, outputTensor.options(), CalcuOpUtil::get_tensor_npu_format(outputTensor)); @@ -541,7 +542,7 @@ The following uses the torch.add\(\) operator as an example to describe how to a // calculate the output size auto outputSize = input_same_output_size(self); // construct the output tensor of the NPU - Tensor result = NPUNativeFunctions::empty_with_format( + Tensor result = 
OpPreparation::ApplyTensorWithFormat( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU diff --git "a/docs/zh/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227.md" "b/docs/zh/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227.md" index d38aa6efd3..e2f6a2c9fa 100644 --- "a/docs/zh/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227.md" +++ "b/docs/zh/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227.md" @@ -524,7 +524,7 @@ PyTorch算子开发包含TBE算子开发和PyTorch框架下的算子适配。 auto outputSize = broadcast_ops_npu_output_size(self, other); // construct the output tensor of the NPU - Tensor result = NPUNativeFunctions::empty_with_format( + Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, outputTensor.options(), CalcuOpUtil::get_tensor_npu_format(outputTensor)); @@ -541,7 +541,7 @@ PyTorch算子开发包含TBE算子开发和PyTorch框架下的算子适配。 // calculate the output size auto outputSize = input_same_output_size(self); // construct the output tensor of the NPU - Tensor result = NPUNativeFunctions::empty_with_format( + Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp b/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp index 506686f3c6..c518156b0a 100644 --- a/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp +++ b/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp @@ -76,7 +76,7 @@ at::Tensor 
NPUNativeFunctions::npu_format_cast( TORCH_CHECK(src.scalar_type() == at::ScalarType::Float || src.scalar_type() == at::ScalarType::Half, "can not cast format when src is not float32 or float16"); - at::Tensor dst = NPUNativeFunctions::empty_with_format( + at::Tensor dst = OpPreparation::ApplyTensorWithFormat( src_desc.base_sizes_, src.options(), acl_format); // calculate the output result of the NPU @@ -105,7 +105,7 @@ at::Tensor& NPUNativeFunctions::npu_format_cast_( TORCH_CHECK(src.scalar_type() == at::ScalarType::Float || src.scalar_type() == at::ScalarType::Half, "can not cast format when src is not float32 or float16"); - at::Tensor dst = NPUNativeFunctions::empty_with_format( + at::Tensor dst = OpPreparation::ApplyTensorWithFormat( src_desc.base_sizes_, src.options(), acl_format); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/common/TensorFactories.cpp b/torch_npu/csrc/aten/common/TensorFactories.cpp index fdb4634709..ac32cb49bb 100644 --- a/torch_npu/csrc/aten/common/TensorFactories.cpp +++ b/torch_npu/csrc/aten/common/TensorFactories.cpp @@ -231,7 +231,7 @@ namespace at_npu { auto npu_format = self.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_; - result = NPUNativeFunctions::empty_with_format(self.sizes(), self.options(), npu_format); + result = OpPreparation::ApplyTensorWithFormat(self.sizes(), self.options(), npu_format); } } @@ -348,7 +348,7 @@ namespace at_npu options.layout(layout_opt); options.pinned_memory(pin_memory_opt); at::Tensor result = - NPUNativeFunctions::empty_with_format(size, options, dst_format); + OpPreparation::ApplyTensorWithFormat(size, options, dst_format); if (names.has_value()) { internal_set_names_inplace(result, names); @@ -362,7 +362,7 @@ namespace at_npu int64_t dst_format) { at::Tensor result = - NPUNativeFunctions::empty_with_format(size, options, dst_format); + OpPreparation::ApplyTensorWithFormat(size, options, dst_format); if (names.has_value()) { internal_set_names_inplace(result, 
names); @@ -377,7 +377,7 @@ namespace at_npu int64_t dst_format) { at::Tensor result = - NPUNativeFunctions::empty_with_format(size, options, dst_format); + OpPreparation::ApplyTensorWithFormat(size, options, dst_format); if (names.has_value()) { internal_set_names_inplace(result, names); @@ -628,7 +628,7 @@ namespace at_npu AT_ASSERT(result.is_contiguous()); AT_DISPATCH_ALL_TYPES_AND_COMPLEX(result.scalar_type(), "tensor_npu", [&] { std::copy( - values.begin(), values.end(), result.template data_ptr()); }); + values.begin(), values.end(), result.template data_ptr()); }); return result; } diff --git a/torch_npu/csrc/aten/ops/AddKernelNpu.cpp b/torch_npu/csrc/aten/ops/AddKernelNpu.cpp index 08793a06e7..4d4fa126fb 100644 --- a/torch_npu/csrc/aten/ops/AddKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/AddKernelNpu.cpp @@ -170,7 +170,7 @@ namespace at_npu else { c10::NPUStorageDesc src_desc = src.storage().unsafeGetStorageImpl()->npu_desc_; - at::Tensor src_new = NPUNativeFunctions::empty_with_format( + at::Tensor src_new = OpPreparation::ApplyTensorWithFormat( src_desc.base_sizes_, src.options(), ACL_FORMAT_NC1HWC0); src_new.set_( src.storage(), @@ -206,7 +206,7 @@ namespace at_npu auto outputSize = broadcast_ops_npu_output_size(self, other); // construct the output tensor of the NPU - at::Tensor result = NPUNativeFunctions::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, outputTensor.options(), CalcuOpUtil::get_tensor_npu_format(outputTensor)); @@ -223,7 +223,7 @@ namespace at_npu // calculate the output size auto outputSize = input_same_output_size(self); // construct the output tensor of the NPU - at::Tensor result = NPUNativeFunctions::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/AnyKernelNpu.cpp b/torch_npu/csrc/aten/ops/AnyKernelNpu.cpp index 
63b1271f93..c7a287bdab 100644 --- a/torch_npu/csrc/aten/ops/AnyKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/AnyKernelNpu.cpp @@ -70,7 +70,7 @@ at::Tensor NPUNativeFunctions::any(const at::Tensor& self, int64_t dim, bool kee auto outputSize = reduce_ops_npu_output_size(self, dims, keepdim); // construct the output tensor of the NPU - at::Tensor result = NPUNativeFunctions::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU @@ -88,7 +88,7 @@ at::Tensor NPUNativeFunctions::any(const at::Tensor& self) { // when self's dim = 0, convert [1] tensor and reduce it if (self.dim() == 0) { at::Tensor self_tmp = self; - self_tmp = NPUNativeFunctions::empty_with_format( + self_tmp = OpPreparation::ApplyTensorWithFormat( {1}, self.options().dtype(at::ScalarType::Float), CalcuOpUtil::get_tensor_npu_format(self)) diff --git a/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp b/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp index c93fbe3cc8..d1219aec6a 100644 --- a/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp @@ -73,9 +73,9 @@ at::Tensor NPUNativeFunctions::bmm(const at::Tensor& self, const at::Tensor& mat // 检查是否指定mm输出为NCHW。待NLP模型总体策略制定后删去 if ((self.scalar_type() == at::ScalarType::Float || self.scalar_type() == at::ScalarType::Half) && !c10::npu::OptionsManager::CheckSwitchMMOutputEnable()) { - result = NPUNativeFunctions::empty_with_format(outputSize, self.options(), ACL_FORMAT_FRACTAL_NZ); + result = OpPreparation::ApplyTensorWithFormat(outputSize, self.options(), ACL_FORMAT_FRACTAL_NZ); } else { - result = NPUNativeFunctions::empty_with_format(outputSize, self.options(), ACL_FORMAT_ND); + result = OpPreparation::ApplyTensorWithFormat(outputSize, self.options(), ACL_FORMAT_ND); } // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/BroadcastKernelNpu.cpp 
b/torch_npu/csrc/aten/ops/BroadcastKernelNpu.cpp index 5903959bab..f63a18da30 100644 --- a/torch_npu/csrc/aten/ops/BroadcastKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/BroadcastKernelNpu.cpp @@ -53,7 +53,7 @@ namespace at_npu input = input.to(at::kInt); } - at::Tensor result = NPUNativeFunctions::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( size, input.options(), CalcuOpUtil::get_tensor_npu_format(self)); diff --git a/torch_npu/csrc/aten/ops/DivKernelNpu.cpp b/torch_npu/csrc/aten/ops/DivKernelNpu.cpp index 76865efccb..b7ca932113 100644 --- a/torch_npu/csrc/aten/ops/DivKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/DivKernelNpu.cpp @@ -85,7 +85,7 @@ namespace at_npu auto outputSize = broadcast_ops_npu_output_size(self, other); // construct the output tensor of the NPU - at::Tensor result = NPUNativeFunctions::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, outputTensor.options(), CalcuOpUtil::get_tensor_npu_format(outputTensor)); @@ -102,7 +102,7 @@ namespace at_npu auto outputSize = input_same_output_size(self); // construct the output tensor of the NPU - at::Tensor result = NPUNativeFunctions::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); diff --git a/torch_npu/csrc/aten/ops/EqKernelNpu.cpp b/torch_npu/csrc/aten/ops/EqKernelNpu.cpp index 90b74193dc..fa27bc376a 100644 --- a/torch_npu/csrc/aten/ops/EqKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/EqKernelNpu.cpp @@ -95,7 +95,7 @@ namespace at_npu auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther); // construct the output tensor of the NPU - at::Tensor result = NPUNativeFunctions::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, formatCastOfSelf.options().dtype(at::kBool), ACL_FORMAT_ND); @@ -113,7 +113,7 @@ namespace at_npu auto outputSize = 
input_same_output_size(formatCastOfSelf); // construct the output tensor of the NPU - at::Tensor result = NPUNativeFunctions::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, formatCastOfSelf.options().dtype(at::kBool), ACL_FORMAT_ND); @@ -131,7 +131,7 @@ namespace at_npu c10::SmallVector outputs = {self}; CalcuOpUtil::check_memory_over_laps(inputs, outputs); - at::Tensor result = NPUNativeFunctions::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( self.sizes(), self.options().dtype(c10::ScalarType::Byte), CalcuOpUtil::get_tensor_npu_format(self)); @@ -159,7 +159,7 @@ namespace at_npu c10::SmallVector outputs = {self}; CalcuOpUtil::check_memory_over_laps(inputs, outputs); - at::Tensor result = NPUNativeFunctions::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( self.sizes(), self.options().dtype(c10::ScalarType::Byte), CalcuOpUtil::get_tensor_npu_format(self)); diff --git a/torch_npu/csrc/aten/ops/GtKernelNpu.cpp b/torch_npu/csrc/aten/ops/GtKernelNpu.cpp index d27161e162..4b453091b6 100644 --- a/torch_npu/csrc/aten/ops/GtKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/GtKernelNpu.cpp @@ -104,7 +104,7 @@ namespace at_npu auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther); // construct the output tensor of the NPU - at::Tensor result = NPUNativeFunctions::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, formatCastOfSelf.options().dtype(at::kBool), ACL_FORMAT_ND); @@ -121,7 +121,7 @@ namespace at_npu auto outputSize = input_same_output_size(formatCastOfSelf); // construct the output tensor of the NPU - at::Tensor result = NPUNativeFunctions::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, formatCastOfSelf.options().dtype(at::kBool), ACL_FORMAT_ND); @@ -139,7 +139,7 @@ namespace at_npu c10::SmallVector outputs = {self}; 
CalcuOpUtil::check_memory_over_laps(inputs, outputs); - at::Tensor result = NPUNativeFunctions::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( self.sizes(), self.options().dtype(at::ScalarType::Byte), CalcuOpUtil::get_tensor_npu_format(self)); @@ -167,7 +167,7 @@ namespace at_npu c10::SmallVector outputs = {self}; CalcuOpUtil::check_memory_over_laps(inputs, outputs); - at::Tensor result = NPUNativeFunctions::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( self.sizes(), self.options().dtype(at::ScalarType::Byte), CalcuOpUtil::get_tensor_npu_format(self)); diff --git a/torch_npu/csrc/aten/ops/LtKernelNpu.cpp b/torch_npu/csrc/aten/ops/LtKernelNpu.cpp index 7ccd7c64a9..c7733a3ff5 100644 --- a/torch_npu/csrc/aten/ops/LtKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/LtKernelNpu.cpp @@ -100,7 +100,7 @@ namespace at_npu auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther); // construct the output tensor of the NPU - at::Tensor result = NPUNativeFunctions::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensor( outputSize, formatCastOfSelf.options().dtype(at::kBool)); @@ -116,7 +116,7 @@ namespace at_npu auto outputSize = input_same_output_size(formatCastOfSelf); // construct the output tensor of the NPU - at::Tensor result = NPUNativeFunctions::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensor( outputSize, formatCastOfSelf.options().dtype(at::kBool)); @@ -133,7 +133,7 @@ namespace at_npu c10::SmallVector outputs = {self}; CalcuOpUtil::check_memory_over_laps(inputs, outputs); - at::Tensor result = NPUNativeFunctions::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( self.sizes(), self.options().dtype(at::ScalarType::Byte), CalcuOpUtil::get_tensor_npu_format(self)); @@ -161,7 +161,7 @@ namespace at_npu c10::SmallVector outputs = {self}; CalcuOpUtil::check_memory_over_laps(inputs, outputs); - at::Tensor result = 
NPUNativeFunctions::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( self.sizes(), self.options().dtype(at::ScalarType::Byte), CalcuOpUtil::get_tensor_npu_format(self)); diff --git a/torch_npu/csrc/aten/ops/MeanKernelNpu.cpp b/torch_npu/csrc/aten/ops/MeanKernelNpu.cpp index 54a5768348..8ca851a36d 100644 --- a/torch_npu/csrc/aten/ops/MeanKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/MeanKernelNpu.cpp @@ -142,7 +142,7 @@ namespace at_npu } // construct the output tensor of the NPU - at::Tensor result = NPUNativeFunctions::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, self.options().dtype(dstType), npu_format); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/MmKernelNpu.cpp b/torch_npu/csrc/aten/ops/MmKernelNpu.cpp index 19aa9ccd83..0dafb37712 100644 --- a/torch_npu/csrc/aten/ops/MmKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/MmKernelNpu.cpp @@ -183,12 +183,12 @@ Return: if ((self.scalar_type() == at::ScalarType::Half) && !c10::npu::OptionsManager::CheckSwitchMMOutputEnable()) { - result = NPUNativeFunctions::empty_with_format( + result = OpPreparation::ApplyTensorWithFormat( outputSize, self.options(), ACL_FORMAT_FRACTAL_NZ); } else { - result = NPUNativeFunctions::empty_with_format(outputSize, self.options()); + result = OpPreparation::ApplyTensor(outputSize, self.options()); } // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/MulKernelNpu.cpp b/torch_npu/csrc/aten/ops/MulKernelNpu.cpp index c945f876a4..e6428df775 100644 --- a/torch_npu/csrc/aten/ops/MulKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/MulKernelNpu.cpp @@ -114,7 +114,7 @@ namespace at_npu auto outputSize = broadcast_ops_npu_output_size(selfCast, otherCast); // construct the output tensor of the NPU - at::Tensor result = NPUNativeFunctions::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, outputTensor.options(), 
CalcuOpUtil::get_tensor_npu_format(outputTensor)); @@ -136,7 +136,7 @@ namespace at_npu auto outputSize = input_same_output_size(self); // construct the output tensor of the NPU - at::Tensor result = NPUNativeFunctions::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/NegKernelNpu.cpp b/torch_npu/csrc/aten/ops/NegKernelNpu.cpp index 204a62f337..a3ae15a640 100644 --- a/torch_npu/csrc/aten/ops/NegKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/NegKernelNpu.cpp @@ -50,7 +50,7 @@ namespace at_npu at::Tensor NPUNativeFunctions::neg(const at::Tensor &self) { // construct the output tensor of the NPU - at::Tensor result = NPUNativeFunctions::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( self.sizes(), self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp b/torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp index a4edb6cad0..8343864e43 100644 --- a/torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp @@ -45,12 +45,9 @@ namespace at_npu auto outputSize = input_same_output_size(self); // construct the output tensor of the NPU - at::Tensor result = NPUNativeFunctions::empty_with_format(outputSize, - dtype_opt, - layout_opt, - device_opt, - pin_memory_opt, - CalcuOpUtil::get_tensor_npu_format(self)); + at::Tensor result = NPUNativeFunctions::empty_with_format( + outputSize, dtype_opt, layout_opt, device_opt, pin_memory_opt, + CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPUc return NPUNativeFunctions::one_(result); } diff --git a/torch_npu/csrc/aten/ops/ReluKernelNpu.cpp b/torch_npu/csrc/aten/ops/ReluKernelNpu.cpp index 410f091e60..adf42ed6b9 100644 --- 
a/torch_npu/csrc/aten/ops/ReluKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/ReluKernelNpu.cpp @@ -65,7 +65,7 @@ namespace at_npu auto outputSize = input_same_output_size(self); // construct the output tensor of the NPU - at::Tensor result = NPUNativeFunctions::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp b/torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp index ca6f03c9c7..9256fcc6c9 100644 --- a/torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp @@ -119,7 +119,7 @@ namespace at_npu outputSize[1] = c1_len.toInt() * 16; // construct the output at::Tensor of the NPU - at::Tensor result = NPUNativeFunctions::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/SubKernelNpu.cpp b/torch_npu/csrc/aten/ops/SubKernelNpu.cpp index 914664cd96..33ac4018ee 100644 --- a/torch_npu/csrc/aten/ops/SubKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/SubKernelNpu.cpp @@ -103,7 +103,7 @@ namespace at_npu auto outputSize = broadcast_ops_npu_output_size(self, other); // construct the output tensor of the NPU - at::Tensor result = NPUNativeFunctions::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, outputTensor.options(), CalcuOpUtil::get_tensor_npu_format(outputTensor)); @@ -120,7 +120,7 @@ namespace at_npu auto outputSize = input_same_output_size(self); // construct the output tensor of the NPU - at::Tensor result = NPUNativeFunctions::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU diff 
--git a/torch_npu/csrc/aten/ops/SumKernelNpu.cpp b/torch_npu/csrc/aten/ops/SumKernelNpu.cpp index 92012d2da5..23819bc687 100644 --- a/torch_npu/csrc/aten/ops/SumKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/SumKernelNpu.cpp @@ -197,7 +197,7 @@ namespace at_npu } // construct the output tensor of the NPU - at::Tensor result = NPUNativeFunctions::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, self.options().dtype(dstType), npu_format); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp index 3a4bdabee3..0bc0118a9c 100644 --- a/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp @@ -62,7 +62,7 @@ namespace at_npu auto outputSize = input_same_output_size(self); // construct the output tensor of the NPU - at::Tensor result = NPUNativeFunctions::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // use 5HD in Relu diff --git a/torch_npu/csrc/aten/ops/TopKKernelNpu.cpp b/torch_npu/csrc/aten/ops/TopKKernelNpu.cpp index 3f34261d8d..9a4bd8fb5a 100644 --- a/torch_npu/csrc/aten/ops/TopKKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/TopKKernelNpu.cpp @@ -219,11 +219,11 @@ namespace at_npu // construct the output tensor of the NPU at::Tensor transposeSelf = NPUNativeFunctions::npu_transpose(self, perm); auto outputSize = transpose_npu_output_size(values, perm); - at::Tensor transposeValue = NPUNativeFunctions::empty_with_format( + at::Tensor transposeValue = OpPreparation::ApplyTensorWithFormat( outputSize, values.options(), CalcuOpUtil::get_tensor_npu_format(values)); - at::Tensor transposeIndices = NPUNativeFunctions::empty_with_format( + at::Tensor transposeIndices = OpPreparation::ApplyTensorWithFormat( outputSize, indices.options(), 
CalcuOpUtil::get_tensor_npu_format(indices)); @@ -290,9 +290,9 @@ namespace at_npu // calculate the output size auto outputSize = topk_npu_output_size(selfCp, k, dim, largest, sorted); // construct the output tensor of the NPU - at::Tensor values = NPUNativeFunctions::empty_with_format( + at::Tensor values = OpPreparation::ApplyTensorWithFormat( outputSize, selfCp.options(), CalcuOpUtil::get_tensor_npu_format(selfCp)); - at::Tensor indices = NPUNativeFunctions::empty_with_format( + at::Tensor indices = OpPreparation::ApplyTensorWithFormat( outputSize, selfCp.options().dtype(at::kInt), ACL_FORMAT_ND); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/TransposeKernelNpu.cpp b/torch_npu/csrc/aten/ops/TransposeKernelNpu.cpp index 085c49f0fe..40631abc0b 100644 --- a/torch_npu/csrc/aten/ops/TransposeKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/TransposeKernelNpu.cpp @@ -76,7 +76,7 @@ namespace at_npu { RECORD_FUNCTION("transpose_to_contiguous", vector({self})); int64_t self_format = CalcuOpUtil::get_tensor_npu_format(self); - at::Tensor result = NPUNativeFunctions::empty_with_format(self.sizes(), self.options(), self_format); + at::Tensor result = OpPreparation::ApplyTensorWithFormat(self.sizes(), self.options(), self_format); // obtain the transpose axises at::IntArrayRef dim; diff --git a/torch_npu/csrc/aten/ops/convolution/Conv2dBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/convolution/Conv2dBackwardKernelNpu.cpp index f6eb56eca5..b93bf1710d 100644 --- a/torch_npu/csrc/aten/ops/convolution/Conv2dBackwardKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/convolution/Conv2dBackwardKernelNpu.cpp @@ -213,7 +213,7 @@ tuple NPUNativeFunctions::npu_conv2d_backwar at::Tensor gradBias; // construct the output tensor of the NPU if (grad_input_mask[0]) { - gradInput = NPUNativeFunctions::empty_with_format( + gradInput = OpPreparation::ApplyTensorWithFormat( std::get<0>(outputSizes), input.options(), ACL_FORMAT_NC1HWC0); } @@ -221,12 +221,12 @@ tuple 
NPUNativeFunctions::npu_conv2d_backwar // For group conv2d: keep consistent with weight to avoid allreduce accuracy problem. // For more info: https://gitee.com/ascend/pytorch-develop/pulls/2255 if (groups > 1) { - gradWeight = NPUNativeFunctions::empty_with_format( + gradWeight = OpPreparation::ApplyTensorWithFormat( std::get<1>(outputSizes), weight.options().dtype(at::kFloat), ACL_FORMAT_NCHW); } else { - gradWeight = NPUNativeFunctions::empty_with_format( + gradWeight = OpPreparation::ApplyTensorWithFormat( std::get<1>(outputSizes), weight.options().dtype(at::kFloat), ACL_FORMAT_FRACTAL_Z); @@ -234,7 +234,7 @@ tuple NPUNativeFunctions::npu_conv2d_backwar } if (grad_input_mask[2]) { - gradBias = NPUNativeFunctions::empty_with_format( + gradBias = OpPreparation::ApplyTensorWithFormat( std::get<2>(outputSizes), grad.options(), ACL_FORMAT_NCHW); } diff --git a/torch_npu/csrc/aten/ops/convolution/Conv3dBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/convolution/Conv3dBackwardKernelNpu.cpp index f9ade9488c..b38ef864a0 100644 --- a/torch_npu/csrc/aten/ops/convolution/Conv3dBackwardKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/convolution/Conv3dBackwardKernelNpu.cpp @@ -107,7 +107,7 @@ tuple NPUNativeFunctions::npu_conv3d_backwar if (grad_input_mask[0]) { // format should be NDC1HWC0 - gradInput = NPUNativeFunctions::empty_with_format( + gradInput = OpPreparation::ApplyTensorWithFormat( input.sizes(), input.options(), ACL_FORMAT_NDC1HWC0); conv3d_backward_inputmask( @@ -116,7 +116,7 @@ tuple NPUNativeFunctions::npu_conv3d_backwar if (grad_input_mask[1]) { // format should be FRACTAL_Z_3D - gradWeight = NPUNativeFunctions::empty_with_format( + gradWeight = OpPreparation::ApplyTensorWithFormat( weight.sizes(), weight.options().dtype(at::kFloat), ACL_FRACTAL_Z_3D); conv3d_backward_weightmask( @@ -125,7 +125,7 @@ tuple NPUNativeFunctions::npu_conv3d_backwar if (grad_input_mask[2]) { // format should be NCHW, gradias.size = grad.size(1) - gradBias = 
NPUNativeFunctions::empty_with_format( + gradBias = OpPreparation::ApplyTensorWithFormat( {grad.size(1)}, grad.options(), ACL_FORMAT_NCHW); conv3d_backward_biasmask( diff --git a/torch_npu/csrc/aten/ops/convolution/ConvTranspose2dKernelNpu.cpp b/torch_npu/csrc/aten/ops/convolution/ConvTranspose2dKernelNpu.cpp index 8e60437207..a0a32368fb 100644 --- a/torch_npu/csrc/aten/ops/convolution/ConvTranspose2dKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/convolution/ConvTranspose2dKernelNpu.cpp @@ -81,7 +81,7 @@ at::Tensor NPUNativeFunctions::npu_conv_transpose2d( // construct the output tensor of the NPU at::Tensor result = - NPUNativeFunctions::empty_with_format(outputSize, input.options(), ACL_FORMAT_NC1HWC0); + OpPreparation::ApplyTensorWithFormat(outputSize, input.options(), ACL_FORMAT_NC1HWC0); // calculate the output result of the NPU conv_transpose2d_out_npu( diff --git a/torch_npu/csrc/aten/ops/loss/NLLLossBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/loss/NLLLossBackwardKernelNpu.cpp index e49f4ba62e..bf6da61f93 100644 --- a/torch_npu/csrc/aten/ops/loss/NLLLossBackwardKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/loss/NLLLossBackwardKernelNpu.cpp @@ -91,7 +91,7 @@ at::Tensor NPUNativeFunctions::nll_loss_backward( auto outputSize = input_same_output_size(self); // construct the output tensor of the NPU - at::Tensor grad_input = NPUNativeFunctions::empty_with_format( + at::Tensor grad_input = OpPreparation::ApplyTensorWithFormat( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/loss/NLLLossKernelNpu.cpp b/torch_npu/csrc/aten/ops/loss/NLLLossKernelNpu.cpp index 418b2b296a..f274745813 100644 --- a/torch_npu/csrc/aten/ops/loss/NLLLossKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/loss/NLLLossKernelNpu.cpp @@ -96,11 +96,11 @@ tuple NPUNativeFunctions::nll_loss_forward( outputSize, totalWeightSize); // construct the output tensor of the NPU - at::Tensor result = 
NPUNativeFunctions::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( std::get<0>(outputSizes), self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - at::Tensor total_weight = NPUNativeFunctions::empty_with_format( + at::Tensor total_weight = OpPreparation::ApplyTensorWithFormat( std::get<1>(outputSizes), self.options(), CalcuOpUtil::get_tensor_npu_format(self)); diff --git a/torch_npu/csrc/distributed/Init.cpp b/torch_npu/csrc/distributed/Init.cpp index 621ce5962a..e8717648c5 100644 --- a/torch_npu/csrc/distributed/Init.cpp +++ b/torch_npu/csrc/distributed/Init.cpp @@ -33,6 +33,7 @@ #include "torch_npu/csrc/distributed/ProcessGroupHCCL.hpp" #include "torch_npu/csrc/distributed/Init.h" #include "torch_npu/csrc/distributed/reducer.hpp" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" namespace torch_npu { @@ -48,7 +49,7 @@ class BroadcastWork { public: inline std::vector cast_tensors(at::TensorList tensors) { static auto cast_back_to_ori_format = [](const at::Tensor &t) { - return NPUNativeFunctions::npu_format_cast(t, t.storage().unsafeGetStorageImpl()->npu_desc_.origin_format_); + return at_npu::native::NPUNativeFunctions::npu_format_cast(t, t.storage().unsafeGetStorageImpl()->npu_desc_.origin_format_); }; return c10::fmap(tensors, cast_back_to_ori_format); } diff --git a/torch_npu/csrc/distributed/reducer.cpp b/torch_npu/csrc/distributed/reducer.cpp index 4a29c1ce45..5e9721944b 100644 --- a/torch_npu/csrc/distributed/reducer.cpp +++ b/torch_npu/csrc/distributed/reducer.cpp @@ -32,6 +32,7 @@ #include "torch_npu/csrc/distributed/reducer.hpp" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" +#include "torch_npu/csrc/framework/utils/OpPreparation.h" namespace c10d_npu { namespace { @@ -407,9 +408,9 @@ void Reducer::copy_grad_to_bucket( if (comm_hook_ == nullptr) { // imitates wrapped_scalar_tensor in ATen/native/BinaryOps.cpp // Divides while copying into the bucket view. 
- NPUNativeFunctions::copy_memory_(bucket_view, grad.mul(float(1.) / divFactor_), true); + at_npu::native::NPUNativeFunctions::copy_memory_(bucket_view, grad.mul(float(1.) / divFactor_), true); } else { - NPUNativeFunctions::copy_memory_(bucket_view, grad, true); + at_npu::native::NPUNativeFunctions::copy_memory_(bucket_view, grad, true); } } @@ -442,7 +443,7 @@ void Reducer::mark_variable_ready_dense(VariableIndex index) { // make sure grad has the same format as variable if (grad.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_ != variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_) { - grad = NPUNativeFunctions::npu_format_cast(grad, + grad = at_npu::native::NPUNativeFunctions::npu_format_cast(grad, variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_); } this->copy_grad_to_bucket(grad, bucket_view); @@ -1073,12 +1074,12 @@ void Reducer::copy_bucket_to_grad( if (!grad.defined()) { // Creates grad according to the "Gradient Layout Contract" // (see torch/csrc/grad/AccumulateGrad.h) - grad = NPUNativeFunctions::empty_with_format( + grad = OpPreparation::ApplyTensorWithFormat( variable.sizes(), bucket_view.options(), variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_); - NPUNativeFunctions::copy_memory_(grad, bucket_view, true); + at_npu::native::NPUNativeFunctions::copy_memory_(grad, bucket_view, true); } else { - NPUNativeFunctions::copy_memory_(grad, bucket_view, true); + at_npu::native::NPUNativeFunctions::copy_memory_(grad, bucket_view, true); } // The grad is modified and needs to be written back. return true; diff --git a/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp b/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp index 9f918437bb..efc9671da6 100644 --- a/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp +++ b/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp @@ -14,6 +14,7 @@ // limitations under the License. 
#include "torch_npu/csrc/framework/contiguous/ContiguousOpt.h" +#include "torch_npu/csrc/framework/utils/OpPreparation.h" namespace at_npu { @@ -109,7 +110,7 @@ namespace at_npu const at::Tensor &src, const std::vector &optimizations) { - auto self = NPUNativeFunctions::empty_with_format( + auto self = OpPreparation::ApplyTensorWithFormat( src.sizes(), src.options(), src.storage().get_npu_desc().npu_format_); diff --git a/torch_npu/csrc/framework/contiguous/combined_opt.cpp b/torch_npu/csrc/framework/contiguous/combined_opt.cpp index a0228c78ca..7614fb0075 100644 --- a/torch_npu/csrc/framework/contiguous/combined_opt.cpp +++ b/torch_npu/csrc/framework/contiguous/combined_opt.cpp @@ -19,7 +19,7 @@ #include "torch_npu/csrc/framework/contiguous/ContiguousOpt.h" #include "torch_npu/csrc/framework/utils/KernelNpuOutputSize.h" -#include "torch_npu/csrc/aten/NPUNativeFunctions.h" +#include "torch_npu/csrc/framework/utils/OpPreparation.h" namespace at_npu { @@ -389,7 +389,7 @@ namespace at_npu // baseInfo = inferred info(infer_size, infer_stride, infer_offset) // If the first inferred tensor can be optimized, store its info. if (can_infer_view_tensor( - src, temp_src, infer_size, infer_stride, infer_offset) && + src, temp_src, infer_size, infer_stride, infer_offset) && emplace_info( temp_src, view_infos, view_offsets, infer_offset, max_len)) { @@ -500,7 +500,7 @@ namespace at_npu { // case 2: The first tensor is discontiguous-type, // conduct the standard optimization procedure. 
- auto contiguous_src = NPUNativeFunctions::empty_with_format( + auto contiguous_src = OpPreparation::ApplyTensorWithFormat( src.sizes(), src.options(), src.storage().get_npu_desc().npu_format_); diff --git a/torch_npu/csrc/framework/utils/NpuUtils.cpp b/torch_npu/csrc/framework/utils/NpuUtils.cpp index f10640a1ec..07bda7c6a6 100644 --- a/torch_npu/csrc/framework/utils/NpuUtils.cpp +++ b/torch_npu/csrc/framework/utils/NpuUtils.cpp @@ -25,6 +25,7 @@ #include "torch_npu/csrc/framework/contiguous/ContiguousOpt.h" #include "torch_npu/csrc/framework/interface/EnvVariables.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" +#include "torch_npu/csrc/framework/utils/OpPreparation.h" namespace at_npu { @@ -165,7 +166,7 @@ namespace at_npu // 3. get output size auto outputSize = index_select_npu_output_size(src_tmp, dim, index); int64_t npu_format = CalcuOpUtil::get_tensor_npu_format(src_tmp); - at::Tensor result = NPUNativeFunctions::empty_with_format(outputSize, src_tmp.options(), npu_format); + at::Tensor result = OpPreparation::ApplyTensorWithFormat(outputSize, src_tmp.options(), npu_format); // std::cout << "npu_format: " << npu_format << std::endl; // 4. 
get input and output @@ -208,7 +209,7 @@ namespace at_npu at::Tensor deal_with_5d_5d_match(const at::Tensor &src) { auto src_desc = src.storage().unsafeGetStorageImpl()->npu_desc_; - at::Tensor src_new = NPUNativeFunctions::empty_with_format(src_desc.base_sizes_, src.options(), ACL_FORMAT_NC1HWC0); + at::Tensor src_new = OpPreparation::ApplyTensorWithFormat(src_desc.base_sizes_, src.options(), ACL_FORMAT_NC1HWC0); c10::npu::NPUStream copy_stream = c10::npu::getCurrentNPUStream(); int64_t numel = src_new.numel(); aclError error = aclrtMemcpyAsync( diff --git a/torch_npu/csrc/framework/utils/OpPreparation.cpp b/torch_npu/csrc/framework/utils/OpPreparation.cpp index 1001fa0167..d17c46d3b0 100644 --- a/torch_npu/csrc/framework/utils/OpPreparation.cpp +++ b/torch_npu/csrc/framework/utils/OpPreparation.cpp @@ -236,13 +236,17 @@ namespace at_npu at::Tensor OpPreparation::ApplyTensorWithFormat(c10::IntArrayRef sizes, const c10::TensorOptions &options, int64_t format) { auto fixFormat = InferFormat::GuessStorageFormat(sizes, (aclFormat)format); - return NPUNativeFunctions::empty_with_format(sizes, options, fixFormat); + return NPUNativeFunctions::empty_with_format( + sizes, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), + options.device_opt(), options.pinned_memory_opt(), fixFormat); } at::Tensor OpPreparation::ApplyTensorWithSizes(c10::IntArrayRef sizes, const c10::TensorOptions &options) { auto format = InferFormat::GuessBaseFormat(sizes); - return NPUNativeFunctions::empty_with_format(sizes, options, format); + return NPUNativeFunctions::empty_with_format( + sizes, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), + options.device_opt(), options.pinned_memory_opt(), fixFormat); } void OpPreparation::CheckMemory(const std::initializer_list &inputs, const std::initializer_list &outputs) -- Gitee From c374454b4b6a738049f4831a10723f738b6cf192 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=A4=E5=AE=89=E5=8D=87?= Date: Sat, 12 Feb 2022 
18:22:07 +0800 Subject: [PATCH 06/12] Fix modification for const vector. --- torch_npu/csrc/aten/common/CopyKernel.cpp | 4 ++-- torch_npu/csrc/aten/common/FormatCastHelper.cpp | 2 +- torch_npu/csrc/aten/common/FormatCastHelper.h | 2 +- torch_npu/csrc/aten/common/NpuFastReshape.cpp | 1 + torch_npu/csrc/aten/ops/LtKernelNpu.cpp | 4 ++-- torch_npu/csrc/aten/ops/MmKernelNpu.cpp | 2 +- torch_npu/csrc/aten/ops/ReluKernelNpu.cpp | 2 +- torch_npu/csrc/aten/ops/SoftmaxBackwardKernelNpu.cpp | 7 ++++--- torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp | 1 + torch_npu/csrc/distributed/reducer.cpp | 2 +- torch_npu/csrc/framework/utils/OpPreparation.cpp | 4 ++-- 11 files changed, 17 insertions(+), 14 deletions(-) diff --git a/torch_npu/csrc/aten/common/CopyKernel.cpp b/torch_npu/csrc/aten/common/CopyKernel.cpp index bb983a63d3..171ab3f9c9 100644 --- a/torch_npu/csrc/aten/common/CopyKernel.cpp +++ b/torch_npu/csrc/aten/common/CopyKernel.cpp @@ -312,7 +312,7 @@ void copy_h2d(at::Tensor& self, const at::Tensor& src, bool non_blocking) { if (!FormatHelper::IsBaseFormatType(self)) { at::Tensor dst = OpPreparation::ApplyTensor(self); copy_h2d_baseformat(dst, src, non_blocking, true); - NPUNativeFunctions::npu_format_cast_(dst); + NPUNativeFunctions::npu_format_cast_(self, dst); return; } copy_h2d_baseformat(self, src, non_blocking); @@ -363,7 +363,7 @@ void copy_d2d_dtype(at::Tensor& self, const at::Tensor& src, bool non_blocking) } at::Tensor dst_4D = FormatCastHelper::ApplyBaseFormatTensorBy(self); copy_d2d_dtype_baseformat(dst_4D, src_4D, non_blocking); - NPUNativeFunctions::npu_format_cast_(dst_4D); + NPUNativeFunctions::npu_format_cast_(self, dst_4D); return; } copy_d2d_dtype_format(self, src, non_blocking); diff --git a/torch_npu/csrc/aten/common/FormatCastHelper.cpp b/torch_npu/csrc/aten/common/FormatCastHelper.cpp index 2ee080d215..13d82c3f5f 100644 --- a/torch_npu/csrc/aten/common/FormatCastHelper.cpp +++ b/torch_npu/csrc/aten/common/FormatCastHelper.cpp @@ -26,7 +26,7 @@ bool 
FormatCastHelper::IsSameGroupType(const at::Tensor& src, const at::Tensor& return FormatHelper::GetBaseFormat(src_format) == FormatHelper::GetBaseFormat(dst_format); } -void FormatCastHelper::base_format_cast_nocheck(const at::Tensor& dst, const at::Tensor& src) { +void FormatCastHelper::base_format_cast_nocheck(at::Tensor& dst, const at::Tensor& src) { dst.set_(dst.storage(), src.storage_offset(), src.sizes(), src.strides()); NPUNativeFunctions::copy_memory_(dst, src, true); } diff --git a/torch_npu/csrc/aten/common/FormatCastHelper.h b/torch_npu/csrc/aten/common/FormatCastHelper.h index 91e9b78182..ea2b6ab507 100644 --- a/torch_npu/csrc/aten/common/FormatCastHelper.h +++ b/torch_npu/csrc/aten/common/FormatCastHelper.h @@ -33,7 +33,7 @@ public: static at::Tensor& CovertSelfToBaseFormat(at::Tensor& src); private: // help function of format_cast_between_group - static void base_format_cast_nocheck(const at::Tensor& dst, const at::Tensor& src); + static void base_format_cast_nocheck(at::Tensor& dst, const at::Tensor& src); }; // class FormatCastHelper } // namespace native diff --git a/torch_npu/csrc/aten/common/NpuFastReshape.cpp b/torch_npu/csrc/aten/common/NpuFastReshape.cpp index 9fc817bfe3..e7df99dc6a 100644 --- a/torch_npu/csrc/aten/common/NpuFastReshape.cpp +++ b/torch_npu/csrc/aten/common/NpuFastReshape.cpp @@ -17,6 +17,7 @@ #include "torch_npu/csrc/framework/FormatHelper.h" #include "torch_npu/csrc/framework/InferFormat.h" #include "torch_npu/csrc/framework/StorageDescHelper.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" namespace at_npu { namespace native { diff --git a/torch_npu/csrc/aten/ops/LtKernelNpu.cpp b/torch_npu/csrc/aten/ops/LtKernelNpu.cpp index c7733a3ff5..1503d75a52 100644 --- a/torch_npu/csrc/aten/ops/LtKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/LtKernelNpu.cpp @@ -100,7 +100,7 @@ namespace at_npu auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther); // construct the output tensor of the NPU - 
at::Tensor result = OpPreparation::ApplyTensor( + at::Tensor result = OpPreparation::ApplyTensorWithSizes( outputSize, formatCastOfSelf.options().dtype(at::kBool)); @@ -116,7 +116,7 @@ namespace at_npu auto outputSize = input_same_output_size(formatCastOfSelf); // construct the output tensor of the NPU - at::Tensor result = OpPreparation::ApplyTensor( + at::Tensor result = OpPreparation::ApplyTensorWithSizes( outputSize, formatCastOfSelf.options().dtype(at::kBool)); diff --git a/torch_npu/csrc/aten/ops/MmKernelNpu.cpp b/torch_npu/csrc/aten/ops/MmKernelNpu.cpp index 0dafb37712..8ef6a11497 100644 --- a/torch_npu/csrc/aten/ops/MmKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/MmKernelNpu.cpp @@ -188,7 +188,7 @@ Return: } else { - result = OpPreparation::ApplyTensor(outputSize, self.options()); + result = OpPreparation::ApplyTensorWithSizes(outputSize, self.options()); } // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/ReluKernelNpu.cpp b/torch_npu/csrc/aten/ops/ReluKernelNpu.cpp index adf42ed6b9..1f4331abc0 100644 --- a/torch_npu/csrc/aten/ops/ReluKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/ReluKernelNpu.cpp @@ -17,7 +17,7 @@ #include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" #include "torch_npu/csrc/framework/utils/KernelNpuOutputSize.h" #include "torch_npu/csrc/framework/utils/NpuUtils.h" - +#include "torch_npu/csrc/framework/utils/OpAdapter.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" namespace at_npu diff --git a/torch_npu/csrc/aten/ops/SoftmaxBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/SoftmaxBackwardKernelNpu.cpp index 968e2a6419..bb0050fe18 100644 --- a/torch_npu/csrc/aten/ops/SoftmaxBackwardKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/SoftmaxBackwardKernelNpu.cpp @@ -52,17 +52,18 @@ namespace at_npu // calculate the output size auto outputSize = input_same_output_size(grad_output); + at::Tensor tmp_output = output; // output'format must be same with grad_output - if (CalcuOpUtil::get_tensor_npu_format(output) != 
CalcuOpUtil::get_tensor_npu_format(grad_output)) + if (CalcuOpUtil::get_tensor_npu_format(tmp_output) != CalcuOpUtil::get_tensor_npu_format(grad_output)) { - NPUNativeFunctions::npu_format_cast_(output, CalcuOpUtil::get_tensor_npu_format(grad_output)); + NPUNativeFunctions::npu_format_cast_(tmp_output, CalcuOpUtil::get_tensor_npu_format(grad_output)); } // construct the output tensor of the NPU at::Tensor grad_input = OpPreparation::ApplyTensor(grad_output, outputSize); // calculate the output result of the NPU - softmax_backward_out_npu(grad_input, grad_output, output, dim, self); + softmax_backward_out_npu(grad_input, grad_output, tmp_output, dim, self); return grad_input; } diff --git a/torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp b/torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp index 9256fcc6c9..eb85dfefab 100644 --- a/torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp @@ -17,6 +17,7 @@ #include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" #include "torch_npu/csrc/framework/utils/KernelNpuOutputSize.h" #include "torch_npu/csrc/framework/utils/NpuUtils.h" +#include "torch_npu/csrc/framework/utils/OpAdapter.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" namespace at_npu diff --git a/torch_npu/csrc/distributed/reducer.cpp b/torch_npu/csrc/distributed/reducer.cpp index 5e9721944b..4f05619d18 100644 --- a/torch_npu/csrc/distributed/reducer.cpp +++ b/torch_npu/csrc/distributed/reducer.cpp @@ -1074,7 +1074,7 @@ void Reducer::copy_bucket_to_grad( if (!grad.defined()) { // Creates grad according to the "Gradient Layout Contract" // (see torch/csrc/grad/AccumulateGrad.h) - grad = OpPreparation::ApplyTensorWithFormat( + grad = at_npu::native::OpPreparation::ApplyTensorWithFormat( variable.sizes(), bucket_view.options(), variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_); at_npu::native::NPUNativeFunctions::copy_memory_(grad, bucket_view, true); diff --git 
a/torch_npu/csrc/framework/utils/OpPreparation.cpp b/torch_npu/csrc/framework/utils/OpPreparation.cpp index d17c46d3b0..e0ade98c71 100644 --- a/torch_npu/csrc/framework/utils/OpPreparation.cpp +++ b/torch_npu/csrc/framework/utils/OpPreparation.cpp @@ -199,7 +199,7 @@ namespace at_npu at::Tensor &OpPreparation::CastBackToOriFormat(at::Tensor &tensor) { auto &tensor_desc = tensor.storage().unsafeGetStorageImpl()->npu_desc_; - tensor.npu_format_cast_(tensor_desc.origin_format_); + NPUNativeFunctions::npu_format_cast_(tensor, tensor_desc.origin_format_); return tensor; } @@ -246,7 +246,7 @@ namespace at_npu auto format = InferFormat::GuessBaseFormat(sizes); return NPUNativeFunctions::empty_with_format( sizes, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), - options.device_opt(), options.pinned_memory_opt(), fixFormat); + options.device_opt(), options.pinned_memory_opt(), format); } void OpPreparation::CheckMemory(const std::initializer_list &inputs, const std::initializer_list &outputs) -- Gitee From c4b424d00e4c0c636ab45841ebc87bcb701c0760 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=A4=E5=AE=89=E5=8D=87?= Date: Mon, 14 Feb 2022 10:13:41 +0800 Subject: [PATCH 07/12] Add Module & LayerNorm. 
--- torch_npu/__init__.py | 26 +++++---- torch_npu/utils/__init__.py | 36 +++++++++++++ torch_npu/utils/module.py | 105 ++++++++++++++++++++++++++++++++++++ 3 files changed, 157 insertions(+), 10 deletions(-) create mode 100644 torch_npu/utils/__init__.py create mode 100644 torch_npu/utils/module.py diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py index cedd54404f..5423b568e9 100644 --- a/torch_npu/__init__.py +++ b/torch_npu/__init__.py @@ -24,6 +24,8 @@ import torch_npu.npu.amp import torch_npu.distributed import torch_npu._C +from torch_npu.utils import nn_monkey_patches + from .version import __version__ as __version__ __all__ = [] @@ -35,16 +37,20 @@ for name in dir(torch_npu._C._VariableFunctions): globals()[name] = getattr(torch_npu._C._VariableFunctions, name) __all__.append(name) +all_monkey_patches = [ + ["npu", torch_npu.npu], + ["npu.amp", torch_npu.npu.amp], + ["autograd.profiler", torch_npu.npu.profiler], + ["distributed", torch_npu.distributed], + ["distributed.distributed_c10d", torch_npu.distributed.distributed_c10d], + ["nn.parallel.distributed._get_default_group", torch_npu.distributed.distributed_c10d._get_default_group] +] + +all_monkey_patches += nn_monkey_patches + -def _apply_patches(): - monkey_patches = [ - ["npu", torch_npu.npu], - ["npu.amp", torch_npu.npu.amp], - ["autograd.profiler", torch_npu.npu.profiler], - ["distributed", torch_npu.distributed], - ["distributed.distributed_c10d", torch_npu.distributed.distributed_c10d], - ["nn.parallel.distributed._get_default_group", torch_npu.distributed.distributed_c10d._get_default_group] - ] +def _apply_patches(monkey_patches): + def _getattr(module_list, root_module=torch): if len(module_list) <= 1: return root_module @@ -76,7 +82,7 @@ def _apply_patches(): setattr(dest_module, attr, getattr(patch, attr)) # Apply monkey-patches. 
-_apply_patches() +_apply_patches(all_monkey_patches) # NPU exit, need to synchronize devices def _npu_shutdown(): diff --git a/torch_npu/utils/__init__.py b/torch_npu/utils/__init__.py new file mode 100644 index 0000000000..ed6f2abac7 --- /dev/null +++ b/torch_npu/utils/__init__.py @@ -0,0 +1,36 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .module import LayerNorm, Module + + +def _get_monkey_patches(): + nn_modules = ["activation", "adaptive", "batchnorm", "channelshuffle", "container", + "conv", "distance", "dropout", "flatten", "fold", "instancenorm", + "linear", "loss", "module", "normalization", "padding", "pixelshuffle", + "pooling", "rnn", "sparse", "transformer", "upsampling"] + _monkey_patches = [] + for module_name in nn_modules: + _monkey_patches.append([f"nn.modules.{module_name}.Module", Module]) + + _monkey_patches.append(["nn.Module", Module]) + _monkey_patches.append(["nn.modules.Module", Module]) + _monkey_patches.append(["nn.modules.normalization.LayerNorm", LayerNorm]) + _monkey_patches.append(["nn.modules.LayerNorm", LayerNorm]) + _monkey_patches.append(["nn.LayerNorm", LayerNorm]) + return _monkey_patches + + +nn_monkey_patches = _get_monkey_patches() diff --git a/torch_npu/utils/module.py b/torch_npu/utils/module.py new file mode 100644 index 0000000000..c1918ab4c7 --- /dev/null +++ b/torch_npu/utils/module.py @@ -0,0 +1,105 @@ +# Copyright (c) 2020 Huawei 
Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import torch +import torch_npu + + +class Module(torch.nn.Module): + + def npu(self, device=None): + r"""Moves all model parameters and buffers to the npu. + + This also makes associated parameters and buffers different objects. So + it should be called before constructing optimizer if the module will + live on npu while being optimized. + + Arguments: + device (int, optional): if specified, all parameters will be + copied to that device + + Returns: + Module: self + """ + if device is None: + device = torch.device("npu") + if torch.npu.is_available(): + with torch.no_grad(): + self.cast_weight(device) + return self._apply(lambda t: t.npu(device)) + + + def to(self, *args, **kwargs): + super(Module, self).to(*args, **args) + device, _, _, _ = torch._C._nn._parse_to(*args, **kwargs) + if torch.npu.is_available(): + with torch.no_grad(): + self.cast_weight(device) + + def cast_weight(self, device): + if device is None: + return + + if "npu" not in str(device): + return + + current_class = self.__class__ + if issubclass(current_class, torch.nn.Linear): + self.weight.data = self.weight.data.to(device) + self.weight.data = torch_npu.npu_format_cast(self.weight.data, 29) #ACL_FORMAT_FRACTAL_NZ + elif issubclass(current_class, (torch.nn.BatchNorm2d, torch.nn.BatchNorm1d)): + if self.affine == True: + self.weight.data = self.weight.data.to(device) + self.weight.data = 
torch_npu.npu_format_cast(self.weight.data, 3) #ACL_FORMAT_NC1HWC0 + self.bias.data = self.bias.data.to(device) + self.bias.data = torch_npu.npu_format_cast(self.bias.data, 3) + self.running_mean.data = self.running_mean.data.to(device) + self.running_mean.data = torch_npu.npu_format_cast(self.running_mean.data, 3) + self.running_var.data = self.running_var.data.to(device) + self.running_var.data = torch_npu.npu_format_cast(self.running_var.data, 3) + elif issubclass(current_class, torch.nn.Conv2d): + if (self.in_channels == self.groups and self.groups > 1 and self.weight.size(0) % self.in_channels == 0): + return + self.weight.data = self.weight.data.to(device) + self.weight.data = torch_npu.npu_format_cast(self.weight.data, 4) #ACL_FORMAT_FRACTAL_Z + elif issubclass(current_class, torch.nn.Conv3d): + self.weight.data = self.weight.data.to(device) + self.weight.data = torch_npu.npu_format_cast(self.weight.data.half(), 33).float() #ACL_FRACTAL_Z_3D + elif ("MultiheadAttention" in str(current_class)): + if hasattr(self,"q_proj_weight") and self.q_proj_weight is not None and \ + hasattr(self,"k_proj_weight") and self.k_proj_weight is not None and \ + hasattr(self,"v_proj_weight") and self.v_proj_weight is not None: + self.q_proj_weight.data = self.q_proj_weight.data.to(device) + self.q_proj_weight.data = torch_npu.npu_format_cast(self.q_proj_weight.data, 29) + self.k_proj_weight.data = self.k_proj_weight.data.to(device) + self.k_proj_weight.data = torch_npu.npu_format_cast(self.k_proj_weight.data, 29) + self.v_proj_weight.data = self.v_proj_weight.data.to(device) + self.v_proj_weight.data = torch_npu.npu_format_cast(self.v_proj_weight.data, 29) + + if self.children() is not None: + for sub_module in self.children(): + if isinstance(sub_module, Module): + sub_module.cast_weight(device) + + +class LayerNorm(torch.nn.LayerNorm): + + def forward(self, input: torch.Tensor) -> torch.Tensor: + if self.training: + return torch.nn.functional.layer_norm( + input, 
self.normalized_shape, self.weight, self.bias, self.eps) + else: + return torch_npu.npu_layer_norm_eval(input, self.normalized_shape, self.weight, self.bias, self.eps) -- Gitee From 6ebe3ad2cfba72ecabe099317552046030ec18d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=A4=E5=AE=89=E5=8D=87?= Date: Mon, 14 Feb 2022 11:04:01 +0800 Subject: [PATCH 08/12] Add module patch. --- torch_npu/__init__.py | 3 +- torch_npu/utils/__init__.py | 11 +-- torch_npu/utils/module.py | 173 ++++++++++++++++++++---------------- 3 files changed, 101 insertions(+), 86 deletions(-) diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py index 5423b568e9..57b6be74d6 100644 --- a/torch_npu/__init__.py +++ b/torch_npu/__init__.py @@ -24,7 +24,7 @@ import torch_npu.npu.amp import torch_npu.distributed import torch_npu._C -from torch_npu.utils import nn_monkey_patches +from torch_npu.utils import nn_monkey_patches, apply_module_patch from .version import __version__ as __version__ @@ -83,6 +83,7 @@ def _apply_patches(monkey_patches): # Apply monkey-patches. _apply_patches(all_monkey_patches) +apply_module_patch() # NPU exit, need to synchronize devices def _npu_shutdown(): diff --git a/torch_npu/utils/__init__.py b/torch_npu/utils/__init__.py index ed6f2abac7..092f9cebfb 100644 --- a/torch_npu/utils/__init__.py +++ b/torch_npu/utils/__init__.py @@ -13,20 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .module import LayerNorm, Module +from .module import LayerNorm, apply_module_patch def _get_monkey_patches(): - nn_modules = ["activation", "adaptive", "batchnorm", "channelshuffle", "container", - "conv", "distance", "dropout", "flatten", "fold", "instancenorm", - "linear", "loss", "module", "normalization", "padding", "pixelshuffle", - "pooling", "rnn", "sparse", "transformer", "upsampling"] _monkey_patches = [] - for module_name in nn_modules: - _monkey_patches.append([f"nn.modules.{module_name}.Module", Module]) - - _monkey_patches.append(["nn.Module", Module]) - _monkey_patches.append(["nn.modules.Module", Module]) _monkey_patches.append(["nn.modules.normalization.LayerNorm", LayerNorm]) _monkey_patches.append(["nn.modules.LayerNorm", LayerNorm]) _monkey_patches.append(["nn.LayerNorm", LayerNorm]) diff --git a/torch_npu/utils/module.py b/torch_npu/utils/module.py index c1918ab4c7..5620d851c2 100644 --- a/torch_npu/utils/module.py +++ b/torch_npu/utils/module.py @@ -13,86 +13,109 @@ # See the License for the specific language governing permissions and # limitations under the License. - +import warnings import torch import torch_npu -class Module(torch.nn.Module): - - def npu(self, device=None): - r"""Moves all model parameters and buffers to the npu. - - This also makes associated parameters and buffers different objects. So - it should be called before constructing optimizer if the module will - live on npu while being optimized. 
- - Arguments: - device (int, optional): if specified, all parameters will be - copied to that device - - Returns: - Module: self - """ - if device is None: - device = torch.device("npu") - if torch.npu.is_available(): - with torch.no_grad(): - self.cast_weight(device) - return self._apply(lambda t: t.npu(device)) - - - def to(self, *args, **kwargs): - super(Module, self).to(*args, **args) - device, _, _, _ = torch._C._nn._parse_to(*args, **kwargs) - if torch.npu.is_available(): - with torch.no_grad(): - self.cast_weight(device) - - def cast_weight(self, device): - if device is None: - return - - if "npu" not in str(device): - return - - current_class = self.__class__ - if issubclass(current_class, torch.nn.Linear): +def npu(self, device=None): + r"""Moves all model parameters and buffers to the npu. + + This also makes associated parameters and buffers different objects. So + it should be called before constructing optimizer if the module will + live on npu while being optimized. + + Arguments: + device (int, optional): if specified, all parameters will be + copied to that device + + Returns: + Module: self + """ + if device is None: + device = torch.device("npu") + if torch_npu.npu.is_available(): + with torch.no_grad(): + self.cast_weight(device) + return self._apply(lambda t: t.npu(device)) + + +def to(self, *args, **kwargs): + device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs) + + if dtype is not None: + if not (dtype.is_floating_point or dtype.is_complex): + raise TypeError('nn.Module.to only accepts floating point or complex ' + 'dtypes, but got desired dtype={}'.format(dtype)) + if dtype.is_complex: + warnings.warn( + "Complex modules are a new feature under active development whose design may change, " + "and some modules might not work as expected when using complex tensors as parameters or buffers. 
" + "Please file an issue at https://github.com/pytorch/pytorch/issues/new?template=bug-report.md " + "if a complex module does not work as expected.") + if torch_npu.npu.is_available(): + with torch.no_grad(): + self.cast_weight(device) + + def convert(t): + if convert_to_format is not None and t.dim() == 4: + return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None, + non_blocking, memory_format=convert_to_format) + return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None, non_blocking) + + return self._apply(convert) + + +def cast_weight(self, device): + if device is None: + return + + if "npu" not in str(device): + return + + current_class = self.__class__ + if issubclass(current_class, torch.nn.Linear): + self.weight.data = self.weight.data.to(device) + self.weight.data = torch_npu.npu_format_cast(self.weight.data, 29) #ACL_FORMAT_FRACTAL_NZ + elif issubclass(current_class, (torch.nn.BatchNorm2d, torch.nn.BatchNorm1d)): + if self.affine == True: self.weight.data = self.weight.data.to(device) - self.weight.data = torch_npu.npu_format_cast(self.weight.data, 29) #ACL_FORMAT_FRACTAL_NZ - elif issubclass(current_class, (torch.nn.BatchNorm2d, torch.nn.BatchNorm1d)): - if self.affine == True: - self.weight.data = self.weight.data.to(device) - self.weight.data = torch_npu.npu_format_cast(self.weight.data, 3) #ACL_FORMAT_NC1HWC0 - self.bias.data = self.bias.data.to(device) - self.bias.data = torch_npu.npu_format_cast(self.bias.data, 3) - self.running_mean.data = self.running_mean.data.to(device) - self.running_mean.data = torch_npu.npu_format_cast(self.running_mean.data, 3) - self.running_var.data = self.running_var.data.to(device) - self.running_var.data = torch_npu.npu_format_cast(self.running_var.data, 3) - elif issubclass(current_class, torch.nn.Conv2d): - if (self.in_channels == self.groups and self.groups > 1 and self.weight.size(0) % self.in_channels == 0): - return - self.weight.data = self.weight.data.to(device) - 
self.weight.data = torch_npu.npu_format_cast(self.weight.data, 4) #ACL_FORMAT_FRACTAL_Z - elif issubclass(current_class, torch.nn.Conv3d): - self.weight.data = self.weight.data.to(device) - self.weight.data = torch_npu.npu_format_cast(self.weight.data.half(), 33).float() #ACL_FRACTAL_Z_3D - elif ("MultiheadAttention" in str(current_class)): - if hasattr(self,"q_proj_weight") and self.q_proj_weight is not None and \ - hasattr(self,"k_proj_weight") and self.k_proj_weight is not None and \ - hasattr(self,"v_proj_weight") and self.v_proj_weight is not None: - self.q_proj_weight.data = self.q_proj_weight.data.to(device) - self.q_proj_weight.data = torch_npu.npu_format_cast(self.q_proj_weight.data, 29) - self.k_proj_weight.data = self.k_proj_weight.data.to(device) - self.k_proj_weight.data = torch_npu.npu_format_cast(self.k_proj_weight.data, 29) - self.v_proj_weight.data = self.v_proj_weight.data.to(device) - self.v_proj_weight.data = torch_npu.npu_format_cast(self.v_proj_weight.data, 29) - - if self.children() is not None: - for sub_module in self.children(): - if isinstance(sub_module, Module): - sub_module.cast_weight(device) + self.weight.data = torch_npu.npu_format_cast(self.weight.data, 3) #ACL_FORMAT_NC1HWC0 + self.bias.data = self.bias.data.to(device) + self.bias.data = torch_npu.npu_format_cast(self.bias.data, 3) + self.running_mean.data = self.running_mean.data.to(device) + self.running_mean.data = torch_npu.npu_format_cast(self.running_mean.data, 3) + self.running_var.data = self.running_var.data.to(device) + self.running_var.data = torch_npu.npu_format_cast(self.running_var.data, 3) + elif issubclass(current_class, torch.nn.Conv2d): + if (self.in_channels == self.groups and self.groups > 1 and self.weight.size(0) % self.in_channels == 0): + return + self.weight.data = self.weight.data.to(device) + self.weight.data = torch_npu.npu_format_cast(self.weight.data, 4) #ACL_FORMAT_FRACTAL_Z + elif issubclass(current_class, torch.nn.Conv3d): + self.weight.data = 
self.weight.data.to(device) + self.weight.data = torch_npu.npu_format_cast(self.weight.data.half(), 33).float() #ACL_FRACTAL_Z_3D + elif ("MultiheadAttention" in str(current_class)): + if hasattr(self,"q_proj_weight") and self.q_proj_weight is not None and \ + hasattr(self,"k_proj_weight") and self.k_proj_weight is not None and \ + hasattr(self,"v_proj_weight") and self.v_proj_weight is not None: + self.q_proj_weight.data = self.q_proj_weight.data.to(device) + self.q_proj_weight.data = torch_npu.npu_format_cast(self.q_proj_weight.data, 29) + self.k_proj_weight.data = self.k_proj_weight.data.to(device) + self.k_proj_weight.data = torch_npu.npu_format_cast(self.k_proj_weight.data, 29) + self.v_proj_weight.data = self.v_proj_weight.data.to(device) + self.v_proj_weight.data = torch_npu.npu_format_cast(self.v_proj_weight.data, 29) + + if self.children() is not None: + for sub_module in self.children(): + if isinstance(sub_module, torch.nn.Module): + sub_module.cast_weight(device) + + +def apply_module_patch(): + torch.nn.Module.npu = npu + torch.nn.Module.to = to + torch.nn.Module.cast_weight = cast_weight class LayerNorm(torch.nn.LayerNorm): -- Gitee From 61c740f02e1938c3ff153fb8cfb0a4579be1c982 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=A4=E5=AE=89=E5=8D=87?= Date: Mon, 14 Feb 2022 11:43:45 +0800 Subject: [PATCH 09/12] Update custom ops calling. 
--- .../test_batchnorm_gather_stats_with_counts.py | 4 ++-- test/test_network_ops/test_uniform_.py | 2 +- torch_npu/csrc/aten/ops/BitwiseAndKernelNpu.cpp | 4 ++-- torch_npu/csrc/aten/ops/BmmV2KernelNpu.cpp | 4 ++-- torch_npu/csrc/aten/ops/IndexKernelNpu.cpp | 2 +- torch_npu/csrc/aten/ops/WhereKernelNpu.cpp | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/test/test_network_ops/test_batchnorm_gather_stats_with_counts.py b/test/test_network_ops/test_batchnorm_gather_stats_with_counts.py index 7f9e5e4d0a..52585e2311 100644 --- a/test/test_network_ops/test_batchnorm_gather_stats_with_counts.py +++ b/test/test_network_ops/test_batchnorm_gather_stats_with_counts.py @@ -47,7 +47,7 @@ class TestBatchNormGatherStatsWithCounts(TestCase): input1 = np.array(data).astype(dtype) npu_counts = torch.from_numpy(input1).to("npu:0") if npu_format != -1: - npu_counts = npu_counts.npu_format_cast(npu_format) + npu_counts = torch_npu.npu_format_cast(npu_counts, npu_format) return npu_counts def create_counts_tensor16(self, item): @@ -58,7 +58,7 @@ class TestBatchNormGatherStatsWithCounts(TestCase): input1 = np.array(data).astype(dtype) npu_counts = torch.from_numpy(input1).to("npu:0") if npu_format != -1: - npu_counts = npu_counts.npu_format_cast(npu_format) + npu_counts = torch_npu.npu_format_cast(npu_counts, npu_format) return npu_counts def test_batch_norm_gather_stats_with_counts(self, device): diff --git a/test/test_network_ops/test_uniform_.py b/test/test_network_ops/test_uniform_.py index 893adf140e..de4a3a9669 100644 --- a/test/test_network_ops/test_uniform_.py +++ b/test/test_network_ops/test_uniform_.py @@ -39,7 +39,7 @@ class TestUniform(TestCase): for item in shape_format: input1 = torch.zeros(item[0], dtype=item[3]).npu() - input1.npu_format_cast(3) + input1 = torch_npu.npu_format_cast(input1, 3) input1.uniform_(item[1], item[2]) self.assertTrue(item[1] <= input1.min()) self.assertTrue(item[2] >= input1.max()) diff --git 
a/torch_npu/csrc/aten/ops/BitwiseAndKernelNpu.cpp b/torch_npu/csrc/aten/ops/BitwiseAndKernelNpu.cpp index 2e5df37da0..fa8b070662 100644 --- a/torch_npu/csrc/aten/ops/BitwiseAndKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/BitwiseAndKernelNpu.cpp @@ -112,7 +112,7 @@ at::Tensor NPUNativeFunctions::bitwise_and(const at::Tensor& self, const at::Ten auto outputSize = broadcast_ops_npu_output_size(self, other); // construct the output at::Tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = NPUNativeFunctions::empty_with_format( outputSize, ref_tensor.options(), CalcuOpUtil::get_tensor_npu_format(ref_tensor)); @@ -128,7 +128,7 @@ at::Tensor NPUNativeFunctions::bitwise_and(const at::Tensor& self, at::Scalar ot auto outputSize = input_same_output_size(self); // construct the output at::Tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = NPUNativeFunctions::empty_with_format( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/BmmV2KernelNpu.cpp b/torch_npu/csrc/aten/ops/BmmV2KernelNpu.cpp index 2f696eb0ad..ea5bf340cc 100644 --- a/torch_npu/csrc/aten/ops/BmmV2KernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/BmmV2KernelNpu.cpp @@ -83,9 +83,9 @@ at::Tensor pure_bmm_v2_npu(const at::Tensor& self, const at::Tensor& mat2, const at::Tensor result; if ((tensor1.scalar_type() == at::ScalarType::Half)) { - result = at::empty_with_format(output_size, tensor1.options(), ACL_FORMAT_FRACTAL_NZ); + result = NPUNativeFunctions::empty_with_format(output_size, tensor1.options(), ACL_FORMAT_FRACTAL_NZ); } else { - result = at::empty_with_format(output_size, tensor1.options(), ACL_FORMAT_ND); + result = NPUNativeFunctions::empty_with_format(output_size, tensor1.options(), ACL_FORMAT_ND); } at::Tensor contiguous_self = tensor1; diff --git a/torch_npu/csrc/aten/ops/IndexKernelNpu.cpp b/torch_npu/csrc/aten/ops/IndexKernelNpu.cpp index 
40bf84de52..1aa6ffb9dc 100644 --- a/torch_npu/csrc/aten/ops/IndexKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/IndexKernelNpu.cpp @@ -42,7 +42,7 @@ at::Tensor NPUNativeFunctions::index(const at::Tensor& self, const torch::List NPUNativeFunctions::where(const at::Tensor& condition) { at::Tensor formatCastOfCondition = condition; if (condition.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_ != ACL_FORMAT_ND) { - formatCastOfCondition = formatCastOfCondition.npu_format_cast(ACL_FORMAT_ND); + formatCastOfCondition = NPUNativeFunctions::npu_format_cast(formatCastOfCondition, ACL_FORMAT_ND); } if (condition.scalar_type() == at::ScalarType::Half) { formatCastOfCondition = NPUNativeFunctions::npu_dtype_cast(formatCastOfCondition, at::ScalarType::Float); -- Gitee From e39ce7f8716f58939a86851d2a1af9efd84c0b67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=A4=E5=AE=89=E5=8D=87?= Date: Mon, 14 Feb 2022 12:09:30 +0800 Subject: [PATCH 10/12] Replace with ApplyTensor. --- torch_npu/csrc/aten/ops/BitwiseAndKernelNpu.cpp | 4 ++-- torch_npu/csrc/aten/ops/BmmV2KernelNpu.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/torch_npu/csrc/aten/ops/BitwiseAndKernelNpu.cpp b/torch_npu/csrc/aten/ops/BitwiseAndKernelNpu.cpp index fa8b070662..9e42f4f0ef 100644 --- a/torch_npu/csrc/aten/ops/BitwiseAndKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/BitwiseAndKernelNpu.cpp @@ -112,7 +112,7 @@ at::Tensor NPUNativeFunctions::bitwise_and(const at::Tensor& self, const at::Ten auto outputSize = broadcast_ops_npu_output_size(self, other); // construct the output at::Tensor of the NPU - at::Tensor result = NPUNativeFunctions::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, ref_tensor.options(), CalcuOpUtil::get_tensor_npu_format(ref_tensor)); @@ -128,7 +128,7 @@ at::Tensor NPUNativeFunctions::bitwise_and(const at::Tensor& self, at::Scalar ot auto outputSize = input_same_output_size(self); // construct the output at::Tensor of the NPU 
- at::Tensor result = NPUNativeFunctions::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/BmmV2KernelNpu.cpp b/torch_npu/csrc/aten/ops/BmmV2KernelNpu.cpp index ea5bf340cc..f928e97738 100644 --- a/torch_npu/csrc/aten/ops/BmmV2KernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/BmmV2KernelNpu.cpp @@ -83,9 +83,9 @@ at::Tensor pure_bmm_v2_npu(const at::Tensor& self, const at::Tensor& mat2, const at::Tensor result; if ((tensor1.scalar_type() == at::ScalarType::Half)) { - result = NPUNativeFunctions::empty_with_format(output_size, tensor1.options(), ACL_FORMAT_FRACTAL_NZ); + result = OpPreparation::ApplyTensorWithFormat(output_size, tensor1.options(), ACL_FORMAT_FRACTAL_NZ); } else { - result = NPUNativeFunctions::empty_with_format(output_size, tensor1.options(), ACL_FORMAT_ND); + result = OpPreparation::ApplyTensorWithFormat(output_size, tensor1.options(), ACL_FORMAT_ND); } at::Tensor contiguous_self = tensor1; -- Gitee From 083ffb9385ed7795724f1370db7fffb000d9fb68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=A4=E5=AE=89=E5=8D=87?= Date: Mon, 14 Feb 2022 12:47:36 +0800 Subject: [PATCH 11/12] Fix codecheck. 
--- torch_npu/__init__.py | 2 +- .../csrc/aten/common/TensorFactories.cpp | 2 +- torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp | 4 ++-- .../csrc/aten/ops/ZerosLikeKernelNpu.cpp | 4 ++-- torch_npu/csrc/distributed/reducer.cpp | 4 ++-- .../framework/contiguous/combined_opt.cpp | 2 +- .../csrc/framework/utils/OpPreparation.cpp | 8 +++---- torch_npu/utils/module.py | 23 ++++++++----------- 8 files changed, 23 insertions(+), 26 deletions(-) diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py index 57b6be74d6..de976de730 100644 --- a/torch_npu/__init__.py +++ b/torch_npu/__init__.py @@ -60,7 +60,7 @@ def _apply_patches(monkey_patches): else: empty_module_name = f'{root_module.__name__}.{module_list[0]}' sys.modules[empty_module_name] = types.ModuleType(empty_module_name) - setattr(root_module, module_list[0], sys.modules[empty_module_name]) + setattr(root_module, module_list[0], sys.modules.get(empty_module_name)) return _getattr(module_list[1:], getattr(root_module, module_list[0])) for patch_pair in monkey_patches: diff --git a/torch_npu/csrc/aten/common/TensorFactories.cpp b/torch_npu/csrc/aten/common/TensorFactories.cpp index ac32cb49bb..c2a9ae6eab 100644 --- a/torch_npu/csrc/aten/common/TensorFactories.cpp +++ b/torch_npu/csrc/aten/common/TensorFactories.cpp @@ -628,7 +628,7 @@ namespace at_npu AT_ASSERT(result.is_contiguous()); AT_DISPATCH_ALL_TYPES_AND_COMPLEX(result.scalar_type(), "tensor_npu", [&] { std::copy( - values.begin(), values.end(), result.template data_ptr()); }); + values.begin(), values.end(), result.template data_ptr()); }); return result; } diff --git a/torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp b/torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp index 8343864e43..4e65798b22 100644 --- a/torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp @@ -46,8 +46,8 @@ namespace at_npu auto outputSize = input_same_output_size(self); // construct the output tensor of the NPU at::Tensor result = 
NPUNativeFunctions::empty_with_format( - outputSize, dtype_opt, layout_opt, device_opt, pin_memory_opt, - CalcuOpUtil::get_tensor_npu_format(self)); + outputSize, dtype_opt, layout_opt, device_opt, pin_memory_opt, + CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPUc return NPUNativeFunctions::one_(result); } diff --git a/torch_npu/csrc/aten/ops/ZerosLikeKernelNpu.cpp b/torch_npu/csrc/aten/ops/ZerosLikeKernelNpu.cpp index 2f4775751d..8147622a01 100644 --- a/torch_npu/csrc/aten/ops/ZerosLikeKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/ZerosLikeKernelNpu.cpp @@ -57,8 +57,8 @@ namespace at_npu auto outputSize = input_same_output_size(self); // construct the output tensor of the NPU at::Tensor result = NPUNativeFunctions::empty_with_format( - outputSize, dtype_opt, layout_opt, device_opt, pin_memory_opt, - CalcuOpUtil::get_tensor_npu_format(self)); + outputSize, dtype_opt, layout_opt, device_opt, pin_memory_opt, + CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU return result.zero_(); diff --git a/torch_npu/csrc/distributed/reducer.cpp b/torch_npu/csrc/distributed/reducer.cpp index 4f05619d18..81f7f04968 100644 --- a/torch_npu/csrc/distributed/reducer.cpp +++ b/torch_npu/csrc/distributed/reducer.cpp @@ -1075,8 +1075,8 @@ void Reducer::copy_bucket_to_grad( // Creates grad according to the "Gradient Layout Contract" // (see torch/csrc/grad/AccumulateGrad.h) grad = at_npu::native::OpPreparation::ApplyTensorWithFormat( - variable.sizes(), bucket_view.options(), - variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_); + variable.sizes(), bucket_view.options(), + variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_); at_npu::native::NPUNativeFunctions::copy_memory_(grad, bucket_view, true); } else { at_npu::native::NPUNativeFunctions::copy_memory_(grad, bucket_view, true); diff --git a/torch_npu/csrc/framework/contiguous/combined_opt.cpp 
b/torch_npu/csrc/framework/contiguous/combined_opt.cpp index 68c92e048b..1ce04a827f 100644 --- a/torch_npu/csrc/framework/contiguous/combined_opt.cpp +++ b/torch_npu/csrc/framework/contiguous/combined_opt.cpp @@ -389,7 +389,7 @@ namespace at_npu // baseInfo = inferred info(infer_size, infer_stride, infer_offset) // If the first inferred tensor can be optimized, store its info. if (can_infer_view_tensor( - src, temp_src, infer_size, infer_stride, infer_offset) && + src, temp_src, infer_size, infer_stride, infer_offset) && emplace_info( temp_src, view_infos, view_offsets, infer_offset, max_len)) { diff --git a/torch_npu/csrc/framework/utils/OpPreparation.cpp b/torch_npu/csrc/framework/utils/OpPreparation.cpp index e0ade98c71..3726a9765a 100644 --- a/torch_npu/csrc/framework/utils/OpPreparation.cpp +++ b/torch_npu/csrc/framework/utils/OpPreparation.cpp @@ -237,16 +237,16 @@ namespace at_npu { auto fixFormat = InferFormat::GuessStorageFormat(sizes, (aclFormat)format); return NPUNativeFunctions::empty_with_format( - sizes, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), - options.device_opt(), options.pinned_memory_opt(), fixFormat); + sizes, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), + options.device_opt(), options.pinned_memory_opt(), fixFormat); } at::Tensor OpPreparation::ApplyTensorWithSizes(c10::IntArrayRef sizes, const c10::TensorOptions &options) { auto format = InferFormat::GuessBaseFormat(sizes); return NPUNativeFunctions::empty_with_format( - sizes, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), - options.device_opt(), options.pinned_memory_opt(), format); + sizes, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), + options.device_opt(), options.pinned_memory_opt(), format); } void OpPreparation::CheckMemory(const std::initializer_list &inputs, const std::initializer_list &outputs) diff --git a/torch_npu/utils/module.py b/torch_npu/utils/module.py index 5620d851c2..5de085390e 
# ACL storage-format identifiers accepted by ``torch_npu.npu_format_cast``.
_ACL_FORMAT_NC1HWC0 = 3
_ACL_FORMAT_FRACTAL_Z = 4
_ACL_FORMAT_FRACTAL_NZ = 29
_ACL_FRACTAL_Z_3D = 33

_MHA_PROJECTION_NAMES = ("q_proj_weight", "k_proj_weight", "v_proj_weight")


def _move_and_format(tensor, device, acl_format):
    """Move ``tensor.data`` to ``device`` and cast it to ``acl_format`` in place."""
    tensor.data = tensor.data.to(device)
    tensor.data = torch_npu.npu_format_cast(tensor.data, acl_format)


def _format_cast(module, device):
    """Cast the tensors owned directly by ``module`` to their NPU formats.

    Only the module itself is handled here; recursion over sub-modules is
    done by ``cast_weight``.
    """
    module_class = module.__class__

    if issubclass(module_class, torch.nn.Linear):
        _move_and_format(module.weight, device, _ACL_FORMAT_FRACTAL_NZ)

    if issubclass(module_class, (torch.nn.BatchNorm2d, torch.nn.BatchNorm1d)):
        if module.affine:
            _move_and_format(module.weight, device, _ACL_FORMAT_NC1HWC0)
            _move_and_format(module.bias, device, _ACL_FORMAT_NC1HWC0)
        # Running statistics are None when the layer does not track them,
        # so each one is cast only when it actually exists.
        # NOTE(review): assumes the statistics are cast independently of
        # `affine` -- confirm against the intended behaviour.
        for running_stat in (module.running_mean, module.running_var):
            if running_stat is not None:
                _move_and_format(running_stat, device, _ACL_FORMAT_NC1HWC0)

    if issubclass(module_class, torch.nn.Conv2d):
        # Convolutions with one group per input channel (and an output
        # channel count that is a multiple of it) keep their current format.
        keeps_format = (
            module.in_channels == module.groups
            and module.groups > 1
            and module.weight.size(0) % module.in_channels == 0
        )
        if keeps_format:
            return
        _move_and_format(module.weight, device, _ACL_FORMAT_FRACTAL_Z)

    if issubclass(module_class, torch.nn.Conv3d):
        module.weight.data = module.weight.data.to(device)
        # The cast is performed on a half-precision copy and converted back
        # to float afterwards (presumably a limitation of the 3D format
        # cast -- TODO confirm).
        module.weight.data = torch_npu.npu_format_cast(
            module.weight.data.half(), _ACL_FRACTAL_Z_3D).float()

    if "MultiheadAttention" in str(module_class):
        projections = [getattr(module, name, None) for name in _MHA_PROJECTION_NAMES]
        # Compare against None explicitly: taking the truth value of a
        # multi-element tensor raises RuntimeError.
        if all(projection is not None for projection in projections):
            for projection in projections:
                _move_and_format(projection, device, _ACL_FORMAT_FRACTAL_NZ)


def cast_weight(self, device):
    """Recursively cast the weights of ``self`` to NPU storage formats.

    Args:
        self: the ``torch.nn.Module`` whose tensors are cast; its children
            are processed through their own ``cast_weight`` method.
        device: target device. Nothing happens when it is ``None`` or when
            its string form does not contain ``"npu"``.

    Returns:
        None. Tensors are updated in place through their ``.data``.
    """
    if device is None or "npu" not in str(device):
        return

    _format_cast(self, device)

    # ``children()`` always returns an iterator (possibly empty), so no
    # emptiness or None check is needed before looping.
    for sub_module in self.children():
        if isinstance(sub_module, torch.nn.Module):
            sub_module.cast_weight(device)