From 65817da2c10d71237deac5a158d6b8abcdfaf427 Mon Sep 17 00:00:00 2001
From: "zhousinan@huawei.com"
Date: Tue, 25 Jan 2022 16:55:48 +0800
Subject: [PATCH 1/6] add torch_npu/csrc/aten/common/CopyKernelNpu.cpp

---
 torch_npu/csrc/aten/common/CopyKernelNpu.cpp  |  8 +-
 .../csrc/aten/common/CopyMemoryKernel.cpp     | 12 +--
 .../csrc/aten/common/FormatCastHelper.cpp     | 13 +--
 .../csrc/aten/common/FormatCastKernelNpu.cpp  | 19 ++---
 .../csrc/aten/common/LocalScalarDenseNpu.cpp  |  5 +-
 torch_npu/csrc/aten/common/ResizeNpu.h        |  4 +-
 .../csrc/aten/common/TensorFactories.cpp      |  7 +-
 torch_npu/csrc/aten/ops/AddKernelNpu.cpp      |  5 +-
 torch_npu/csrc/aten/ops/MmKernelNpu.cpp       | 13 +--
 .../aten/ops/ThresholdBackwardKernelNpu.cpp   |  4 +-
 torch_npu/csrc/core/tensor_impl.cpp           | 12 +++
 torch_npu/csrc/core/tensor_impl.h             |  6 ++
 torch_npu/csrc/distributed/Init.cpp           |  3 +-
 .../csrc/distributed/ProcessGroupHCCL.cpp     |  3 +-
 torch_npu/csrc/distributed/reducer.cpp        | 12 +--
 torch_npu/csrc/framework/FormatHelper.cpp     | 10 +--
 torch_npu/csrc/framework/FormatHelper.h       |  4 +-
 torch_npu/csrc/framework/InferFormat.cpp      | 10 +--
 torch_npu/csrc/framework/OpCmdHelper.cpp      | 20 ++---
 torch_npu/csrc/framework/OpParamMaker.h       |  4 +-
 .../csrc/framework/StorageDescHelper.cpp      | 80 ++++++++++++-------
 torch_npu/csrc/framework/StorageDescHelper.h  | 16 ++--
 .../framework/contiguous/ContiguousOpt.cpp    |  6 +-
 .../csrc/framework/contiguous/ReshapeOpt.cpp  | 10 +--
 .../framework/contiguous/combined_opt.cpp     | 28 +++----
 .../framework/contiguous/indexing_opt.cpp     | 15 ++--
 .../csrc/framework/contiguous/permute_opt.cpp | 14 ++--
 .../framework/contiguous/reshapeV2_opt.cpp    |  6 +-
 .../csrc/framework/contiguous/reshape_opt.cpp |  2 +-
 .../csrc/framework/contiguous/select_opt.cpp  |  7 +-
 .../csrc/framework/contiguous/slice_opt.cpp   |  8 +-
 .../csrc/framework/contiguous/unfold_opt.cpp  |  7 +-
 32 files changed, 211 insertions(+), 162 deletions(-)

diff --git a/torch_npu/csrc/aten/common/CopyKernelNpu.cpp b/torch_npu/csrc/aten/common/CopyKernelNpu.cpp
index a283c31d63..3525be440a 100644
--- a/torch_npu/csrc/aten/common/CopyKernelNpu.cpp
+++ b/torch_npu/csrc/aten/common/CopyKernelNpu.cpp
@@ -68,8 +68,8 @@ void copy_kernel_npu(
   at::Tensor attrTensor = CalcuOpUtil::copy_tensor_host_to_device(
       at::from_blob(value.data(), {value.size()}, dtype(at::ScalarType::Long)));
 
-  auto src_desc_bp = src.storage().get_npu_desc();
-  auto self_desc_bp = self.storage().get_npu_desc();
+  auto* src_desc_bp = torch_npu::NPUTensorImpl::GetStorageInfo(src);
+  auto* self_desc_bp = torch_npu::NPUTensorImpl::GetStorageInfo(self);
 
   // The action of PTcopy_ is defined by attrTensor, so the member of NPUStorageDesc
   // can not affect the result, but the PTcopy_ will check base_size and storage_size,
@@ -83,8 +83,8 @@ void copy_kernel_npu(
 
   CalcuOpUtil::execute_npu_operate("PTcopy_", inputs, outputs, {});
 
-  StorageDescHelper::CopyDesc(src, src_desc_bp);
-  StorageDescHelper::CopyDesc(self, self_desc_bp);
+  StorageDescHelper::CopyDesc(src, *src_desc_bp);
+  StorageDescHelper::CopyDesc(self, *self_desc_bp);
 }
 
 // the dst and src are same dtype
diff --git a/torch_npu/csrc/aten/common/CopyMemoryKernel.cpp b/torch_npu/csrc/aten/common/CopyMemoryKernel.cpp
index d39c5032e0..933701445c 100644
--- a/torch_npu/csrc/aten/common/CopyMemoryKernel.cpp
+++ b/torch_npu/csrc/aten/common/CopyMemoryKernel.cpp
@@ -38,27 +38,27 @@ at::Tensor& NPUNativeFunctions::copy_memory_(at::Tensor& self, const at::Tensor&
   AT_ASSERT(
       src.device().index() == self.device().index(),
       "input tensors of copy_memory_ should have same device index");
-  auto dst_desc = self.storage().unsafeGetStorageImpl()->npu_desc_;
-  auto src_desc = src.storage().unsafeGetStorageImpl()->npu_desc_;
+  auto* dst_desc = torch_npu::NPUTensorImpl::GetStorageInfo(self);
+  auto* src_desc = torch_npu::NPUTensorImpl::GetStorageInfo(src);
 
   int dst_size = 0;
   int src_size = 0;
 
   if (FormatHelper::IsPadded(&self)) {
     AT_ASSERT(self.storage_offset() == 0);
-    dst_size = at::prod_intlist(dst_desc.storage_sizes_);
+    dst_size = at::prod_intlist(dst_desc->storage_sizes_);
   } else {
     auto dst_element = at::prod_intlist(self.sizes());
-    auto dst_storage = at::prod_intlist(dst_desc.storage_sizes_);
+    auto dst_storage = at::prod_intlist(dst_desc->storage_sizes_);
     dst_size = (dst_element > dst_storage) ? dst_storage : dst_element;
   }
 
   if (FormatHelper::IsPadded(&src)) {
     AT_ASSERT(src.storage_offset() == 0);
-    src_size = at::prod_intlist(src_desc.storage_sizes_);
+    src_size = at::prod_intlist(src_desc->storage_sizes_);
   } else {
     auto src_element = at::prod_intlist(src.sizes());
-    auto src_storage = at::prod_intlist(src_desc.storage_sizes_);
+    auto src_storage = at::prod_intlist(src_desc->storage_sizes_);
     src_size = (src_element > src_storage) ? src_storage : src_element;
   }
 
diff --git a/torch_npu/csrc/aten/common/FormatCastHelper.cpp b/torch_npu/csrc/aten/common/FormatCastHelper.cpp
index aa98978e8a..52e180146f 100644
--- a/torch_npu/csrc/aten/common/FormatCastHelper.cpp
+++ b/torch_npu/csrc/aten/common/FormatCastHelper.cpp
@@ -13,16 +13,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "torch_npu/csrc/framework/FormatHelper.h"
 #include "torch_npu/csrc/aten/common/FormatCastHelper.h"
+#include "torch_npu/csrc/framework/FormatHelper.h"
 #include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+#include "torch_npu/csrc/core/tensor_impl.h"
 
 namespace at_npu {
 namespace native {
 
 bool FormatCastHelper::IsSameGroupType(const at::Tensor& src, const at::Tensor& dst) {
-  auto src_format = src.storage().get_npu_desc().npu_format_;
-  auto dst_format = dst.storage().get_npu_desc().npu_format_;
+  auto src_format = torch_npu::NPUTensorImpl::GetStorageInfo(src)->npu_format_;
+  auto dst_format = torch_npu::NPUTensorImpl::GetStorageInfo(dst)->npu_format_;
   return FormatHelper::GetBaseFormat(src_format) == FormatHelper::GetBaseFormat(dst_format);
 }
 
@@ -35,12 +36,12 @@ void FormatCastHelper::format_cast_as_base_format(const at::Tensor& src, aclForm
   AT_ASSERT(FormatHelper::IsBaseFormatType(format), "dst format must be base format");
   AT_ASSERT(FormatHelper::IsBaseFormatType(src), "src format must be base format");
 
-  auto& src_desc = src.storage().unsafeGetStorageImpl()->npu_desc_;
+  auto* src_desc = torch_npu::NPUTensorImpl::GetStorageInfo(src);
   // due to CANN principle : if the ori format of a tensor is the
   // same as the npu format, then its base shape must be same as storage shape
   // so we should not change the storage shape when format cast between base format
-  src_desc.origin_format_ = format;
-  src_desc.npu_format_ = format;
+  src_desc->origin_format_ = format;
+  src_desc->npu_format_ = format;
   return;
 }
 
diff --git a/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp b/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp
index 2139cfdb4a..2be6a058e4 100644
--- a/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp
+++ b/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp
@@ -45,9 +45,9 @@ at::Tensor format_cast_impl_out_npu(at::Tensor& dst, const at::Tensor& src) {
 
 // convert src from src_format to dst_format, write the result into dst
 at::Tensor& NPUNativeFunctions::npu_format_cast_(at::Tensor& dst, const at::Tensor& src) {
-  c10::NPUStorageDesc src_desc = src.storage().unsafeGetStorageImpl()->npu_desc_;
-  c10::NPUStorageDesc dst_desc = dst.storage().unsafeGetStorageImpl()->npu_desc_;
-  if (src_desc.npu_format_ == dst_desc.npu_format_) {
+  auto* src_desc = torch_npu::NPUTensorImpl::GetStorageInfo(src);
+  auto* dst_desc = torch_npu::NPUTensorImpl::GetStorageInfo(dst);
+  if (src_desc->npu_format_ == dst_desc->npu_format_) {
     dst.copy_(src);
     return dst;
   }
@@ -62,8 +62,9 @@ at::Tensor& NPUNativeFunctions::npu_format_cast_(at::Tensor& dst, const at::Tens
 at::Tensor NPUNativeFunctions::npu_format_cast(
     const at::Tensor& src,
     int64_t acl_format) {
-  c10::NPUStorageDesc src_desc = src.storage().unsafeGetStorageImpl()->npu_desc_;
-  if (src_desc.npu_format_ == acl_format) {
+
+  auto* src_desc = torch_npu::NPUTensorImpl::GetStorageInfo(src);
+  if (src_desc->npu_format_ == acl_format) {
     NPU_LOGD("no need to do format cast");
     return src;
   }
@@ -77,7 +78,7 @@ at::Tensor NPUNativeFunctions::npu_format_cast(
       "can not cast format when src is not float32 or float16");
 
   at::Tensor dst = at::empty_with_format(
-      src_desc.base_sizes_, src.options(), acl_format);
+      src_desc->base_sizes_, src.options(), acl_format);
 
   // calculate the output result of the NPU
   format_cast_impl_out_npu(dst, src);
@@ -92,8 +93,8 @@ at::Tensor NPUNativeFunctions::npu_format_cast(
 at::Tensor& NPUNativeFunctions::npu_format_cast_(
     at::Tensor& src,
     int64_t acl_format) {
-  c10::NPUStorageDesc src_desc = src.storage().unsafeGetStorageImpl()->npu_desc_;
-  if (src_desc.npu_format_ == acl_format) {
+  auto* src_desc = torch_npu::NPUTensorImpl::GetStorageInfo(src);
+  if (src_desc->npu_format_ == acl_format) {
     return src;
   }
   if (FormatHelper::IsBaseFormatType(src) &&
@@ -106,7 +107,7 @@ at::Tensor& NPUNativeFunctions::npu_format_cast_(
       "can not cast format when src is not float32 or float16");
 
   at::Tensor dst = at::empty_with_format(
-      src_desc.base_sizes_, src.options(), acl_format);
+      src_desc->base_sizes_, src.options(), acl_format);
 
   // calculate the output result of the NPU
   format_cast_impl_out_npu(dst, src);
diff --git a/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp b/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp
index 8d006e1055..91c3ea0c16 100644
--- a/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp
+++ b/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp
@@ -17,9 +17,8 @@
 #include
 #include
 #include
-
-#include "third_party/acl/inc/acl/acl_base.h"
-#include "third_party/acl/inc/acl/acl_rt.h"
+#include
+#include
 #include "torch_npu/csrc/aten/NPUNativeFunctions.h"
 
 namespace at_npu {
diff --git a/torch_npu/csrc/aten/common/ResizeNpu.h b/torch_npu/csrc/aten/common/ResizeNpu.h
index 2f24a77e7b..2b8fed8e42 100644
--- a/torch_npu/csrc/aten/common/ResizeNpu.h
+++ b/torch_npu/csrc/aten/common/ResizeNpu.h
@@ -42,7 +42,9 @@ static void storage_resize_npu(
   at::DataPtr old_data = storage.set_data_ptr(std::move(new_data));
   ptrdiff_t old_size = storage.nbytes();
   storage.set_nbytes(size);
-  StorageDescHelper::UpdateDesc(storage.npu_desc_, new_size);
+
+  auto* src_desc = torch_npu::NPUTensorImpl::GetStorageInfo(storage);
+  StorageDescHelper::UpdateDesc(*src_desc, new_size);
 
   if (old_data != nullptr) {
     ptrdiff_t copy_size = old_size;
diff --git a/torch_npu/csrc/aten/common/TensorFactories.cpp b/torch_npu/csrc/aten/common/TensorFactories.cpp
index 3d95b195a1..2960e528f6 100644
--- a/torch_npu/csrc/aten/common/TensorFactories.cpp
+++ b/torch_npu/csrc/aten/common/TensorFactories.cpp
@@ -228,8 +228,7 @@ namespace at_npu
       }
       else
       {
-        auto npu_format =
-            self.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_;
+        auto npu_format = torch_npu::NPUTensorImpl::GetStorageInfo(self)->npu_format_;
         result = at::empty_with_format(self.sizes(), self.options(), npu_format);
       }
     }
@@ -656,9 +655,9 @@ namespace at_npu
 
    at::Tensor NPUNativeFunctions::clone(const at::Tensor &src, c10::optional format)
    {
-      auto desc = src.storage().unsafeGetStorageImpl()->npu_desc_;
+      auto* desc = torch_npu::NPUTensorImpl::GetStorageInfo(src);
      auto formatSelf = OpPreparation::ApplyTensorWithFormat(
-          src.sizes(), src.options(), desc.npu_format_);
+          src.sizes(), src.options(), desc->npu_format_);
      if (try_to_optimize_copy_with_any_format(formatSelf, src))
      {
        return formatSelf;
diff --git a/torch_npu/csrc/aten/ops/AddKernelNpu.cpp b/torch_npu/csrc/aten/ops/AddKernelNpu.cpp
index 363599e38a..cee30addb9 100644
--- a/torch_npu/csrc/aten/ops/AddKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/AddKernelNpu.cpp
@@ -21,6 +21,7 @@
 #include "torch_npu/csrc/framework/utils/CalcuOpUtil.h"
 #include "torch_npu/csrc/framework/utils/OpAdapter.h"
 #include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+#include "torch_npu/csrc/core/tensor_impl.h"
 
 namespace at_npu
 {
@@ -169,9 +170,9 @@ namespace at_npu
      }
      else
      {
-        c10::NPUStorageDesc src_desc = src.storage().unsafeGetStorageImpl()->npu_desc_;
+        auto* src_desc = torch_npu::NPUTensorImpl::GetStorageInfo(src);
        at::Tensor src_new = at::empty_with_format(
-            src_desc.base_sizes_, src.options(), ACL_FORMAT_NC1HWC0);
+            src_desc->base_sizes_, src.options(), ACL_FORMAT_NC1HWC0);
        src_new.set_(
            src.storage(),
            src_new.storage_offset(),
diff --git a/torch_npu/csrc/aten/ops/MmKernelNpu.cpp b/torch_npu/csrc/aten/ops/MmKernelNpu.cpp
index e836bee17c..944bdea0f7 100644
--- a/torch_npu/csrc/aten/ops/MmKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/MmKernelNpu.cpp
@@ -20,6 +20,7 @@
 #include "torch_npu/csrc/framework/utils/OpAdapter.h"
 #include "torch_npu/csrc/framework/StorageDescHelper.h"
 #include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+#include "torch_npu/csrc/core/tensor_impl.h"
 
 namespace at_npu
 {
@@ -45,7 +46,7 @@ Return:
      return false;
    }
    int64_t numel = 1;
-    auto storageSize = tensor.storage().get_npu_desc().storage_sizes_;
+    auto storageSize = torch_npu::NPUTensorImpl::GetStorageInfo(tensor)->storage_sizes_;
 
    for (int i = 0; i < storageSize.size(); i++)
    {
@@ -71,7 +72,7 @@ Return:
      const at::Tensor &tensor,
      bool is_transpose_flex)
  {
-    auto base_sizes = tensor.storage().get_npu_desc().base_sizes_;
+    auto base_sizes = torch_npu::NPUTensorImpl::GetStorageInfo(tensor)->base_sizes_;
    if (is_transpose_flex && base_sizes.size() == tensor.dim() &&
        tensor.size(-1) == base_sizes[tensor.dim() - 2] &&
        tensor.size(-2) == base_sizes[tensor.dim() - 1])
@@ -95,8 +96,8 @@ Return:
  {
    at::Tensor contiguousResult = result.is_contiguous() ? result : result.contiguous();
 
-    c10::NPUStorageDesc self_desc = self.storage().get_npu_desc();
-    c10::NPUStorageDesc mat2_desc = mat2.storage().get_npu_desc();
+    auto* self_desc = torch_npu::NPUTensorImpl::GetStorageInfo(self);
+    auto* mat2_desc = torch_npu::NPUTensorImpl::GetStorageInfo(mat2);
    bool isSelfT_flex = is_transpose_last_two_dims_flex(self);
    bool isMat2T_flex = is_transpose_last_two_dims_flex(mat2);
    bool isSelfT_strict = is_transpose_last_two_dims_strict(self, isSelfT_flex);
@@ -159,11 +160,11 @@ Return:
    // set_transposed_npu_desc
    if (isSelfT_flex && (!isSelfT_strict))
    {
-      self.storage().unsafeGetStorageImpl()->npu_desc_ = self_desc;
+      *torch_npu::NPUTensorImpl::GetStorageInfo(self) = *self_desc;
    }
    if (isMat2T_flex && (!isMat2T_strict))
    {
-      mat2.storage().unsafeGetStorageImpl()->npu_desc_ = mat2_desc;
+      *torch_npu::NPUTensorImpl::GetStorageInfo(mat2) = *mat2_desc;
    }
 
    if (!result.is_contiguous())
diff --git a/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp
index 537bcc2444..900c664fc5 100644
--- a/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp
@@ -66,9 +66,9 @@ namespace at_npu
        outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
 
    // use 5HD in Relu
-    if ((grad_output.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_ ==
+    if ((torch_npu::NPUTensorImpl::GetStorageInfo(grad_output)->npu_format_ ==
         ACL_FORMAT_NCHW) &&
-        (self.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_ ==
+        (torch_npu::NPUTensorImpl::GetStorageInfo(self)->npu_format_ ==
         ACL_FORMAT_NC1HWC0))
    {
      at::Tensor grad_output_5HD =
diff --git a/torch_npu/csrc/core/tensor_impl.cpp b/torch_npu/csrc/core/tensor_impl.cpp
index f9cfa14ff1..d7f670534b 100644
--- a/torch_npu/csrc/core/tensor_impl.cpp
+++ b/torch_npu/csrc/core/tensor_impl.cpp
@@ -83,4 +83,16 @@ namespace torch_npu
    return impl;
  }
 
+c10::npu::NPUCachingAllocator::NPUStorageInfo* NPUTensorImpl::GetStorageInfo(const at::Tensor& src) {
+  return c10::npu::NPUCachingAllocator::getStorageInfo(src.storage().unsafeGetStorageImpl()->data());
+}
+
+c10::npu::NPUCachingAllocator::NPUStorageInfo* NPUTensorImpl::GetStorageInfo(at::Tensor& src) {
+  return c10::npu::NPUCachingAllocator::getStorageInfo(src.storage().unsafeGetStorageImpl()->data());
+}
+
+c10::npu::NPUCachingAllocator::NPUStorageInfo* NPUTensorImpl::GetStorageInfo(at::StorageImpl& src) {
+  return c10::npu::NPUCachingAllocator::getStorageInfo(src.data());
+}
+
 }
diff --git a/torch_npu/csrc/core/tensor_impl.h b/torch_npu/csrc/core/tensor_impl.h
index f37dcabf6e..10c9e6f0e1 100644
--- a/torch_npu/csrc/core/tensor_impl.h
+++ b/torch_npu/csrc/core/tensor_impl.h
@@ -19,6 +19,7 @@
 
 #include
 #include
+#include "c10/npu/NPUCachingAllocator.h"
 
 namespace torch_npu
 {
@@ -45,6 +46,11 @@ public:
      c10::VariableVersion&& version_counter,
      bool allow_tensor_metadata_change) const final;
 
+
+  static c10::npu::NPUCachingAllocator::NPUStorageInfo* GetStorageInfo(at::Tensor& src);
+  static c10::npu::NPUCachingAllocator::NPUStorageInfo* GetStorageInfo(const at::Tensor& src);
+  static c10::npu::NPUCachingAllocator::NPUStorageInfo* GetStorageInfo(at::StorageImpl& src);
+
 public:
  NPUTensorImpl(const NPUTensorImpl&) = delete;
  NPUTensorImpl& operator=(const NPUTensorImpl&) = delete;
diff --git a/torch_npu/csrc/distributed/Init.cpp b/torch_npu/csrc/distributed/Init.cpp
index 951ba59607..e0d6521bd6 100644
--- a/torch_npu/csrc/distributed/Init.cpp
+++ b/torch_npu/csrc/distributed/Init.cpp
@@ -33,6 +33,7 @@
 #include "torch_npu/csrc/distributed/ProcessGroupHCCL.hpp"
 #include "torch_npu/csrc/distributed/Init.h"
 #include "torch_npu/csrc/distributed/reducer.hpp"
+#include "torch_npu/csrc/core/tensor_impl.h"
 
 namespace torch_npu
 {
@@ -48,7 +49,7 @@ class BroadcastWork {
 public:
  inline std::vector cast_tensors(at::TensorList tensors) {
    static auto cast_back_to_ori_format = [](const at::Tensor &t) {
-      return t.npu_format_cast(t.storage().unsafeGetStorageImpl()->npu_desc_.origin_format_);
+      return t.npu_format_cast(torch_npu::NPUTensorImpl::GetStorageInfo(t)->origin_format_);
    };
    return c10::fmap(tensors, cast_back_to_ori_format);
  }
diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
index 77c68f31ff..f508b7102c 100644
--- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
+++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
@@ -27,6 +27,7 @@
 #include "torch_npu/csrc/distributed/ProcessGroupHCCL.hpp"
 #include "third_party/acl/inc/acl/acl.h"
 #include "third_party/acl/inc/acl/acl_base.h"
+#include "torch_npu/csrc/core/tensor_impl.h"
 
 namespace c10d_npu {
 namespace {
@@ -54,7 +55,7 @@ std::map hcclDataType = {
 };
 
 int64_t physical_numel(at::Tensor self){
-  auto sizes = self.storage().unsafeGetStorageImpl()->npu_desc_.storage_sizes_;
+  auto sizes = torch_npu::NPUTensorImpl::GetStorageInfo(self)->storage_sizes_;
  int64_t n = 1;
  for (auto s : sizes) {
    n *= s;
diff --git a/torch_npu/csrc/distributed/reducer.cpp b/torch_npu/csrc/distributed/reducer.cpp
index b31bd008d0..7cc571d677 100644
--- a/torch_npu/csrc/distributed/reducer.cpp
+++ b/torch_npu/csrc/distributed/reducer.cpp
@@ -29,7 +29,7 @@
 #include
 #include
 #include
-
+#include "torch_npu/csrc/core/tensor_impl.h"
 #include "torch_npu/csrc/distributed/reducer.hpp"
 
 namespace c10d_npu {
@@ -37,7 +37,7 @@ namespace {
 
 int64_t physical_numel(at::Tensor self){
-  auto sizes = self.storage().unsafeGetStorageImpl()->npu_desc_.storage_sizes_;
+  auto sizes = torch_npu::NPUTensorImpl::GetStorageInfo(self)->storage_sizes_;
  int64_t n = 1;
  for (auto s : sizes) {
    n *= s;
@@ -439,10 +439,10 @@ void Reducer::mark_variable_ready_dense(VariableIndex index) {
      // previous iterations, no copy is needed.
      if (!grad.is_alias_of(bucket_view)) {
        // make sure grad has the same format as variable
-        if (grad.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_ !=
-            variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_) {
+        if (torch_npu::NPUTensorImpl::GetStorageInfo(grad)->npu_format_ !=
+            torch_npu::NPUTensorImpl::GetStorageInfo(variable)->npu_format_) {
          grad = grad.npu_format_cast(
-              variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_);
+              torch_npu::NPUTensorImpl::GetStorageInfo(variable)->npu_format_);
        }
        this->copy_grad_to_bucket(grad, bucket_view);
        if (gradient_as_bucket_view_) {
@@ -1074,7 +1074,7 @@ void Reducer::copy_bucket_to_grad(
      // (see torch/csrc/grad/AccumulateGrad.h)
      grad = at::empty_with_format(variable.sizes(),
                                   bucket_view.options(),
-                                   variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_);
+                                   torch_npu::NPUTensorImpl::GetStorageInfo(variable)->npu_format_);
      grad.copy_memory_(bucket_view, true);
    } else {
      grad.copy_memory_(bucket_view, true);
diff --git a/torch_npu/csrc/framework/FormatHelper.cpp b/torch_npu/csrc/framework/FormatHelper.cpp
index 484039ac0d..96abacbddf 100644
--- a/torch_npu/csrc/framework/FormatHelper.cpp
+++ b/torch_npu/csrc/framework/FormatHelper.cpp
@@ -63,7 +63,7 @@ namespace at_npu
 
  bool FormatHelper::IsPadded(const at::Tensor *tensor)
  {
-    auto format = tensor->storage().unsafeGetStorageImpl()->npu_desc_.npu_format_;
+    auto format = torch_npu::NPUTensorImpl::GetStorageInfo(*tensor)->npu_format_;
    return IsPadded(format);
  }
 
@@ -91,7 +91,7 @@ namespace at_npu
 
  char *FormatHelper::GetFormatName(const at::Tensor &tensor)
  {
-    auto format = tensor.storage().get_npu_desc().npu_format_;
+    auto format = torch_npu::NPUTensorImpl::GetStorageInfo(tensor)->npu_format_;
    return GetFormatName(format);
  }
 
@@ -114,7 +114,7 @@ namespace at_npu
 
  aclFormat FormatHelper::GetFormat(const at::Tensor &tensor)
  {
-    return tensor.storage().get_npu_desc().npu_format_;
+    return torch_npu::NPUTensorImpl::GetStorageInfo(tensor)->npu_format_;
  }
 
  bool FormatHelper::IsBaseFormatType(aclFormat format)
@@ -124,11 +124,11 @@ namespace at_npu
 
  bool FormatHelper::IsBaseFormatType(const at::Tensor &tensor)
  {
-    auto format = tensor.storage().get_npu_desc().npu_format_;
+    auto format = torch_npu::NPUTensorImpl::GetStorageInfo(tensor)->npu_format_;
    return IsBaseFormatType(format);
  }
 
-  FormatShape FormatHelper::GetStorageSizes(c10::NPUStorageDesc desc)
+  FormatShape FormatHelper::GetStorageSizes(c10::npu::NPUCachingAllocator::NPUStorageInfo desc)
  {
    auto ori_size = desc.base_sizes_;
    auto format = desc.npu_format_;
diff --git a/torch_npu/csrc/framework/FormatHelper.h b/torch_npu/csrc/framework/FormatHelper.h
index 2070eafe00..3ea056be10 100644
--- a/torch_npu/csrc/framework/FormatHelper.h
+++ b/torch_npu/csrc/framework/FormatHelper.h
@@ -20,6 +20,8 @@
 #include
 
 #include "torch_npu/csrc/framework/utils/NPUDefinition.h"
+#include "torch_npu/csrc/core/tensor_impl.h"
+#include "c10/npu/NPUCachingAllocator.h"
 
 namespace at_npu
 {
@@ -50,7 +52,7 @@ namespace at_npu
    template
    static FormatShape GetStorageSizes(aclFormat format, sizeType ori_size);
    // GetStorageSizes used to calculate the storage sizes of op at npu device at different format.
-    static FormatShape GetStorageSizes(c10::NPUStorageDesc desc);
+    static FormatShape GetStorageSizes(c10::npu::NPUCachingAllocator::NPUStorageInfo desc);
 
  private:
    static bool IsPadded(aclFormat format);
diff --git a/torch_npu/csrc/framework/InferFormat.cpp b/torch_npu/csrc/framework/InferFormat.cpp
index 6ef3c77742..74f3fc6355 100644
--- a/torch_npu/csrc/framework/InferFormat.cpp
+++ b/torch_npu/csrc/framework/InferFormat.cpp
@@ -25,16 +25,16 @@ namespace at_npu
 
  aclFormat InferFormat::GuessFormatWhenContiguous(const at::Tensor &tensor)
  {
-    auto desc = tensor.storage().unsafeGetStorageImpl()->npu_desc_;
+    auto* desc = torch_npu::NPUTensorImpl::GetStorageInfo(tensor);
    // fix: NCDHW -> default format
-    if ((desc.origin_format_ == ACL_FORMAT_NCDHW))
+    if ((desc->origin_format_ == ACL_FORMAT_NCDHW))
    {
-      if ((tensor.sizes().size() != desc.base_sizes_.size()) && (tensor.sizes().size() <= 4))
+      if ((tensor.sizes().size() != desc->base_sizes_.size()) && (tensor.sizes().size() <= 4))
      {
        return ACL_FORMAT_NCHW;
      }
    }
-    return desc.origin_format_;
+    return desc->origin_format_;
  }
 
  // NOTE: this method should cooperate with shape infer.
@@ -111,7 +111,7 @@ namespace at_npu
  FormatShape InferFormat::GuessStorageSizeWhenConvertFormat(const at::Tensor &tensor)
  {
    auto format = FormatHelper::GetFormat(tensor);
-    auto size = tensor.storage().unsafeGetStorageImpl()->npu_desc_.base_sizes_;
+    auto size = torch_npu::NPUTensorImpl::GetStorageInfo(tensor)->base_sizes_;
    // TransData: ND->NZ, ND size < 2, we can expand dimension to 2, the storage have no effect.
    // now, only ND->NZ and NZ->ND will call transdata, so we no need to check other format.
    if ((size.size() < 2) && format == ACL_FORMAT_ND)
diff --git a/torch_npu/csrc/framework/OpCmdHelper.cpp b/torch_npu/csrc/framework/OpCmdHelper.cpp
index 4c926bef26..28e0c9672a 100644
--- a/torch_npu/csrc/framework/OpCmdHelper.cpp
+++ b/torch_npu/csrc/framework/OpCmdHelper.cpp
@@ -33,11 +33,11 @@ namespace at_npu
    at::ScalarType scalarDataType = tensor.scalar_type();
    aclDataType aclDataType =
        CalcuOpUtil::convert_to_acl_data_type(scalarDataType, forceDataType);
-    const auto &npuDesc = tensor.storage().get_npu_desc();
-    auto &storageDims = npuDesc.storage_sizes_;
+    const auto* npuDesc = torch_npu::NPUTensorImpl::GetStorageInfo(tensor);
+    auto &storageDims = npuDesc->storage_sizes_;
    AclTensorDescMaker desc;
-    auto aclDesc = desc.Create(aclDataType, npuDesc)
-                       .SetFormat(npuDesc.npu_format_)
+    auto aclDesc = desc.Create(aclDataType, *npuDesc)
+                       .SetFormat(npuDesc->npu_format_)
                       .SetShape(storageDims)
                       .SetName(descName)
                       .SetConstAttr(cpu_tensor)
@@ -47,7 +47,7 @@ namespace at_npu
    AclTensorBufferMaker buffer(tensor, numel);
    auto aclBuff = buffer.Get();
    int64_t storageDim = storageDims.size();
-    return std::tie(aclDesc, aclBuff, storageDim, npuDesc.npu_format_);
+    return std::tie(aclDesc, aclBuff, storageDim, npuDesc->npu_format_);
  }
 
  std::tuple OpCmdHelper::CovertTensorWithZeroDimToAclInput(
@@ -127,19 +127,19 @@ namespace at_npu
  {
    aclDataType aclDataType = CalcuOpUtil::convert_to_acl_data_type(
        tensorPtr->scalar_type(), forceDataType);
-    const auto &npuDesc = tensorPtr->storage().get_npu_desc();
+    const auto* npuDesc = torch_npu::NPUTensorImpl::GetStorageInfo(*tensorPtr);
    const auto &dims = tensorPtr->sizes();
-    auto &storageDims = npuDesc.storage_sizes_;
+    auto &storageDims = npuDesc->storage_sizes_;
    AclTensorDescMaker desc;
-    auto aclDesc = desc.Create(aclDataType, dims, npuDesc.origin_format_)
-                       .SetFormat(npuDesc.npu_format_)
+    auto aclDesc = desc.Create(aclDataType, dims, npuDesc->origin_format_)
+                       .SetFormat(npuDesc->npu_format_)
                       .SetShape(storageDims)
                       .Get();
    auto numel = at::prod_intlist(storageDims);
    AclTensorBufferMaker aclBuffer(tensorPtr, numel);
    auto aclBuff = aclBuffer.Get();
    int64_t storageDim = storageDims.size();
-    return std::tie(aclDesc, aclBuff, storageDim, npuDesc.npu_format_);
+    return std::tie(aclDesc, aclBuff, storageDim, npuDesc->npu_format_);
  }
 
  std::tuple OpCmdHelper::CovertTransDataTensorToAcl(
diff --git a/torch_npu/csrc/framework/OpParamMaker.h b/torch_npu/csrc/framework/OpParamMaker.h
index 89679fd6d9..7163626aaf 100644
--- a/torch_npu/csrc/framework/OpParamMaker.h
+++ b/torch_npu/csrc/framework/OpParamMaker.h
@@ -22,6 +22,8 @@
 #include "torch_npu/csrc/framework/interface/AclOpCompileInterface.h"
 #include "torch_npu/csrc/framework/NPUDefine.h"
 #include "torch_npu/csrc/framework/interface/Graph.h"
+#include "torch_npu/csrc/core/tensor_impl.h"
+#include "c10/npu/NPUCachingAllocator.h"
 
 namespace at_npu
 {
@@ -66,7 +68,7 @@ namespace at_npu
    AclTensorDescMaker() {}
    ~AclTensorDescMaker() = default;
 
-    AclTensorDescMaker &Create(aclDataType dataType, c10::NPUStorageDesc storageDesc)
+    AclTensorDescMaker& Create(aclDataType dataType, c10::npu::NPUCachingAllocator::NPUStorageInfo storageDesc)
    {
      auto dims = storageDesc.base_sizes_;
      auto format = storageDesc.origin_format_;
diff --git a/torch_npu/csrc/framework/StorageDescHelper.cpp b/torch_npu/csrc/framework/StorageDescHelper.cpp
index c23d1d4415..a43e24d28b 100644
--- a/torch_npu/csrc/framework/StorageDescHelper.cpp
+++ b/torch_npu/csrc/framework/StorageDescHelper.cpp
@@ -16,6 +16,7 @@
 #include "torch_npu/csrc/framework/StorageDescHelper.h"
 #include "torch_npu/csrc/framework/FormatHelper.h"
 #include "torch_npu/csrc/framework/InferFormat.h"
+#include "torch_npu/csrc/core/tensor_impl.h"
 
 namespace at_npu
 {
@@ -24,8 +25,8 @@ namespace at_npu
 
  bool StorageDescHelper::MetaDataAreMatch(const at::Tensor *tensor)
  {
-    auto &desc = tensor->storage().unsafeGetStorageImpl()->npu_desc_;
-    return IsSameSize(desc.base_sizes_, tensor->sizes()) && IsSameSize(desc.base_strides_, tensor->strides());
+    auto* desc = torch_npu::NPUTensorImpl::GetStorageInfo(*tensor);
+    return IsSameSize(desc->base_sizes_, tensor->sizes()) && IsSameSize(desc->base_strides_, tensor->strides());
  }
 
  bool StorageDescHelper::OffsetAreMatch(const at::Tensor *tensor)
@@ -34,7 +35,7 @@ namespace at_npu
  }
 
  // copy related
-  bool StorageDescHelper::IsSameDesc(const c10::NPUStorageDesc &a, const c10::NPUStorageDesc &b)
+  bool StorageDescHelper::IsSameDesc(const c10::npu::NPUCachingAllocator::NPUStorageInfo& a, const c10::npu::NPUCachingAllocator::NPUStorageInfo& b)
  {
    if ((a.origin_format_ != b.origin_format_) || (a.npu_format_ != b.npu_format_))
    {
@@ -48,9 +49,9 @@ namespace at_npu
 
  bool StorageDescHelper::IsSameDesc(const at::Tensor &a, const at::Tensor &b)
  {
-    auto descA = a.storage().unsafeGetStorageImpl()->npu_desc_;
-    auto descB = b.storage().unsafeGetStorageImpl()->npu_desc_;
-    return IsSameDesc(descA, descB);
+    auto* descA = torch_npu::NPUTensorImpl::GetStorageInfo(a);
+    auto* descB = torch_npu::NPUTensorImpl::GetStorageInfo(b);
+    return IsSameDesc(*descA, *descB);
  }
 
  bool StorageDescHelper::IsSameSize(c10::SmallVector a, c10::IntArrayRef b)
@@ -62,7 +63,7 @@ namespace at_npu
    return false;
  }
 
-  void StorageDescHelper::UpdateDesc(c10::NPUStorageDesc &npuDesc, c10::IntArrayRef &new_size)
+  void StorageDescHelper::UpdateDesc(c10::npu::NPUCachingAllocator::NPUStorageInfo& npuDesc, c10::IntArrayRef& new_size)
  {
    npuDesc.base_sizes_ = new_size;
@@ -98,17 +99,28 @@ namespace at_npu
 
  void StorageDescHelper::SetDesc(at::Tensor &dst)
  {
-    dst.storage().unsafeGetStorageImpl()->npu_desc_ = SetDesc();
+    if (dst.storage().unsafeGetStorageImpl()->data() == nullptr) {
+      return;
+    }
+    *torch_npu::NPUTensorImpl::GetStorageInfo(dst) = SetDesc();
  }
 
-  void StorageDescHelper::SetDesc(at::Tensor &dst, c10::IntArrayRef size, c10::IntArrayRef strides)
-  {
-    dst.storage().unsafeGetStorageImpl()->npu_desc_ = SetDesc(size, strides);
+  void StorageDescHelper::SetDesc(at::Tensor& dst, c10::IntArrayRef size, c10::IntArrayRef strides)
+  {
+    if (dst.storage().unsafeGetStorageImpl()->data() == nullptr) {
+      return;
+    }
+
+    *torch_npu::NPUTensorImpl::GetStorageInfo(dst) = SetDesc(size, strides);
  }
 
  void StorageDescHelper::SetDesc(at::Tensor &dst, c10::IntArrayRef size, c10::IntArrayRef strides, aclFormat format)
  {
-    dst.storage().unsafeGetStorageImpl()->npu_desc_ = SetDesc(size, strides, format);
+    if (dst.storage().unsafeGetStorageImpl()->data() == nullptr) {
+      return;
+    }
+
+    *torch_npu::NPUTensorImpl::GetStorageInfo(dst) = SetDesc(size, strides, format);
  }
 
  void StorageDescHelper::CopyDesc(at::Tensor &dst, const at::Tensor &src)
@@ -118,36 +130,42 @@ namespace at_npu
 
  void StorageDescHelper::CopyDesc(at::Tensor &dst, const c10::Storage &src)
  {
-    CopyDesc(dst, src.unsafeGetStorageImpl()->npu_desc_);
+    CopyDesc(dst, *torch_npu::NPUTensorImpl::GetStorageInfo(*src.unsafeGetStorageImpl()));
  }
 
-  void StorageDescHelper::CopyDesc(const at::Tensor &dst, const c10::NPUStorageDesc &src_desc)
-  {
-    auto &dstDesc = dst.storage().unsafeGetStorageImpl()->npu_desc_;
-    dstDesc = src_desc;
+  void StorageDescHelper::CopyDesc(const at::Tensor& dst, const c10::npu::NPUCachingAllocator::NPUStorageInfo& src_desc) {
+    if (dst.storage().unsafeGetStorageImpl()->data() == nullptr) {
+      return;
+    }
+
+    auto* dstDesc = torch_npu::NPUTensorImpl::GetStorageInfo(dst);
+    *dstDesc = src_desc;
  }
 
-  void StorageDescHelper::ReflushDescBySelf(const at::Tensor &src)
-  {
-    auto &desc = src.storage().unsafeGetStorageImpl()->npu_desc_;
-    desc.base_sizes_ = src.sizes();
-    desc.storage_sizes_ = src.sizes();
-    desc.base_strides_ = src.strides();
+  void StorageDescHelper::ReflushDescBySelf(const at::Tensor& src)
+  {
+    if (src.storage().unsafeGetStorageImpl()->data() == nullptr) {
+      return;
+    }
+    auto* desc = torch_npu::NPUTensorImpl::GetStorageInfo(src);
+    desc->base_sizes_ = src.sizes();
+    desc->storage_sizes_ = src.sizes();
+    desc->base_strides_ = src.strides();
  }
 
-  c10::NPUStorageDesc StorageDescHelper::SetDesc()
+  c10::npu::NPUCachingAllocator::NPUStorageInfo StorageDescHelper::SetDesc()
  {
    return SetDesc({0}, {});
  }
 
-  c10::NPUStorageDesc StorageDescHelper::SetDesc(c10::IntArrayRef size, c10::IntArrayRef strides)
-  {
+  c10::npu::NPUCachingAllocator::NPUStorageInfo StorageDescHelper::SetDesc(c10::IntArrayRef size, c10::IntArrayRef strides)
+  {
    return SetDesc(size, strides, InferFormat::GuessBaseFormat(size));
  }
 
-  c10::NPUStorageDesc StorageDescHelper::SetDesc(c10::IntArrayRef size, c10::IntArrayRef strides, aclFormat format)
+  c10::npu::NPUCachingAllocator::NPUStorageInfo StorageDescHelper::SetDesc(c10::IntArrayRef size, c10::IntArrayRef strides, aclFormat format)
  {
-    struct c10::NPUStorageDesc npu_desc;
+    struct c10::npu::NPUCachingAllocator::NPUStorageInfo npu_desc;
    npu_desc.base_sizes_ = size;
    npu_desc.base_strides_ = strides;
    // guess ori format and npu format unit by size and dst format
@@ -162,7 +180,7 @@ namespace at_npu
    return npu_desc;
  }
 
-  int64_t StorageDescHelper::GetMemorySize(const c10::NPUStorageDesc &desc)
+  int64_t StorageDescHelper::GetMemorySize(const c10::npu::NPUCachingAllocator::NPUStorageInfo& desc)
  {
    auto physical_size = FormatHelper::GetStorageSizes(desc);
    return at::prod_intlist(physical_size);
@@ -170,8 +188,8 @@ namespace at_npu
 
  int64_t StorageDescHelper::GetMemorySize(const at::Tensor &dst)
  {
-    auto desc = dst.storage().unsafeGetStorageImpl()->npu_desc_;
-    return GetMemorySize(desc);
+    auto* desc = torch_npu::NPUTensorImpl::GetStorageInfo(dst);
+    return GetMemorySize(*desc);
  }
 
  int64_t StorageDescHelper::GetMemorySize(c10::IntArrayRef size, aclFormat format)
diff --git a/torch_npu/csrc/framework/StorageDescHelper.h b/torch_npu/csrc/framework/StorageDescHelper.h
index 02e416acc2..1765f36143 100644
--- a/torch_npu/csrc/framework/StorageDescHelper.h
+++ b/torch_npu/csrc/framework/StorageDescHelper.h
@@ -19,7 +19,7 @@
 #include
 
 #include "torch_npu/csrc/framework/utils/NPUDefinition.h"
-
+#include "torch_npu/csrc/core/tensor_impl.h"
 namespace at_npu
 {
  namespace native
@@ -35,7 +35,7 @@ namespace at_npu
    static bool OffsetAreMatch(const at::Tensor *tensor);
 
    // helper function of transdata op.
-    static bool IsSameDesc(const c10::NPUStorageDesc &a, const c10::NPUStorageDesc &b);
+    static bool IsSameDesc(const c10::npu::NPUCachingAllocator::NPUStorageInfo& a, const c10::npu::NPUCachingAllocator::NPUStorageInfo& b);
    static bool IsSameDesc(const at::Tensor &a, const at::Tensor &b);
 
    // calculate storage size need by npu memory
@@ -52,9 +52,9 @@ namespace at_npu
    static void CopyDesc(at::Tensor &dst, const at::Tensor &src);
    static void CopyDesc(at::Tensor &dst, const c10::Storage &src);
-    static void CopyDesc(const at::Tensor &dst, const c10::NPUStorageDesc &src_desc);
+    static void CopyDesc(const at::Tensor& dst, const c10::npu::NPUCachingAllocator::NPUStorageInfo& src_desc);
 
-    static void UpdateDesc(c10::NPUStorageDesc &npuDesc, c10::IntArrayRef &new_size);
+    static void UpdateDesc(c10::npu::NPUCachingAllocator::NPUStorageInfo& npuDesc, c10::IntArrayRef& new_size);
 
    static FormatShape ComputeStrideFromShape(const FormatShape &shape);
 
@@ -64,11 +64,11 @@ namespace at_npu
  private:
    // Get Part
    static bool IsSameSize(c10::SmallVector a, c10::IntArrayRef b);
-    static int64_t GetMemorySize(const c10::NPUStorageDesc &dst);
+    static int64_t GetMemorySize(const c10::npu::NPUCachingAllocator::NPUStorageInfo& dst);
    // Set Part
-    static c10::NPUStorageDesc SetDesc();
-    static c10::NPUStorageDesc SetDesc(c10::IntArrayRef size, c10::IntArrayRef strides);
-    static c10::NPUStorageDesc SetDesc(c10::IntArrayRef size, c10::IntArrayRef strides, aclFormat format);
+    static c10::npu::NPUCachingAllocator::NPUStorageInfo SetDesc();
+    static c10::npu::NPUCachingAllocator::NPUStorageInfo SetDesc(c10::IntArrayRef size, c10::IntArrayRef strides);
+    static c10::npu::NPUCachingAllocator::NPUStorageInfo SetDesc(c10::IntArrayRef size, c10::IntArrayRef strides, aclFormat format);
  };
 
 } // namespace native
diff --git a/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp b/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp
index 0854e27c09..cfee4e6557 100644
--- a/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp
+++ b/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp
@@ -46,7 +46,7 @@ namespace at_npu
    }
    if (at::prod_intlist(tensor.sizes()) <
-        at::prod_intlist(tensor.storage().get_npu_desc().base_sizes_))
+        at::prod_intlist(torch_npu::NPUTensorImpl::GetStorageInfo(tensor)->base_sizes_))
    {
      return {"slice", "select", "indexing"};
    }
@@ -61,7 +61,7 @@ namespace at_npu
    // 2. full memory copy: size match between src and self
    if (StorageDescHelper::OffsetAreMatch(&self) && self.is_contiguous() &&
        src.sizes().equals(self.sizes()) &&
-        self.sizes().equals(self.storage().get_npu_desc().base_sizes_))
+        self.sizes().equals(torch_npu::NPUTensorImpl::GetStorageInfo(self)->base_sizes_))
    {
      return true;
    }
@@ -112,7 +112,7 @@ namespace at_npu
    auto self = at::empty_with_format(
        src.sizes(),
        src.options(),
-        src.storage().get_npu_desc().npu_format_);
+        torch_npu::NPUTensorImpl::GetStorageInfo(src)->npu_format_);
    if (ContiguousOptimizeWithAnyFormat(self, src, optimizations))
    {
      return self;
diff --git a/torch_npu/csrc/framework/contiguous/ReshapeOpt.cpp b/torch_npu/csrc/framework/contiguous/ReshapeOpt.cpp
index db562ee712..3607128210 100644
--- a/torch_npu/csrc/framework/contiguous/ReshapeOpt.cpp
+++ b/torch_npu/csrc/framework/contiguous/ReshapeOpt.cpp
@@ -22,7 +22,7 @@ namespace at_npu
 
  bool can_use_memecpy_for_NZ_format(const at::Tensor &tensor)
  {
-    auto base_size = tensor.storage().get_npu_desc().base_sizes_;
+    auto base_size = torch_npu::NPUTensorImpl::GetStorageInfo(tensor)->base_sizes_;
    // Make sure that sizes of last 2 dims don't change
    if (tensor.size(-1) != base_size[base_size.size() - 1] ||
        tensor.size(-2) != base_size[base_size.size() - 2])
@@ -39,8 +39,8 @@ namespace at_npu
    {
      return false;
    }
-    auto srcNpuDesc = src.storage().get_npu_desc();
-    switch (srcNpuDesc.npu_format_)
+    auto* srcNpuDesc = torch_npu::NPUTensorImpl::GetStorageInfo(src);
+    switch(srcNpuDesc->npu_format_)
    {
    case ACL_FORMAT_FRACTAL_NZ:
      return can_use_memecpy_for_NZ_format(src);
@@ -48,12 +48,12 @@ namespace at_npu
    default:
      // For other format, make sure that copy the whole memory.
      // Moreover, storage size expanding caused by padding could be avoided
-      if (!(srcNpuDesc.base_sizes_ == array_to_small_vector(src.sizes())))
+      if (!(srcNpuDesc->base_sizes_ == array_to_small_vector(src.sizes())))
      {
        return false;
      }
      // Make sure no padding happens
-      if (src.numel() != at::prod_intlist(srcNpuDesc.storage_sizes_))
+      if (src.numel() != at::prod_intlist(srcNpuDesc->storage_sizes_))
      {
        return false;
      }
diff --git a/torch_npu/csrc/framework/contiguous/combined_opt.cpp b/torch_npu/csrc/framework/contiguous/combined_opt.cpp
index cf270817f8..cc9ae17e2f 100644
--- a/torch_npu/csrc/framework/contiguous/combined_opt.cpp
+++ b/torch_npu/csrc/framework/contiguous/combined_opt.cpp
@@ -18,6 +18,7 @@
 #include
 #include "torch_npu/csrc/framework/contiguous/ContiguousOpt.h"
 #include "torch_npu/csrc/framework/utils/KernelNpuOutputSize.h"
+#include "torch_npu/csrc/core/tensor_impl.h"
 
 namespace at_npu
 {
@@ -50,10 +51,10 @@ namespace at_npu
    {
      RECORD_FUNCTION("npuCombined", std::vector({src}));
      // Record src infos for recovering after trans-contiguous
-      const auto &src_npu_desc = src.storage().get_npu_desc();
+      const auto* src_npu_desc = torch_npu::NPUTensorImpl::GetStorageInfo(src);
 
      // Construct base tensor(contiguous)
-      at::Tensor base_tensor = at::empty(src_npu_desc.base_sizes_, src.options());
+      at::Tensor base_tensor = at::empty(src_npu_desc->base_sizes_, src.options());
      base_tensor.set_(src.storage());
 
      // Reconstruct combined discontiguous tensor ==trans==> contiguous tensor
@@ -61,7 +62,7 @@
          combined_to_contiguous(base_tensor, self, viewInfos, viewOffsets);
      // Recover modified tensor infos of src after trans-contiguous
-      StorageDescHelper::CopyDesc(base_tensor, src_npu_desc);
+      StorageDescHelper::CopyDesc(base_tensor, *src_npu_desc);
      return contiguousOrNot;
    }
    return false;
@@ -88,10 +89,10 @@ namespace at_npu
    {
      return false;
    }
-    auto npu_desc = tensor.storage().get_npu_desc();
+    auto* npu_desc = torch_npu::NPUTensorImpl::GetStorageInfo(tensor);
 
-    if ((at::prod_intlist(tensor.sizes()) != at::prod_intlist(npu_desc.base_sizes_)) ||
-        (tensor.storage_offset() != npu_desc.base_offset_))
+    if ((at::prod_intlist(tensor.sizes()) != at::prod_intlist(npu_desc->base_sizes_)) ||
+        (tensor.storage_offset() != npu_desc->base_offset_))
    {
      return false;
    }
@@ -153,7 +154,7 @@ namespace at_npu
      return false;
    }
    // Avoid combined-cases such as squeeze+indexing at the first axis.
-    if (tensor.strides()[0] != tensor.storage().get_npu_desc().base_strides_[0])
+    if(tensor.strides()[0] != torch_npu::NPUTensorImpl::GetStorageInfo(tensor)->base_strides_[0])
    {
      return false;
    }
@@ -166,7 +167,7 @@ namespace at_npu
    {
      // tensors with reduced numel will be taken into consideration.
      if (at::prod_intlist(tensor.sizes()) <
-          at::prod_intlist(tensor.storage().get_npu_desc().base_sizes_))
+          at::prod_intlist(torch_npu::NPUTensorImpl::GetStorageInfo(tensor)->base_sizes_))
      {
        for (auto i = 0; i < tensor.sizes().size() - 2; i++)
        {
@@ -192,8 +193,8 @@ namespace at_npu
      FormatShape &infer_stride,
      int64_t &infer_offset)
  {
-    auto base_sizes = src.storage().get_npu_desc().base_sizes_;
-    auto base_strides = src.storage().get_npu_desc().base_strides_;
+    auto base_sizes = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_sizes_;
+    auto base_strides = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_strides_;
    auto view_sizes = array_to_small_vector(src.sizes());
    auto view_strides = array_to_small_vector(src.strides());
 
@@ -343,7 +344,6 @@ namespace at_npu
    {
      return false;
    }
-    auto tensor_desc = tensor.storage().get_npu_desc();
    c10::SmallVector view_info_part;
    view_info_part.emplace_back(array_to_small_vector(tensor.sizes()));
    view_info_part.emplace_back(array_to_small_vector(tensor.strides()));
@@ -366,8 +366,8 @@ namespace at_npu
      return false;
    }
 
-    auto combined_base_sizes = src.storage().get_npu_desc().base_sizes_;
-    auto combined_base_strides = src.storage().get_npu_desc().base_strides_;
+    auto combined_base_sizes = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_sizes_;
+    auto combined_base_strides = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_strides_;
 
    // Key infos that should be inferred.
    FormatShape infer_size;
@@ -501,7 +501,7 @@ namespace at_npu
    auto contiguous_src = at::empty_with_format(
        src.sizes(),
        src.options(),
-        src.storage().get_npu_desc().npu_format_);
+        torch_npu::NPUTensorImpl::GetStorageInfo(src)->npu_format_);
    return (
        copy_optimize_contiguous_by_given_cases(
            src, contiguous_src, optimizations_first) &&
diff --git a/torch_npu/csrc/framework/contiguous/indexing_opt.cpp b/torch_npu/csrc/framework/contiguous/indexing_opt.cpp
index ff4dc0683a..ab7d0e3fad 100644
--- a/torch_npu/csrc/framework/contiguous/indexing_opt.cpp
+++ b/torch_npu/csrc/framework/contiguous/indexing_opt.cpp
@@ -15,6 +15,7 @@
 
 #include "torch_npu/csrc/framework/contiguous/ContiguousOpt.h"
 #include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+#include "torch_npu/csrc/core/tensor_impl.h"
 
 namespace at_npu
 {
@@ -50,20 +51,20 @@ namespace at_npu
    {
      return false;
    }
-    auto src_desc = src.storage().get_npu_desc();
-    if (src.numel() >= at::prod_intlist(src_desc.base_sizes_))
+    auto* src_desc = torch_npu::NPUTensorImpl::GetStorageInfo(src);
+    if (src.numel() >= at::prod_intlist(src_desc->base_sizes_))
    {
      return false;
    }
 
-    if (src.dim() != src_desc.base_sizes_.size() ||
-        src.strides().size() != src_desc.base_strides_.size())
+    if (src.dim() != src_desc->base_sizes_.size() ||
+        src.strides().size() != src_desc->base_strides_.size())
    {
      return false;
    }
 
-    auto base_size = src.storage().get_npu_desc().base_sizes_;
-    auto base_stride = src.storage().get_npu_desc().base_strides_;
+    auto base_size = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_sizes_;
+    auto base_stride = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_strides_;
 
    // Indexing info extraction part
    // Get step info(for indexing step at index axis should > 1)
@@ -140,7 +141,7 @@ namespace at_npu
      c10::SmallVector &step)
  {
-    auto base_size = src.storage().get_npu_desc().base_sizes_;
+    auto base_size = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_sizes_;
    // recover contiguous base tensor
    at::Tensor temp_src = at::empty(base_size, src.options());
diff --git a/torch_npu/csrc/framework/contiguous/permute_opt.cpp b/torch_npu/csrc/framework/contiguous/permute_opt.cpp
index 238017c21d..0506c82b46 100644
--- a/torch_npu/csrc/framework/contiguous/permute_opt.cpp
+++ b/torch_npu/csrc/framework/contiguous/permute_opt.cpp
@@ -16,6 +16,7 @@
 #include "torch_npu/csrc/framework/contiguous/ContiguousOpt.h"
 #include "torch_npu/csrc/framework/utils/KernelNpuOutputSize.h"
 #include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+#include "torch_npu/csrc/core/tensor_impl.h"
 
 namespace at_npu
 {
@@ -37,13 +38,12 @@ namespace at_npu
      // create contiguous tensor for npu transpose
      at::Tensor temp_src = at::empty(sizes, src.options());
      temp_src.set_(src.storage(), temp_src.storage_offset(), temp_src.sizes(), temp_src.strides());
-      auto npu_desc = temp_src.storage().unsafeGetStorageImpl()->npu_desc_;
-      temp_src.storage().unsafeGetStorageImpl()->npu_desc_.base_sizes_ = temp_src.sizes();
-      temp_src.storage().unsafeGetStorageImpl()->npu_desc_.base_strides_ = temp_src.strides();
-      temp_src.storage().unsafeGetStorageImpl()->npu_desc_.storage_sizes_ = temp_src.sizes();
+
+      auto npu_desc = *torch_npu::NPUTensorImpl::GetStorageInfo(temp_src);
+      StorageDescHelper::ReflushDescBySelf(temp_src);
 
      NPUNativeFunctions::npu_transpose_out(temp_src, perm, self);
-      temp_src.storage().unsafeGetStorageImpl()->npu_desc_ = npu_desc;
+      *torch_npu::NPUTensorImpl::GetStorageInfo(temp_src) = npu_desc;
      return true;
    }
    return false;
@@ -67,8 +67,8 @@ namespace at_npu
      return false;
    }
-    auto base_sizes = src.storage().get_npu_desc().base_sizes_;
-    auto base_strides = src.storage().get_npu_desc().base_strides_;
+    auto base_sizes = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_sizes_;
+    auto base_strides = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_strides_;
    auto view_sizes = array_to_small_vector(src.sizes());
    auto view_strides = array_to_small_vector(src.strides());
    c10::SmallVector indexes;
diff --git a/torch_npu/csrc/framework/contiguous/reshapeV2_opt.cpp b/torch_npu/csrc/framework/contiguous/reshapeV2_opt.cpp
index 8f2e9cd162..27c3c88b87 100644
--- a/torch_npu/csrc/framework/contiguous/reshapeV2_opt.cpp
+++ b/torch_npu/csrc/framework/contiguous/reshapeV2_opt.cpp
@@ -36,7 +36,7 @@ namespace at_npu
      copy_d2d_by_memcpy(
          self,
          src,
-          at::prod_intlist(self.storage().get_npu_desc().storage_sizes_));
+          at::prod_intlist(torch_npu::NPUTensorImpl::GetStorageInfo(self)->storage_sizes_));
      return true;
    }
    return false;
@@ -100,13 +100,13 @@ namespace at_npu
 
  bool can_use_memory_repoint(const at::Tensor &tensor)
  {
-    auto tensorNpuDesc = tensor.storage().get_npu_desc();
+    auto* tensorNpuDesc = torch_npu::NPUTensorImpl::GetStorageInfo(tensor);
    if (FormatHelper::IsBaseFormatType(tensor))
    {
      return true;
    }
 
-    if (tensorNpuDesc.npu_format_ == ACL_FORMAT_FRACTAL_NZ)
+    if (tensorNpuDesc->npu_format_ == ACL_FORMAT_FRACTAL_NZ)
    {
      // No padding
      if ((tensor.size(-1) % 16 == 0) && (tensor.size(-2) % 16 == 0))
diff --git a/torch_npu/csrc/framework/contiguous/reshape_opt.cpp b/torch_npu/csrc/framework/contiguous/reshape_opt.cpp
index 73bc89d1f9..789254d946 100644
--- a/torch_npu/csrc/framework/contiguous/reshape_opt.cpp
+++ b/torch_npu/csrc/framework/contiguous/reshape_opt.cpp
@@ -28,7 +28,7 @@ namespace at_npu
    if (check_reshape_match(src, self))
    {
      RECORD_FUNCTION("View_d2dCopyAsync", std::vector({src}));
-      copy_d2d_by_memcpy(self, src, at::prod_intlist(self.storage().get_npu_desc().storage_sizes_));
+      copy_d2d_by_memcpy(self, src, at::prod_intlist(torch_npu::NPUTensorImpl::GetStorageInfo(self)->storage_sizes_));
      return true;
    }
    return false;
diff --git a/torch_npu/csrc/framework/contiguous/select_opt.cpp b/torch_npu/csrc/framework/contiguous/select_opt.cpp
index b662ba85cc..9767a91ea3 100644
--- a/torch_npu/csrc/framework/contiguous/select_opt.cpp
+++ b/torch_npu/csrc/framework/contiguous/select_opt.cpp
@@ -15,6 +15,7 @@
 
 #include "torch_npu/csrc/framework/contiguous/ContiguousOpt.h"
 #include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+#include "torch_npu/csrc/core/tensor_impl.h"
 
 namespace at_npu
 {
@@ -59,8 +60,8 @@ namespace at_npu
      return false;
    }
    // base info and src info
-    auto base_size = src.storage().get_npu_desc().base_sizes_;
-    auto base_stride = src.storage().get_npu_desc().base_strides_;
+    auto base_size = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_sizes_;
+    auto base_stride = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_strides_;
    auto select_size = src.sizes();
    auto select_stride = src.strides();
 
@@ -145,7 +146,7 @@ namespace at_npu
      c10::SmallVector &start,
      c10::SmallVector &length)
  {
-    auto base_size = src.storage().get_npu_desc().base_sizes_;
+    auto base_size = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_sizes_;
    // Recover base tensor(necessary) a = b.select(1, 1)
    at::Tensor temp_src = at::empty(base_size, src.options());
diff --git a/torch_npu/csrc/framework/contiguous/slice_opt.cpp b/torch_npu/csrc/framework/contiguous/slice_opt.cpp
index 0eeea31d2c..876dd0890f 100644
--- a/torch_npu/csrc/framework/contiguous/slice_opt.cpp
+++ b/torch_npu/csrc/framework/contiguous/slice_opt.cpp
@@ -16,7 +16,7 @@
 #include "torch_npu/csrc/framework/contiguous/ContiguousOpt.h"
 #include "torch_npu/csrc/framework/utils/KernelNpuOutputSize.h"
 #include "torch_npu/csrc/aten/NPUNativeFunctions.h"
-
+#include "torch_npu/csrc/core/tensor_impl.h"
 namespace at_npu
 {
  namespace native
@@ -61,8 +61,8 @@ namespace at_npu
      return false;
    }
 
-    auto base_sizes = src.storage().get_npu_desc().base_sizes_;
-    auto base_strides = src.storage().get_npu_desc().base_strides_;
+    auto base_sizes = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_sizes_;
+    auto base_strides = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_strides_;
    auto view_sizes = array_to_small_vector(src.sizes());
    auto view_strides = array_to_small_vector(src.strides());
 
@@ -144,7 +144,7 @@ namespace at_npu
      const c10::SmallVector &size)
  {
    // create contiguous tensor for npu slice
-    auto temp_tensor_size = src.storage().unsafeGetStorageImpl()->npu_desc_.base_sizes_;
+    auto temp_tensor_size = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_sizes_;
    at::Tensor temp_src = at::empty(temp_tensor_size, src.options());
    temp_src.set_(src.storage(), temp_src.storage_offset(), temp_src.sizes(), temp_src.strides());
diff --git a/torch_npu/csrc/framework/contiguous/unfold_opt.cpp b/torch_npu/csrc/framework/contiguous/unfold_opt.cpp
index e0909846b7..02c43112f4 100644
--- a/torch_npu/csrc/framework/contiguous/unfold_opt.cpp
+++ b/torch_npu/csrc/framework/contiguous/unfold_opt.cpp
@@ -15,6 +15,7 @@
 
 #include "torch_npu/csrc/framework/contiguous/ContiguousOpt.h"
 #include "torch_npu/csrc/framework/utils/KernelNpuOutputSize.h"
+#include "torch_npu/csrc/core/tensor_impl.h"
 
 namespace at_npu
 {
@@ -60,8 +61,8 @@ namespace at_npu
      return false;
    }
 
-    auto base_sizes = src.storage().get_npu_desc().base_sizes_;
-    auto base_strides = src.storage().get_npu_desc().base_strides_;
+    auto base_sizes = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_sizes_;
+    auto base_strides = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_strides_;
    auto view_sizes = array_to_small_vector(src.sizes());
    auto view_strides = array_to_small_vector(src.strides());
 
@@ -126,7 +127,7 @@ namespace at_npu
      int64_t &fold_step)
  {
-    auto base_sizes = src.storage().get_npu_desc().base_sizes_;
+    auto base_sizes = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_sizes_;
    TORCH_CHECK(fold_size != 0, "size should not be 0");
    int64_t split_nums = base_sizes[fold_dimension] / fold_size;
-- 
Gitee
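The common thread of PATCH 1/6 above: every direct read of storage().unsafeGetStorageImpl()->npu_desc_ (or storage().get_npu_desc()) becomes a call to the static accessor added in torch_npu/csrc/core/tensor_impl.cpp, which resolves the descriptor from the caching allocator by the storage's data pointer. A minimal sketch of the resulting usage, assuming only the declarations shown in the diffs above (query_npu_format is an illustrative name, not part of the patch):

#include <ATen/ATen.h>
#include "third_party/acl/inc/acl/acl_base.h"   // aclFormat
#include "torch_npu/csrc/core/tensor_impl.h"    // NPUTensorImpl::GetStorageInfo

// Old: auto fmt = tensor.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_;
// New: look the descriptor up via the allocator-backed accessor.
aclFormat query_npu_format(const at::Tensor& tensor) {
  auto* info = torch_npu::NPUTensorImpl::GetStorageInfo(tensor);
  return info->npu_format_;
}

Because the accessor is keyed on the storage's data pointer rather than a member of the storage impl, the descriptor stays valid across views that share the same storage, which is why the contiguous-optimization passes above can read it through any aliasing tensor.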
From 0c256cdb50dff58177eaac21866dd4da8d11d45e Mon Sep 17 00:00:00 2001
From: "zhousinan@huawei.com"
Date: Tue, 25 Jan 2022 17:04:26 +0800
Subject: [PATCH 2/6] add torch_npu/csrc/framework/utils/CalcuOpUtil.cpp

---
 .../csrc/framework/utils/CalcuOpUtil.cpp      | 34 +++++-----
 torch_npu/csrc/framework/utils/NpuUtils.cpp   | 68 ++++++++-----------
 .../csrc/framework/utils/OpPreparation.cpp    |  9 +--
 3 files changed, 52 insertions(+), 59 deletions(-)

diff --git a/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp b/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp
index a847435adc..7a65518a77 100644
--- a/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp
+++ b/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp
@@ -27,6 +27,7 @@
 #include "torch_npu/csrc/framework/utils/NpuFuzzyBlacklist.h"
 #include "torch_npu/csrc/framework/interface/EnvVariables.h"
 #include "third_party/acl/inc/acl/acl_base.h"
+#include "torch_npu/csrc/core/tensor_impl.h"
 
 namespace at_npu
 {
@@ -191,8 +192,7 @@ namespace at_npu
  {
    if (NpuUtils::check_match(&tensor) || NpuUtils::check_5d_5d_match(tensor))
    {
-      auto tensor_desc = tensor.storage().unsafeGetStorageImpl()->npu_desc_;
-      return tensor_desc.npu_format_;
+      return torch_npu::NPUTensorImpl::GetStorageInfo(tensor)->npu_format_;
    }
    else
    {
@@ -260,7 +260,7 @@ namespace at_npu
      return false;
    }
    int64_t numel = 1;
-    auto storageSize = tensor.storage().get_npu_desc().storage_sizes_;
+    auto storageSize = torch_npu::NPUTensorImpl::GetStorageInfo(tensor)->storage_sizes_;
 
    for (int i = 0; i < storageSize.size(); i++)
    {
@@ -270,12 +270,12 @@ namespace at_npu
    int64_t dim1 = tensor.dim() - 1;
    int64_t dim2 = tensor.dim() - 2;
-    auto tensor_desc = tensor.storage().get_npu_desc();
+    auto* tensor_desc = torch_npu::NPUTensorImpl::GetStorageInfo(tensor);
    if (tensor.stride(dim2) == 1 && tensor.stride(dim1) == tensor.size(dim2) &&
-        tensor.size(dim1) == tensor_desc.base_sizes_[dim2] &&
-        tensor.size(dim2) == tensor_desc.base_sizes_[dim1] &&
-        tensor.numel() == numel &&
-        tensor_desc.base_sizes_.size() == tensor.dim())
+        tensor.size(dim1) == tensor_desc->base_sizes_[dim2] &&
+        tensor.size(dim2) == tensor_desc->base_sizes_[dim1] &&
+        tensor.numel() == numel &&
+        tensor_desc->base_sizes_.size() == tensor.dim())
    {
      return true;
    }
@@ -416,8 +416,8 @@ namespace at_npu
      {
        at::Tensor *aclInput = &input[i].tensor;
        c10::SmallVector dims;
-        dims = aclInput->storage().get_npu_desc().base_sizes_;
-        auto storageDims = aclInput->storage().get_npu_desc().storage_sizes_;
+        dims = torch_npu::NPUTensorImpl::GetStorageInfo(*aclInput)->base_sizes_;
+        auto storageDims = torch_npu::NPUTensorImpl::GetStorageInfo(*aclInput)->storage_sizes_;
        int64_t numel = 1;
        for (int j = 0; j < storageDims.size(); j++)
        {
@@ -428,9 +428,9 @@ namespace at_npu
            aclDataType,
            dims.size(),
            dims.data(),
-            aclInput->storage().get_npu_desc().origin_format_);
+            torch_npu::NPUTensorImpl::GetStorageInfo(*aclInput)->origin_format_);
        aclSetTensorFormat(
-            acl_tensor_desc, aclInput->storage().get_npu_desc().npu_format_);
+            acl_tensor_desc, torch_npu::NPUTensorImpl::GetStorageInfo(*aclInput)->npu_format_);
        aclSetTensorShape(
            acl_tensor_desc, storageDims.size(), storageDims.data());
        if (input[i].tensorDescName != "")
@@ -441,7 +441,7 @@ namespace at_npu
        aclDataInputBuffArr[i] = aclCreateDataBuffer(
            (void *)(aclInput->data_ptr()), aclInput->itemsize() * numel);
        inputDimsArr[i] = storageDims.size();
-        inputFormatsArr[i] = aclInput->storage().get_npu_desc().npu_format_;
+        inputFormatsArr[i] = torch_npu::NPUTensorImpl::GetStorageInfo(*aclInput)->npu_format_;
      }
      else if (
          input[i].tensorDescType ==
@@ -486,7 +486,7 @@ namespace at_npu
            aclOutput->scalar_type(), output[i].realDataType);
        auto dims = aclOutput->sizes();
-        auto storageDims = aclOutput->storage().get_npu_desc().storage_sizes_;
+        auto storageDims = torch_npu::NPUTensorImpl::GetStorageInfo(*aclOutput)->storage_sizes_;
        int64_t numel = 1;
        for (int j = 0; j < storageDims.size(); j++)
        {
@@ -497,16 +497,16 @@ namespace at_npu
            aclDataType,
            dims.size(),
            dims.data(),
-            aclOutput->storage().get_npu_desc().origin_format_);
+            torch_npu::NPUTensorImpl::GetStorageInfo(*aclOutput)->origin_format_);
        aclSetTensorFormat(
-            acl_tensor_desc, aclOutput->storage().get_npu_desc().npu_format_);
+            acl_tensor_desc, torch_npu::NPUTensorImpl::GetStorageInfo(*aclOutput)->npu_format_);
        aclSetTensorShape(
            acl_tensor_desc, storageDims.size(), storageDims.data());
        aclTensorOutputDescArr[i] = acl_tensor_desc;
        aclDataOutputBuffArr[i] = aclCreateDataBuffer(
            (void *)aclOutput->data_ptr(), aclOutput->itemsize() * numel);
        outputDimsArr[i] = storageDims.size();
aclOutput->storage().get_npu_desc().npu_format_; + outputFormatsArr[i] = torch_npu::NPUTensorImpl::GetStorageInfo(*aclOutput)->npu_format_; } params.input_num = inputNum; diff --git a/torch_npu/csrc/framework/utils/NpuUtils.cpp b/torch_npu/csrc/framework/utils/NpuUtils.cpp index a0efe852f7..df7e363511 100644 --- a/torch_npu/csrc/framework/utils/NpuUtils.cpp +++ b/torch_npu/csrc/framework/utils/NpuUtils.cpp @@ -87,15 +87,14 @@ namespace at_npu // (2) 4d format situation, only uncontiguous in Channel size // (3) size and start point must be 16*, make sure the memory be contiguous // std::cout<<"step in check5d5d Match."<npu_format_ != ACL_FORMAT_NC1HWC0) { return false; } @@ -129,9 +128,9 @@ namespace at_npu int64_t contiguous_len = 16; int64_t c0_len = 16; - for (int i = 2; i < npuDesc.base_sizes_.size(); i++) + for (int i = 2; i < npuDesc->base_sizes_.size(); i++) { - contiguous_len *= npuDesc.base_sizes_[i]; + contiguous_len *= npuDesc->base_sizes_[i]; } bool is_offset_match = (tensor.storage_offset() % contiguous_len == 0); bool is_length_match = (tensor.size(1) % c0_len == 0); @@ -151,12 +150,12 @@ namespace at_npu int64_t dim = 1; // 2. recovery the src tensor desc - const c10::NPUStorageDesc src_npuDesc = src.storage().get_npu_desc(); - src.set_(src.storage(), 0, src_npuDesc.base_sizes_, src_npuDesc.base_strides_); - at::Tensor src_tmp = src.reshape({src.size(0), src.size(1) / 16, src.size(2), src.size(3) * 16}); - src_tmp.storage().unsafeGetStorageImpl()->npu_desc_.base_sizes_ = src_tmp.sizes(); - src_tmp.storage().unsafeGetStorageImpl()->npu_desc_.base_strides_ = src_tmp.strides(); - src_tmp.storage().unsafeGetStorageImpl()->npu_desc_.storage_sizes_ = src_tmp.sizes(); + const auto* src_npuDesc = torch_npu::NPUTensorImpl::GetStorageInfo(src); + + src.set_(src.storage(), 0, src_npuDesc->base_sizes_, src_npuDesc->base_strides_); + at::Tensor src_tmp = src.reshape({src.size(0),src.size(1)/16,src.size(2),src.size(3)*16}); + StorageDescHelper::ReflushDescBySelf(src_tmp); + // std::cout << "src_tmp storage_offset(): " << src_tmp.storage_offset() << std::endl; // std::cout << "src_tmp sizes(): " << src_tmp.sizes() << std::endl; // std::cout << "src_tmp strides(): " << src_tmp.strides() << std::endl; @@ -192,23 +191,20 @@ namespace at_npu } void NpuUtils::RefreshFormat(const at::Tensor &tensor) { - auto &tensor_desc = tensor.storage().unsafeGetStorageImpl()->npu_desc_; - if (tensor_desc.storage_sizes_.size() == 4 && tensor_desc.npu_format_ == ACL_FORMAT_ND) - { - tensor_desc.npu_format_ = ACL_FORMAT_NCHW; - tensor_desc.origin_format_ = ACL_FORMAT_NCHW; - } - else if (tensor_desc.storage_sizes_.size() != 4 && tensor_desc.npu_format_ == ACL_FORMAT_NCHW) - { - tensor_desc.npu_format_ = ACL_FORMAT_ND; - tensor_desc.origin_format_ = ACL_FORMAT_ND; + auto* tensor_desc = torch_npu::NPUTensorImpl::GetStorageInfo(tensor); + if (tensor_desc->storage_sizes_.size() == 4 && tensor_desc->npu_format_ == ACL_FORMAT_ND) { + tensor_desc->npu_format_ = ACL_FORMAT_NCHW; + tensor_desc->origin_format_ = ACL_FORMAT_NCHW; + } else if (tensor_desc->storage_sizes_.size() != 4 && tensor_desc->npu_format_ == ACL_FORMAT_NCHW) { + tensor_desc->npu_format_ = ACL_FORMAT_ND; + tensor_desc->origin_format_ = ACL_FORMAT_ND; } } at::Tensor deal_with_5d_5d_match(const at::Tensor &src) { - auto src_desc = src.storage().unsafeGetStorageImpl()->npu_desc_; - at::Tensor src_new = at::empty_with_format(src_desc.base_sizes_, src.options(), ACL_FORMAT_NC1HWC0); + auto* src_desc = torch_npu::NPUTensorImpl::GetStorageInfo(src); + 
diff --git a/torch_npu/csrc/framework/utils/OpPreparation.cpp b/torch_npu/csrc/framework/utils/OpPreparation.cpp
index bf70fb7277..cc09d545a6 100644
--- a/torch_npu/csrc/framework/utils/OpPreparation.cpp
+++ b/torch_npu/csrc/framework/utils/OpPreparation.cpp
@@ -17,6 +17,7 @@
 #include "torch_npu/csrc/framework/FormatHelper.h"
 #include "torch_npu/csrc/framework/InferFormat.h"
 #include "torch_npu/csrc/framework/utils/CalcuOpUtil.h"
+#include "torch_npu/csrc/core/tensor_impl.h"

 namespace at_npu
 {
@@ -189,15 +190,15 @@ namespace at_npu

     at::Tensor OpPreparation::CastBackToOriFormat(const at::Tensor &tensor)
     {
-      auto &tensor_desc = tensor.storage().unsafeGetStorageImpl()->npu_desc_;
-      auto ret = NPUNativeFunctions::npu_format_cast(tensor, tensor_desc.origin_format_);
+      auto* tensor_desc = torch_npu::NPUTensorImpl::GetStorageInfo(tensor);
+      auto ret = NPUNativeFunctions::npu_format_cast(tensor, tensor_desc->origin_format_);
       return ret;
     }

     at::Tensor &OpPreparation::CastBackToOriFormat(at::Tensor &tensor)
     {
-      auto &tensor_desc = tensor.storage().unsafeGetStorageImpl()->npu_desc_;
-      tensor.npu_format_cast_(tensor_desc.origin_format_);
+      auto* tensor_desc = torch_npu::NPUTensorImpl::GetStorageInfo(tensor);
+      tensor.npu_format_cast_(tensor_desc->origin_format_);
       return tensor;
     }
-- 
Gitee
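NOTE (review aid): a second cleanup recurs through this patch. Every site that hand-reset base_sizes_, base_strides_, and storage_sizes_ after a reshape now calls the existing StorageDescHelper::ReflushDescBySelf helper instead. A minimal sketch of what such a refresh amounts to, assuming the descriptor should simply mirror the tensor's current metadata (all type names below are stand-ins):

    #include <vector>

    struct Desc {                        // stands in for NPUStorageInfo
      std::vector<long> base_sizes_, base_strides_, storage_sizes_;
    };

    struct Tensor {                      // minimal stand-in for at::Tensor
      std::vector<long> sizes, strides;
      Desc desc;
    };

    // Sketch of ReflushDescBySelf: derive the descriptor from the tensor
    // itself, replacing the three hand-written assignments at each call site.
    void reflush_desc_by_self(Tensor& t) {
      t.desc.base_sizes_ = t.sizes;      // base view shape = current shape
      t.desc.base_strides_ = t.strides;  // base strides = current strides
      t.desc.storage_sizes_ = t.sizes;   // physical shape = current shape
    }

Centralizing the refresh keeps the three fields from drifting out of sync when a new call site forgets one of them.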
From bb578c99cfc5f33dc5b47fc90307845bae4b0956 Mon Sep 17 00:00:00 2001
From: "zhousinan@huawei.com" <zhousinan@huawei.com>
Date: Fri, 28 Jan 2022 15:14:43 +0800
Subject: [PATCH 3/6] add torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp

---
 torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp | 4 ++--
 torch_npu/csrc/framework/OpParamMaker.h                | 2 +-
 torch_npu/csrc/framework/StorageDescHelper.cpp         | 9 +++++----
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp
index 900c664fc5..38c30fdf9e 100644
--- a/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp
@@ -66,9 +66,9 @@ namespace at_npu
         outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
     // use 5HD in Relu
-    if ((torch_npu::NPUTensorImpl::GetStorageInfo(grad_output)->npu_format_ == 
+    if ((torch_npu::NPUTensorImpl::GetStorageInfo(grad_output)->npu_format_ ==
          ACL_FORMAT_NCHW) &&
-        (torch_npu::NPUTensorImpl::GetStorageInfo(self)->npu_format_ == 
+        (torch_npu::NPUTensorImpl::GetStorageInfo(self)->npu_format_ ==
          ACL_FORMAT_NC1HWC0))
     {
       at::Tensor grad_output_5HD =
diff --git a/torch_npu/csrc/framework/OpParamMaker.h b/torch_npu/csrc/framework/OpParamMaker.h
index 7163626aaf..fea67d7497 100644
--- a/torch_npu/csrc/framework/OpParamMaker.h
+++ b/torch_npu/csrc/framework/OpParamMaker.h
@@ -17,13 +17,13 @@
 #define __PULGIN_NATIVE_UTILS_OP_PARAM_MAKER__

 #include <ATen/ATen.h>
+#include <c10/npu/NPUCachingAllocator.h>

 #include "third_party/acl/inc/acl/acl_base.h"
 #include "torch_npu/csrc/framework/interface/AclOpCompileInterface.h"
 #include "torch_npu/csrc/framework/NPUDefine.h"
 #include "torch_npu/csrc/framework/interface/Graph.h"
 #include "torch_npu/csrc/core/tensor_impl.h"
-#include "c10/npu/NPUCachingAllocator.h"

 namespace at_npu
 {
diff --git a/torch_npu/csrc/framework/StorageDescHelper.cpp b/torch_npu/csrc/framework/StorageDescHelper.cpp
index a43e24d28b..e438cdb7e1 100644
--- a/torch_npu/csrc/framework/StorageDescHelper.cpp
+++ b/torch_npu/csrc/framework/StorageDescHelper.cpp
@@ -106,7 +106,7 @@ namespace at_npu
    }

    void StorageDescHelper::SetDesc(at::Tensor& dst, c10::IntArrayRef size, c10::IntArrayRef strides)
-    { 
+    {
      if (dst.storage().unsafeGetStorageImpl()->data() == nullptr) {
        return;
      }
@@ -116,7 +116,7 @@ namespace at_npu

    void StorageDescHelper::SetDesc(at::Tensor &dst, c10::IntArrayRef size, c10::IntArrayRef strides, aclFormat format)
    {
-      if (dst.storage().unsafeGetStorageImpl()->data() == nullptr) { 
+      if (dst.storage().unsafeGetStorageImpl()->data() == nullptr) {
        return;
      }
@@ -133,7 +133,8 @@ namespace at_npu
      CopyDesc(dst, *torch_npu::NPUTensorImpl::GetStorageInfo(*src.unsafeGetStorageImpl()));
    }

-    void StorageDescHelper::CopyDesc(const at::Tensor& dst, const c10::npu::NPUCachingAllocator::NPUStorageInfo& src_desc) {
+    void StorageDescHelper::CopyDesc(const at::Tensor& dst, const c10::npu::NPUCachingAllocator::NPUStorageInfo& src_desc)
+    {
      if (dst.storage().unsafeGetStorageImpl()->data() == nullptr) {
        return;
      }
@@ -143,7 +144,7 @@ namespace at_npu
    }

    void StorageDescHelper::ReflushDescBySelf(const at::Tensor& src)
-    { 
+    {
      if (src.storage().unsafeGetStorageImpl()->data() == nullptr) {
        return;
      }
-- 
Gitee
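NOTE (review aid): the ThresholdBackwardKernelNpu hunk in patch 3 is whitespace-only, but it sits on the interesting branch: when the saved input is already NC1HWC0 ("5HD") while the incoming gradient is plain NCHW, the gradient is promoted so the backward Relu runs entirely in 5HD. Also note that the two angle-bracket include lines in the OpParamMaker.h hunk lost their targets in extraction; they are reconstructed above as <ATen/ATen.h> and <c10/npu/NPUCachingAllocator.h> and should be read as best-effort guesses. Schematically, the promotion decision looks like this (the enum and function are placeholders; the real format codes come from ACL):

    // Placeholder model of the "use 5HD in Relu" branch condition.
    enum class Format { NCHW, NC1HWC0 };

    bool should_promote_grad_to_5hd(Format grad_output_fmt, Format self_fmt) {
      // Promote only on the exact NCHW-gradient / 5HD-input mismatch;
      // every other combination takes the ordinary path.
      return grad_output_fmt == Format::NCHW && self_fmt == Format::NC1HWC0;
    }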
"torch_npu/csrc/framework/interface/Graph.h" #include "torch_npu/csrc/core/tensor_impl.h" -#include "c10/npu/NPUCachingAllocator.h" namespace at_npu { diff --git a/torch_npu/csrc/framework/StorageDescHelper.cpp b/torch_npu/csrc/framework/StorageDescHelper.cpp index a43e24d28b..e438cdb7e1 100644 --- a/torch_npu/csrc/framework/StorageDescHelper.cpp +++ b/torch_npu/csrc/framework/StorageDescHelper.cpp @@ -106,7 +106,7 @@ namespace at_npu } void StorageDescHelper::SetDesc(at::Tensor& dst, c10::IntArrayRef size, c10::IntArrayRef strides) - { + { if (dst.storage().unsafeGetStorageImpl()->data() == nullptr) { return; } @@ -116,7 +116,7 @@ namespace at_npu void StorageDescHelper::SetDesc(at::Tensor &dst, c10::IntArrayRef size, c10::IntArrayRef strides, aclFormat format) { - if (dst.storage().unsafeGetStorageImpl()->data() == nullptr) { + if (dst.storage().unsafeGetStorageImpl()->data() == nullptr) { return; } @@ -133,7 +133,8 @@ namespace at_npu CopyDesc(dst, *torch_npu::NPUTensorImpl::GetStorageInfo(*src.unsafeGetStorageImpl())); } - void StorageDescHelper::CopyDesc(const at::Tensor& dst, const c10::npu::NPUCachingAllocator::NPUStorageInfo& src_desc) { + void StorageDescHelper::CopyDesc(const at::Tensor& dst, const c10::npu::NPUCachingAllocator::NPUStorageInfo& src_desc) + { if (dst.storage().unsafeGetStorageImpl()->data() == nullptr) { return; } @@ -143,7 +144,7 @@ namespace at_npu } void StorageDescHelper::ReflushDescBySelf(const at::Tensor& src) - { + { if (src.storage().unsafeGetStorageImpl()->data() == nullptr) { return; } -- Gitee From b4966d023d890f2b9867e4a6279af0e4dbc44ccc Mon Sep 17 00:00:00 2001 From: "zhousinan@huawei.com" Date: Sat, 29 Jan 2022 10:07:04 +0800 Subject: [PATCH 4/6] add torch_npu/csrc/framework/OpParamMaker.h --- torch_npu/csrc/framework/OpParamMaker.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torch_npu/csrc/framework/OpParamMaker.h b/torch_npu/csrc/framework/OpParamMaker.h index e2b18e9309..37ab2b2bb3 100644 --- a/torch_npu/csrc/framework/OpParamMaker.h +++ b/torch_npu/csrc/framework/OpParamMaker.h @@ -21,6 +21,7 @@ #include "third_party/acl/inc/acl/acl_base.h" #include "torch_npu/csrc/framework/interface/AclOpCompileInterface.h" +#include "torch_npu/csrc/framework/NPUDefine.h" #include "torch_npu/csrc/core/tensor_impl.h" namespace at_npu @@ -66,7 +67,7 @@ namespace at_npu AclTensorDescMaker() {} ~AclTensorDescMaker() = default; - AclTensorDescMaker &Create(aclDataType dataType, c10::NPUStorageDesc storageDesc) + AclTensorDescMaker &Create(aclDataType dataType, c10::npu::NPUCachingAllocator::NPUStorageDesc storageDesc) { auto dims = storageDesc.base_sizes_; auto format = storageDesc.origin_format_; -- Gitee From 8ff1f92cb8ded363abeb6161c55219c9b6669c88 Mon Sep 17 00:00:00 2001 From: "zhousinan@huawei.com" Date: Sat, 29 Jan 2022 10:11:21 +0800 Subject: [PATCH 5/6] add torch_npu/csrc/framework/OpParamMaker.h --- torch_npu/csrc/framework/OpParamMaker.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_npu/csrc/framework/OpParamMaker.h b/torch_npu/csrc/framework/OpParamMaker.h index 37ab2b2bb3..545d658b19 100644 --- a/torch_npu/csrc/framework/OpParamMaker.h +++ b/torch_npu/csrc/framework/OpParamMaker.h @@ -67,7 +67,7 @@ namespace at_npu AclTensorDescMaker() {} ~AclTensorDescMaker() = default; - AclTensorDescMaker &Create(aclDataType dataType, c10::npu::NPUCachingAllocator::NPUStorageDesc storageDesc) + AclTensorDescMaker &Create(aclDataType dataType, c10::npu::NPUCachingAllocator::NPUStorageInfo storageDesc) { auto dims = 
From 27d25fadb09f91f1fa281f7415806ff22f3acfb3 Mon Sep 17 00:00:00 2001
From: "zhousinan@huawei.com" <zhousinan@huawei.com>
Date: Mon, 7 Feb 2022 15:12:22 +0800
Subject: [PATCH 6/6] add torch_npu/csrc/aten/common/TensorShape.cpp

---
 torch_npu/csrc/aten/common/TensorShape.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/torch_npu/csrc/aten/common/TensorShape.cpp b/torch_npu/csrc/aten/common/TensorShape.cpp
index 4b87048538..ccb29842e4 100644
--- a/torch_npu/csrc/aten/common/TensorShape.cpp
+++ b/torch_npu/csrc/aten/common/TensorShape.cpp
@@ -33,6 +33,7 @@
 #include "torch_npu/csrc/framework/InferFormat.h"
 #include "torch_npu/csrc/aten/common/FormatCastHelper.h"
 #include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+#include "torch_npu/csrc/core/tensor_impl.h"

 namespace at_npu {
 namespace native {
@@ -52,9 +53,8 @@ at::Tensor alias_with_sizes_and_strides_npu(
     impl->set_sizes_and_strides(sizes, strides);
     self_ = at::Tensor(std::move(impl));
   } else {
-    auto impl = c10::make_intrusive<at::TensorImpl>(
+    auto impl = c10::make_intrusive<torch_npu::NPUTensorImpl>(
        c10::Storage(self.storage()),
-       self.key_set(),
        self.dtype());
     impl->set_storage_offset(self.storage_offset());
     impl->set_sizes_and_strides(sizes, strides);
@@ -91,9 +91,8 @@ at::Tensor NPUNativeFunctions::as_strided(
     dst = FormatCastHelper::ApplyBaseFormatTensorBy(dst);
   }
   auto storage_offset = storage_offset_.value_or(dst.storage_offset());
-  auto result = at::detail::make_tensor<at::TensorImpl>(
+  auto result = at::detail::make_tensor<torch_npu::NPUTensorImpl>(
      c10::Storage(dst.storage()),
-     dst.key_set(),
      dst.dtype());
   at::native::setStrided(result, size, stride, storage_offset);
   return result;
-- 
Gitee
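NOTE (review aid): patch 6 drops the explicit key_set() argument when building the aliased tensor, which only works if torch_npu::NPUTensorImpl installs its own dispatch keys in its constructor. The template arguments in the two rewritten calls were lost in extraction and are reconstructed above (at::TensorImpl before, torch_npu::NPUTensorImpl after); treat that as an editorial assumption. A self-contained model of the ownership shift follows, with every type a stand-in rather than the real c10 API:

    #include <cassert>
    #include <memory>

    enum class Key { CPU, NPU };

    struct BaseImpl {                       // stands in for at::TensorImpl
      Key key;
      explicit BaseImpl(Key k) : key(k) {}  // caller must thread the key through
    };

    struct NpuImpl : BaseImpl {             // stands in for torch_npu::NPUTensorImpl
      NpuImpl() : BaseImpl(Key::NPU) {}     // fixes its own key; callers pass nothing
    };

    int main() {
      auto old_style = std::make_shared<BaseImpl>(Key::NPU);  // key chosen at call site
      auto new_style = std::make_shared<NpuImpl>();           // key owned by the impl
      assert(old_style->key == new_style->key);
      return 0;
    }

Moving the key into the impl means no call site can accidentally alias an NPU storage under the wrong dispatch key.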