From 65817da2c10d71237deac5a158d6b8abcdfaf427 Mon Sep 17 00:00:00 2001
From: "zhousinan@huawei.com"
Date: Tue, 25 Jan 2022 16:55:48 +0800
Subject: [PATCH 1/6] add torch_npu/csrc/aten/common/CopyKernelNpu.cpp

---
 torch_npu/csrc/aten/common/CopyKernelNpu.cpp  |  8 +-
 .../csrc/aten/common/CopyMemoryKernel.cpp     | 12 +--
 .../csrc/aten/common/FormatCastHelper.cpp     | 13 +--
 .../csrc/aten/common/FormatCastKernelNpu.cpp  | 19 ++---
 .../csrc/aten/common/LocalScalarDenseNpu.cpp  |  5 +-
 torch_npu/csrc/aten/common/ResizeNpu.h        |  4 +-
 .../csrc/aten/common/TensorFactories.cpp      |  7 +-
 torch_npu/csrc/aten/ops/AddKernelNpu.cpp      |  5 +-
 torch_npu/csrc/aten/ops/MmKernelNpu.cpp       | 13 +--
 .../aten/ops/ThresholdBackwardKernelNpu.cpp   |  4 +-
 torch_npu/csrc/core/tensor_impl.cpp           | 12 +++
 torch_npu/csrc/core/tensor_impl.h             |  6 ++
 torch_npu/csrc/distributed/Init.cpp           |  3 +-
 .../csrc/distributed/ProcessGroupHCCL.cpp     |  3 +-
 torch_npu/csrc/distributed/reducer.cpp        | 12 +--
 torch_npu/csrc/framework/FormatHelper.cpp     | 10 +--
 torch_npu/csrc/framework/FormatHelper.h       |  4 +-
 torch_npu/csrc/framework/InferFormat.cpp      | 10 +--
 torch_npu/csrc/framework/OpCmdHelper.cpp      | 20 ++---
 torch_npu/csrc/framework/OpParamMaker.h       |  4 +-
 .../csrc/framework/StorageDescHelper.cpp      | 80 ++++++++++++-------
 torch_npu/csrc/framework/StorageDescHelper.h  | 16 ++--
 .../framework/contiguous/ContiguousOpt.cpp    |  6 +-
 .../csrc/framework/contiguous/ReshapeOpt.cpp  | 10 +--
 .../framework/contiguous/combined_opt.cpp     | 28 +++----
 .../framework/contiguous/indexing_opt.cpp     | 15 ++--
 .../csrc/framework/contiguous/permute_opt.cpp | 14 ++--
 .../framework/contiguous/reshapeV2_opt.cpp    |  6 +-
 .../csrc/framework/contiguous/reshape_opt.cpp |  2 +-
 .../csrc/framework/contiguous/select_opt.cpp  |  7 +-
 .../csrc/framework/contiguous/slice_opt.cpp   |  8 +-
 .../csrc/framework/contiguous/unfold_opt.cpp  |  7 +-
 32 files changed, 211 insertions(+), 162 deletions(-)

diff --git a/torch_npu/csrc/aten/common/CopyKernelNpu.cpp b/torch_npu/csrc/aten/common/CopyKernelNpu.cpp
index a283c31d63..3525be440a 100644
--- a/torch_npu/csrc/aten/common/CopyKernelNpu.cpp
+++ b/torch_npu/csrc/aten/common/CopyKernelNpu.cpp
@@ -68,8 +68,8 @@ void copy_kernel_npu(
   at::Tensor attrTensor = CalcuOpUtil::copy_tensor_host_to_device(
       at::from_blob(value.data(), {value.size()}, dtype(at::ScalarType::Long)));
 
-  auto src_desc_bp = src.storage().get_npu_desc();
-  auto self_desc_bp = self.storage().get_npu_desc();
+  auto* src_desc_bp = torch_npu::NPUTensorImpl::GetStorageInfo(src);
+  auto* self_desc_bp = torch_npu::NPUTensorImpl::GetStorageInfo(self);
 
   // The action of PTcopy_ is defined by attrTensor, so the member of NPUStorageDesc
   // can not affect the result, but the PTcopy_ will check base_size and storage_size,
@@ -83,8 +83,8 @@ void copy_kernel_npu(
 
   CalcuOpUtil::execute_npu_operate("PTcopy_", inputs, outputs, {});
 
-  StorageDescHelper::CopyDesc(src, src_desc_bp);
-  StorageDescHelper::CopyDesc(self, self_desc_bp);
+  StorageDescHelper::CopyDesc(src, *src_desc_bp);
+  StorageDescHelper::CopyDesc(self, *self_desc_bp);
 }
 
 // the dst and src are same dtype
diff --git a/torch_npu/csrc/aten/common/CopyMemoryKernel.cpp b/torch_npu/csrc/aten/common/CopyMemoryKernel.cpp
index d39c5032e0..933701445c 100644
--- a/torch_npu/csrc/aten/common/CopyMemoryKernel.cpp
+++ b/torch_npu/csrc/aten/common/CopyMemoryKernel.cpp
@@ -38,27 +38,27 @@ at::Tensor& NPUNativeFunctions::copy_memory_(at::Tensor& self, const at::Tensor&
   AT_ASSERT(
       src.device().index() == self.device().index(),
       "input tensors of copy_memory_ should have same device index");
-  auto dst_desc = self.storage().unsafeGetStorageImpl()->npu_desc_;
-  auto src_desc = src.storage().unsafeGetStorageImpl()->npu_desc_;
+  auto* dst_desc = torch_npu::NPUTensorImpl::GetStorageInfo(self);
+  auto* src_desc = torch_npu::NPUTensorImpl::GetStorageInfo(src);
 
   int dst_size = 0;
   int src_size = 0;
 
   if (FormatHelper::IsPadded(&self)) {
     AT_ASSERT(self.storage_offset() == 0);
-    dst_size = at::prod_intlist(dst_desc.storage_sizes_);
+    dst_size = at::prod_intlist(dst_desc->storage_sizes_);
   } else {
     auto dst_element = at::prod_intlist(self.sizes());
-    auto dst_storage = at::prod_intlist(dst_desc.storage_sizes_);
+    auto dst_storage = at::prod_intlist(dst_desc->storage_sizes_);
     dst_size = (dst_element > dst_storage) ? dst_storage : dst_element;
   }
 
   if (FormatHelper::IsPadded(&src)) {
     AT_ASSERT(src.storage_offset() == 0);
-    src_size = at::prod_intlist(src_desc.storage_sizes_);
+    src_size = at::prod_intlist(src_desc->storage_sizes_);
   } else {
     auto src_element = at::prod_intlist(src.sizes());
-    auto src_storage = at::prod_intlist(src_desc.storage_sizes_);
+    auto src_storage = at::prod_intlist(src_desc->storage_sizes_);
     src_size = (src_element > src_storage) ? src_storage : src_element;
   }
 
diff --git a/torch_npu/csrc/aten/common/FormatCastHelper.cpp b/torch_npu/csrc/aten/common/FormatCastHelper.cpp
index aa98978e8a..52e180146f 100644
--- a/torch_npu/csrc/aten/common/FormatCastHelper.cpp
+++ b/torch_npu/csrc/aten/common/FormatCastHelper.cpp
@@ -13,16 +13,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "torch_npu/csrc/framework/FormatHelper.h"
 #include "torch_npu/csrc/aten/common/FormatCastHelper.h"
+#include "torch_npu/csrc/framework/FormatHelper.h"
 #include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+#include "torch_npu/csrc/core/tensor_impl.h"
 
 namespace at_npu {
 namespace native {
 
 bool FormatCastHelper::IsSameGroupType(const at::Tensor& src, const at::Tensor& dst) {
-  auto src_format = src.storage().get_npu_desc().npu_format_;
-  auto dst_format = dst.storage().get_npu_desc().npu_format_;
+  auto src_format = torch_npu::NPUTensorImpl::GetStorageInfo(src)->npu_format_;
+  auto dst_format = torch_npu::NPUTensorImpl::GetStorageInfo(dst)->npu_format_;
   return FormatHelper::GetBaseFormat(src_format) == FormatHelper::GetBaseFormat(dst_format);
 }
 
@@ -35,12 +36,12 @@ void FormatCastHelper::format_cast_as_base_format(const at::Tensor& src, aclForm
   AT_ASSERT(FormatHelper::IsBaseFormatType(format), "dst format must be base format");
   AT_ASSERT(FormatHelper::IsBaseFormatType(src), "src format must be base format");
 
-  auto& src_desc = src.storage().unsafeGetStorageImpl()->npu_desc_;
+  auto* src_desc = torch_npu::NPUTensorImpl::GetStorageInfo(src);
   // due to CANN principle : if the ori format of a tensor is the
   // same as the npu format, then its base shape must be same as storage shape
   // so we should not change the storage shape when format cast between base format
-  src_desc.origin_format_ = format;
-  src_desc.npu_format_ = format;
+  src_desc->origin_format_ = format;
+  src_desc->npu_format_ = format;
   return;
 }
 
diff --git a/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp b/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp
index 2139cfdb4a..2be6a058e4 100644
--- a/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp
+++ b/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp
@@ -45,9 +45,9 @@ at::Tensor format_cast_impl_out_npu(at::Tensor& dst, const at::Tensor& src) {
 
 // convert src from src_format to dst_format, write the result into dst
 at::Tensor& NPUNativeFunctions::npu_format_cast_(at::Tensor& dst, const at::Tensor& src) {
-  c10::NPUStorageDesc src_desc = src.storage().unsafeGetStorageImpl()->npu_desc_;
-  c10::NPUStorageDesc dst_desc = dst.storage().unsafeGetStorageImpl()->npu_desc_;
-  if (src_desc.npu_format_ == dst_desc.npu_format_) {
+  auto* src_desc = torch_npu::NPUTensorImpl::GetStorageInfo(src);
+  auto* dst_desc = torch_npu::NPUTensorImpl::GetStorageInfo(dst);
+  if (src_desc->npu_format_ == dst_desc->npu_format_) {
     dst.copy_(src);
     return dst;
   }
@@ -62,8 +62,9 @@ at::Tensor& NPUNativeFunctions::npu_format_cast_(at::Tensor& dst, const at::Tens
 at::Tensor NPUNativeFunctions::npu_format_cast(
     const at::Tensor& src,
     int64_t acl_format) {
-  c10::NPUStorageDesc src_desc = src.storage().unsafeGetStorageImpl()->npu_desc_;
-  if (src_desc.npu_format_ == acl_format) {
+
+  auto* src_desc = torch_npu::NPUTensorImpl::GetStorageInfo(src);
+  if (src_desc->npu_format_ == acl_format) {
     NPU_LOGD("no need to do format cast");
     return src;
   }
@@ -77,7 +78,7 @@ at::Tensor NPUNativeFunctions::npu_format_cast(
       "can not cast format when src is not float32 or float16");
 
   at::Tensor dst = at::empty_with_format(
-      src_desc.base_sizes_, src.options(), acl_format);
+      src_desc->base_sizes_, src.options(), acl_format);
 
   // calculate the output result of the NPU
   format_cast_impl_out_npu(dst, src);
@@ -92,8 +93,8 @@ at::Tensor NPUNativeFunctions::npu_format_cast(
 at::Tensor& NPUNativeFunctions::npu_format_cast_(
     at::Tensor& src,
     int64_t acl_format) {
-  c10::NPUStorageDesc src_desc = src.storage().unsafeGetStorageImpl()->npu_desc_;
-  if (src_desc.npu_format_ == acl_format) {
+  auto* src_desc = torch_npu::NPUTensorImpl::GetStorageInfo(src);
+  if (src_desc->npu_format_ == acl_format) {
     return src;
   }
   if (FormatHelper::IsBaseFormatType(src) &&
@@ -106,7 +107,7 @@ at::Tensor& NPUNativeFunctions::npu_format_cast_(
       "can not cast format when src is not float32 or float16");
 
   at::Tensor dst = at::empty_with_format(
-      src_desc.base_sizes_, src.options(), acl_format);
+      src_desc->base_sizes_, src.options(), acl_format);
 
   // calculate the output result of the NPU
   format_cast_impl_out_npu(dst, src);
diff --git a/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp b/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp
index 8d006e1055..91c3ea0c16 100644
--- a/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp
+++ b/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp
@@ -17,9 +17,8 @@
 #include
 #include
 #include
-
-#include "third_party/acl/inc/acl/acl_base.h"
-#include "third_party/acl/inc/acl/acl_rt.h"
+#include
+#include
 #include "torch_npu/csrc/aten/NPUNativeFunctions.h"
 
 namespace at_npu {
diff --git a/torch_npu/csrc/aten/common/ResizeNpu.h b/torch_npu/csrc/aten/common/ResizeNpu.h
index 2f24a77e7b..2b8fed8e42 100644
--- a/torch_npu/csrc/aten/common/ResizeNpu.h
+++ b/torch_npu/csrc/aten/common/ResizeNpu.h
@@ -42,7 +42,9 @@ static void storage_resize_npu(
   at::DataPtr old_data = storage.set_data_ptr(std::move(new_data));
   ptrdiff_t old_size = storage.nbytes();
   storage.set_nbytes(size);
-  StorageDescHelper::UpdateDesc(storage.npu_desc_, new_size);
+
+  auto* src_desc = torch_npu::NPUTensorImpl::GetStorageInfo(storage);
+  StorageDescHelper::UpdateDesc(*src_desc, new_size);
 
   if (old_data != nullptr) {
     ptrdiff_t copy_size = old_size;
diff --git a/torch_npu/csrc/aten/common/TensorFactories.cpp b/torch_npu/csrc/aten/common/TensorFactories.cpp
index 3d95b195a1..2960e528f6 100644
--- a/torch_npu/csrc/aten/common/TensorFactories.cpp
+++ b/torch_npu/csrc/aten/common/TensorFactories.cpp
@@ -228,8 +228,7 @@ namespace at_npu
       }
       else
       {
-        auto npu_format =
-            self.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_;
+        auto npu_format = torch_npu::NPUTensorImpl::GetStorageInfo(self)->npu_format_;
         result = at::empty_with_format(self.sizes(), self.options(), npu_format);
       }
     }
@@ -656,9 +655,9 @@ namespace at_npu
 
    at::Tensor NPUNativeFunctions::clone(const at::Tensor &src, c10::optional format)
    {
-      auto desc = src.storage().unsafeGetStorageImpl()->npu_desc_;
+      auto* desc = torch_npu::NPUTensorImpl::GetStorageInfo(src);
      auto formatSelf = OpPreparation::ApplyTensorWithFormat(
-          src.sizes(), src.options(), desc.npu_format_);
+          src.sizes(), src.options(), desc->npu_format_);
      if (try_to_optimize_copy_with_any_format(formatSelf, src))
      {
        return formatSelf;
diff --git a/torch_npu/csrc/aten/ops/AddKernelNpu.cpp b/torch_npu/csrc/aten/ops/AddKernelNpu.cpp
index 363599e38a..cee30addb9 100644
--- a/torch_npu/csrc/aten/ops/AddKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/AddKernelNpu.cpp
@@ -21,6 +21,7 @@
 #include "torch_npu/csrc/framework/utils/CalcuOpUtil.h"
 #include "torch_npu/csrc/framework/utils/OpAdapter.h"
 #include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+#include "torch_npu/csrc/core/tensor_impl.h"
 
 namespace at_npu
 {
@@ -169,9 +170,9 @@ namespace at_npu
      }
      else
      {
-        c10::NPUStorageDesc src_desc = src.storage().unsafeGetStorageImpl()->npu_desc_;
+        auto* src_desc = torch_npu::NPUTensorImpl::GetStorageInfo(src);
        at::Tensor src_new = at::empty_with_format(
-            src_desc.base_sizes_, src.options(), ACL_FORMAT_NC1HWC0);
+            src_desc->base_sizes_, src.options(), ACL_FORMAT_NC1HWC0);
        src_new.set_(
            src.storage(),
            src_new.storage_offset(),
diff --git a/torch_npu/csrc/aten/ops/MmKernelNpu.cpp b/torch_npu/csrc/aten/ops/MmKernelNpu.cpp
index e836bee17c..944bdea0f7 100644
--- a/torch_npu/csrc/aten/ops/MmKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/MmKernelNpu.cpp
@@ -20,6 +20,7 @@
 #include "torch_npu/csrc/framework/utils/OpAdapter.h"
 #include "torch_npu/csrc/framework/StorageDescHelper.h"
 #include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+#include "torch_npu/csrc/core/tensor_impl.h"
 
 namespace at_npu
 {
@@ -45,7 +46,7 @@ Return:
      return false;
    }
    int64_t numel = 1;
-    auto storageSize = tensor.storage().get_npu_desc().storage_sizes_;
+    auto storageSize = torch_npu::NPUTensorImpl::GetStorageInfo(tensor)->storage_sizes_;
 
    for (int i = 0; i < storageSize.size(); i++)
    {
@@ -71,7 +72,7 @@ Return:
      const at::Tensor &tensor,
      bool is_transpose_flex)
  {
-    auto base_sizes = tensor.storage().get_npu_desc().base_sizes_;
+    auto base_sizes = torch_npu::NPUTensorImpl::GetStorageInfo(tensor)->base_sizes_;
    if (is_transpose_flex && base_sizes.size() == tensor.dim() &&
        tensor.size(-1) == base_sizes[tensor.dim() - 2] &&
        tensor.size(-2) == base_sizes[tensor.dim() - 1])
@@ -95,8 +96,8 @@ Return:
  {
    at::Tensor contiguousResult = result.is_contiguous() ? result : result.contiguous();
 
-    c10::NPUStorageDesc self_desc = self.storage().get_npu_desc();
-    c10::NPUStorageDesc mat2_desc = mat2.storage().get_npu_desc();
+    auto* self_desc = torch_npu::NPUTensorImpl::GetStorageInfo(self);
+    auto* mat2_desc = torch_npu::NPUTensorImpl::GetStorageInfo(mat2);
    bool isSelfT_flex = is_transpose_last_two_dims_flex(self);
    bool isMat2T_flex = is_transpose_last_two_dims_flex(mat2);
    bool isSelfT_strict = is_transpose_last_two_dims_strict(self, isSelfT_flex);
@@ -159,11 +160,11 @@ Return:
    // set_transposed_npu_desc
    if (isSelfT_flex && (!isSelfT_strict))
    {
-      self.storage().unsafeGetStorageImpl()->npu_desc_ = self_desc;
+      *torch_npu::NPUTensorImpl::GetStorageInfo(self) = *self_desc;
    }
    if (isMat2T_flex && (!isMat2T_strict))
    {
-      mat2.storage().unsafeGetStorageImpl()->npu_desc_ = mat2_desc;
+      *torch_npu::NPUTensorImpl::GetStorageInfo(mat2) = *mat2_desc;
    }
 
    if (!result.is_contiguous())
diff --git a/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp
index 537bcc2444..900c664fc5 100644
--- a/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp
@@ -66,9 +66,9 @@ namespace at_npu
        outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
 
    // use 5HD in Relu
-    if ((grad_output.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_ ==
+    if ((torch_npu::NPUTensorImpl::GetStorageInfo(grad_output)->npu_format_ ==
         ACL_FORMAT_NCHW) &&
-        (self.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_ ==
+        (torch_npu::NPUTensorImpl::GetStorageInfo(self)->npu_format_ ==
         ACL_FORMAT_NC1HWC0))
    {
      at::Tensor grad_output_5HD =
diff --git a/torch_npu/csrc/core/tensor_impl.cpp b/torch_npu/csrc/core/tensor_impl.cpp
index f9cfa14ff1..d7f670534b 100644
--- a/torch_npu/csrc/core/tensor_impl.cpp
+++ b/torch_npu/csrc/core/tensor_impl.cpp
@@ -83,4 +83,16 @@ namespace torch_npu
    return impl;
  }
 
+c10::npu::NPUCachingAllocator::NPUStorageInfo* NPUTensorImpl::GetStorageInfo(const at::Tensor& src) {
+  return c10::npu::NPUCachingAllocator::getStorageInfo(src.storage().unsafeGetStorageImpl()->data());
+}
+
+c10::npu::NPUCachingAllocator::NPUStorageInfo* NPUTensorImpl::GetStorageInfo(at::Tensor& src) {
+  return c10::npu::NPUCachingAllocator::getStorageInfo(src.storage().unsafeGetStorageImpl()->data());
+}
+
+c10::npu::NPUCachingAllocator::NPUStorageInfo* NPUTensorImpl::GetStorageInfo(at::StorageImpl& src) {
+  return c10::npu::NPUCachingAllocator::getStorageInfo(src.data());
+}
+
 }
diff --git a/torch_npu/csrc/core/tensor_impl.h b/torch_npu/csrc/core/tensor_impl.h
index f37dcabf6e..10c9e6f0e1 100644
--- a/torch_npu/csrc/core/tensor_impl.h
+++ b/torch_npu/csrc/core/tensor_impl.h
@@ -19,6 +19,7 @@
 
 #include
 #include
+#include "c10/npu/NPUCachingAllocator.h"
 
 namespace torch_npu
 {
@@ -45,6 +46,11 @@ public:
      c10::VariableVersion&& version_counter,
      bool allow_tensor_metadata_change) const final;
 
+
+  static c10::npu::NPUCachingAllocator::NPUStorageInfo* GetStorageInfo(at::Tensor& src);
+  static c10::npu::NPUCachingAllocator::NPUStorageInfo* GetStorageInfo(const at::Tensor& src);
+  static c10::npu::NPUCachingAllocator::NPUStorageInfo* GetStorageInfo(at::StorageImpl& src);
+
 public:
  NPUTensorImpl(const NPUTensorImpl&) = delete;
  NPUTensorImpl& operator=(const NPUTensorImpl&) = delete;
diff --git a/torch_npu/csrc/distributed/Init.cpp b/torch_npu/csrc/distributed/Init.cpp
index 951ba59607..e0d6521bd6 100644
--- a/torch_npu/csrc/distributed/Init.cpp
+++ b/torch_npu/csrc/distributed/Init.cpp
@@ -33,6 +33,7 @@
 #include "torch_npu/csrc/distributed/ProcessGroupHCCL.hpp"
 #include "torch_npu/csrc/distributed/Init.h"
 #include "torch_npu/csrc/distributed/reducer.hpp"
+#include "torch_npu/csrc/core/tensor_impl.h"
 
 namespace torch_npu
 {
@@ -48,7 +49,7 @@ class BroadcastWork {
 public:
  inline std::vector cast_tensors(at::TensorList tensors) {
    static auto cast_back_to_ori_format = [](const at::Tensor &t) {
-      return t.npu_format_cast(t.storage().unsafeGetStorageImpl()->npu_desc_.origin_format_);
+      return t.npu_format_cast(torch_npu::NPUTensorImpl::GetStorageInfo(t)->origin_format_);
    };
    return c10::fmap(tensors, cast_back_to_ori_format);
  }
diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
index 77c68f31ff..f508b7102c 100644
--- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
+++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
@@ -27,6 +27,7 @@
 #include "torch_npu/csrc/distributed/ProcessGroupHCCL.hpp"
 #include "third_party/acl/inc/acl/acl.h"
 #include "third_party/acl/inc/acl/acl_base.h"
+#include "torch_npu/csrc/core/tensor_impl.h"
 
 namespace c10d_npu {
 namespace {
@@ -54,7 +55,7 @@ std::map hcclDataType = {
 };
 
 int64_t physical_numel(at::Tensor self){
-  auto sizes = self.storage().unsafeGetStorageImpl()->npu_desc_.storage_sizes_;
+  auto sizes = torch_npu::NPUTensorImpl::GetStorageInfo(self)->storage_sizes_;
  int64_t n = 1;
  for (auto s : sizes) {
    n *= s;
diff --git a/torch_npu/csrc/distributed/reducer.cpp b/torch_npu/csrc/distributed/reducer.cpp
index b31bd008d0..7cc571d677 100644
--- a/torch_npu/csrc/distributed/reducer.cpp
+++ b/torch_npu/csrc/distributed/reducer.cpp
@@ -29,7 +29,7 @@
 #include
 #include
 #include
-
+#include "torch_npu/csrc/core/tensor_impl.h"
 #include "torch_npu/csrc/distributed/reducer.hpp"
 
 namespace c10d_npu {
@@ -37,7 +37,7 @@ namespace {
 
 int64_t physical_numel(at::Tensor self){
-  auto sizes = self.storage().unsafeGetStorageImpl()->npu_desc_.storage_sizes_;
+  auto sizes = torch_npu::NPUTensorImpl::GetStorageInfo(self)->storage_sizes_;
  int64_t n = 1;
  for (auto s : sizes) {
    n *= s;
@@ -439,10 +439,10 @@ void Reducer::mark_variable_ready_dense(VariableIndex index) {
      // previous iterations, no copy is needed.
      if (!grad.is_alias_of(bucket_view)) {
        // make sure grad has the same format as variable
-        if (grad.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_ !=
-            variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_) {
+        if (torch_npu::NPUTensorImpl::GetStorageInfo(grad)->npu_format_ !=
+            torch_npu::NPUTensorImpl::GetStorageInfo(variable)->npu_format_) {
          grad = grad.npu_format_cast(
-              variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_);
+              torch_npu::NPUTensorImpl::GetStorageInfo(variable)->npu_format_);
        }
        this->copy_grad_to_bucket(grad, bucket_view);
        if (gradient_as_bucket_view_) {
@@ -1074,7 +1074,7 @@ void Reducer::copy_bucket_to_grad(
      // (see torch/csrc/grad/AccumulateGrad.h)
      grad = at::empty_with_format(variable.sizes(),
                                   bucket_view.options(),
-                                   variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_);
+                                   torch_npu::NPUTensorImpl::GetStorageInfo(variable)->npu_format_);
      grad.copy_memory_(bucket_view, true);
    } else {
      grad.copy_memory_(bucket_view, true);
diff --git a/torch_npu/csrc/framework/FormatHelper.cpp b/torch_npu/csrc/framework/FormatHelper.cpp
index 484039ac0d..96abacbddf 100644
--- a/torch_npu/csrc/framework/FormatHelper.cpp
+++ b/torch_npu/csrc/framework/FormatHelper.cpp
@@ -63,7 +63,7 @@ namespace at_npu
 
  bool FormatHelper::IsPadded(const at::Tensor *tensor)
  {
-    auto format = tensor->storage().unsafeGetStorageImpl()->npu_desc_.npu_format_;
+    auto format = torch_npu::NPUTensorImpl::GetStorageInfo(*tensor)->npu_format_;
    return IsPadded(format);
  }
 
@@ -91,7 +91,7 @@ namespace at_npu
 
  char *FormatHelper::GetFormatName(const at::Tensor &tensor)
  {
-    auto format = tensor.storage().get_npu_desc().npu_format_;
+    auto format = torch_npu::NPUTensorImpl::GetStorageInfo(tensor)->npu_format_;
    return GetFormatName(format);
  }
 
@@ -114,7 +114,7 @@ namespace at_npu
 
  aclFormat FormatHelper::GetFormat(const at::Tensor &tensor)
  {
-    return tensor.storage().get_npu_desc().npu_format_;
+    return torch_npu::NPUTensorImpl::GetStorageInfo(tensor)->npu_format_;
  }
 
  bool FormatHelper::IsBaseFormatType(aclFormat format)
@@ -124,11 +124,11 @@ namespace at_npu
 
  bool FormatHelper::IsBaseFormatType(const at::Tensor &tensor)
  {
-    auto format = tensor.storage().get_npu_desc().npu_format_;
+    auto format = torch_npu::NPUTensorImpl::GetStorageInfo(tensor)->npu_format_;
    return IsBaseFormatType(format);
  }
 
-  FormatShape FormatHelper::GetStorageSizes(c10::NPUStorageDesc desc)
+  FormatShape FormatHelper::GetStorageSizes(c10::npu::NPUCachingAllocator::NPUStorageInfo desc)
  {
    auto ori_size = desc.base_sizes_;
    auto format = desc.npu_format_;
diff --git a/torch_npu/csrc/framework/FormatHelper.h b/torch_npu/csrc/framework/FormatHelper.h
index 2070eafe00..3ea056be10 100644
--- a/torch_npu/csrc/framework/FormatHelper.h
+++ b/torch_npu/csrc/framework/FormatHelper.h
@@ -20,6 +20,8 @@
 #include
 
 #include "torch_npu/csrc/framework/utils/NPUDefinition.h"
+#include "torch_npu/csrc/core/tensor_impl.h"
+#include "c10/npu/NPUCachingAllocator.h"
 
 namespace at_npu
 {
@@ -50,7 +52,7 @@ namespace at_npu
    template
    static FormatShape GetStorageSizes(aclFormat format, sizeType ori_size);
    // GetStorageSizes used to calculate the storage sizes of op at npu device at different format.
-    static FormatShape GetStorageSizes(c10::NPUStorageDesc desc);
+    static FormatShape GetStorageSizes(c10::npu::NPUCachingAllocator::NPUStorageInfo desc);
 
  private:
    static bool IsPadded(aclFormat format);
diff --git a/torch_npu/csrc/framework/InferFormat.cpp b/torch_npu/csrc/framework/InferFormat.cpp
index 6ef3c77742..74f3fc6355 100644
--- a/torch_npu/csrc/framework/InferFormat.cpp
+++ b/torch_npu/csrc/framework/InferFormat.cpp
@@ -25,16 +25,16 @@ namespace at_npu
 
  aclFormat InferFormat::GuessFormatWhenContiguous(const at::Tensor &tensor)
  {
-    auto desc = tensor.storage().unsafeGetStorageImpl()->npu_desc_;
+    auto* desc = torch_npu::NPUTensorImpl::GetStorageInfo(tensor);
    // fix: NCDHW -> default format
-    if ((desc.origin_format_ == ACL_FORMAT_NCDHW))
+    if ((desc->origin_format_ == ACL_FORMAT_NCDHW))
    {
-      if ((tensor.sizes().size() != desc.base_sizes_.size()) && (tensor.sizes().size() <= 4))
+      if ((tensor.sizes().size() != desc->base_sizes_.size()) && (tensor.sizes().size() <= 4))
      {
        return ACL_FORMAT_NCHW;
      }
    }
-    return desc.origin_format_;
+    return desc->origin_format_;
  }
 
  // NOTE: this method should cooperate with shape infer.
@@ -111,7 +111,7 @@ namespace at_npu
  FormatShape InferFormat::GuessStorageSizeWhenConvertFormat(const at::Tensor &tensor)
  {
    auto format = FormatHelper::GetFormat(tensor);
-    auto size = tensor.storage().unsafeGetStorageImpl()->npu_desc_.base_sizes_;
+    auto size = torch_npu::NPUTensorImpl::GetStorageInfo(tensor)->base_sizes_;
    // TransData: ND->NZ, ND size < 2, we can expand dimension to 2, the storage have no effect.
    // now, only ND->NZ and NZ->ND will call transdata, so we no need to check other format.
    if ((size.size() < 2) && format == ACL_FORMAT_ND)
diff --git a/torch_npu/csrc/framework/OpCmdHelper.cpp b/torch_npu/csrc/framework/OpCmdHelper.cpp
index 4c926bef26..28e0c9672a 100644
--- a/torch_npu/csrc/framework/OpCmdHelper.cpp
+++ b/torch_npu/csrc/framework/OpCmdHelper.cpp
@@ -33,11 +33,11 @@ namespace at_npu
    at::ScalarType scalarDataType = tensor.scalar_type();
    aclDataType aclDataType =
        CalcuOpUtil::convert_to_acl_data_type(scalarDataType, forceDataType);
-    const auto &npuDesc = tensor.storage().get_npu_desc();
-    auto &storageDims = npuDesc.storage_sizes_;
+    const auto* npuDesc = torch_npu::NPUTensorImpl::GetStorageInfo(tensor);
+    auto &storageDims = npuDesc->storage_sizes_;
    AclTensorDescMaker desc;
-    auto aclDesc = desc.Create(aclDataType, npuDesc)
-                       .SetFormat(npuDesc.npu_format_)
+    auto aclDesc = desc.Create(aclDataType, *npuDesc)
+                       .SetFormat(npuDesc->npu_format_)
                       .SetShape(storageDims)
                       .SetName(descName)
                       .SetConstAttr(cpu_tensor)
@@ -47,7 +47,7 @@ namespace at_npu
    AclTensorBufferMaker buffer(tensor, numel);
    auto aclBuff = buffer.Get();
    int64_t storageDim = storageDims.size();
-    return std::tie(aclDesc, aclBuff, storageDim, npuDesc.npu_format_);
+    return std::tie(aclDesc, aclBuff, storageDim, npuDesc->npu_format_);
  }
 
  std::tuple OpCmdHelper::CovertTensorWithZeroDimToAclInput(
@@ -127,19 +127,19 @@ namespace at_npu
  {
    aclDataType aclDataType = CalcuOpUtil::convert_to_acl_data_type(
        tensorPtr->scalar_type(), forceDataType);
-    const auto &npuDesc = tensorPtr->storage().get_npu_desc();
+    const auto* npuDesc = torch_npu::NPUTensorImpl::GetStorageInfo(*tensorPtr);
    const auto &dims = tensorPtr->sizes();
-    auto &storageDims = npuDesc.storage_sizes_;
+    auto &storageDims = npuDesc->storage_sizes_;
    AclTensorDescMaker desc;
-    auto aclDesc = desc.Create(aclDataType, dims, npuDesc.origin_format_)
-                       .SetFormat(npuDesc.npu_format_)
+    auto aclDesc = desc.Create(aclDataType, dims, npuDesc->origin_format_)
+                       .SetFormat(npuDesc->npu_format_)
                       .SetShape(storageDims)
                       .Get();
    auto numel = at::prod_intlist(storageDims);
    AclTensorBufferMaker aclBuffer(tensorPtr, numel);
    auto aclBuff = aclBuffer.Get();
    int64_t storageDim = storageDims.size();
-    return std::tie(aclDesc, aclBuff, storageDim, npuDesc.npu_format_);
+    return std::tie(aclDesc, aclBuff, storageDim, npuDesc->npu_format_);
  }
 
  std::tuple OpCmdHelper::CovertTransDataTensorToAcl(
diff --git a/torch_npu/csrc/framework/OpParamMaker.h b/torch_npu/csrc/framework/OpParamMaker.h
index 89679fd6d9..7163626aaf 100644
--- a/torch_npu/csrc/framework/OpParamMaker.h
+++ b/torch_npu/csrc/framework/OpParamMaker.h
@@ -22,6 +22,8 @@
 #include "torch_npu/csrc/framework/interface/AclOpCompileInterface.h"
 #include "torch_npu/csrc/framework/NPUDefine.h"
 #include "torch_npu/csrc/framework/interface/Graph.h"
+#include "torch_npu/csrc/core/tensor_impl.h"
+#include "c10/npu/NPUCachingAllocator.h"
 
 namespace at_npu
 {
@@ -66,7 +68,7 @@ namespace at_npu
    AclTensorDescMaker() {}
    ~AclTensorDescMaker() = default;
 
-    AclTensorDescMaker &Create(aclDataType dataType, c10::NPUStorageDesc storageDesc)
+    AclTensorDescMaker& Create(aclDataType dataType, c10::npu::NPUCachingAllocator::NPUStorageInfo storageDesc)
    {
      auto dims = storageDesc.base_sizes_;
      auto format = storageDesc.origin_format_;
diff --git a/torch_npu/csrc/framework/StorageDescHelper.cpp b/torch_npu/csrc/framework/StorageDescHelper.cpp
index c23d1d4415..a43e24d28b 100644
--- a/torch_npu/csrc/framework/StorageDescHelper.cpp
+++ b/torch_npu/csrc/framework/StorageDescHelper.cpp
@@ -16,6 +16,7 @@
 #include "torch_npu/csrc/framework/StorageDescHelper.h"
 #include "torch_npu/csrc/framework/FormatHelper.h"
 #include "torch_npu/csrc/framework/InferFormat.h"
+#include "torch_npu/csrc/core/tensor_impl.h"
 
 namespace at_npu
 {
@@ -24,8 +25,8 @@ namespace at_npu
 
  bool StorageDescHelper::MetaDataAreMatch(const at::Tensor *tensor)
  {
-    auto &desc = tensor->storage().unsafeGetStorageImpl()->npu_desc_;
-    return IsSameSize(desc.base_sizes_, tensor->sizes()) && IsSameSize(desc.base_strides_, tensor->strides());
+    auto* desc = torch_npu::NPUTensorImpl::GetStorageInfo(*tensor);
+    return IsSameSize(desc->base_sizes_, tensor->sizes()) && IsSameSize(desc->base_strides_, tensor->strides());
  }
 
  bool StorageDescHelper::OffsetAreMatch(const at::Tensor *tensor)
@@ -34,7 +35,7 @@ namespace at_npu
  }
 
  // copy related
-  bool StorageDescHelper::IsSameDesc(const c10::NPUStorageDesc &a, const c10::NPUStorageDesc &b)
+  bool StorageDescHelper::IsSameDesc(const c10::npu::NPUCachingAllocator::NPUStorageInfo& a, const c10::npu::NPUCachingAllocator::NPUStorageInfo& b)
  {
    if ((a.origin_format_ != b.origin_format_) || (a.npu_format_ != b.npu_format_))
    {
@@ -48,9 +49,9 @@ namespace at_npu
 
  bool StorageDescHelper::IsSameDesc(const at::Tensor &a, const at::Tensor &b)
  {
-    auto descA = a.storage().unsafeGetStorageImpl()->npu_desc_;
-    auto descB = b.storage().unsafeGetStorageImpl()->npu_desc_;
-    return IsSameDesc(descA, descB);
+    auto* descA = torch_npu::NPUTensorImpl::GetStorageInfo(a);
+    auto* descB = torch_npu::NPUTensorImpl::GetStorageInfo(b);
+    return IsSameDesc(*descA, *descB);
  }
 
  bool StorageDescHelper::IsSameSize(c10::SmallVector a, c10::IntArrayRef b)
@@ -62,7 +63,7 @@ namespace at_npu
    return false;
  }
 
-  void StorageDescHelper::UpdateDesc(c10::NPUStorageDesc &npuDesc, c10::IntArrayRef &new_size)
+  void StorageDescHelper::UpdateDesc(c10::npu::NPUCachingAllocator::NPUStorageInfo& npuDesc, c10::IntArrayRef& new_size)
  {
    npuDesc.base_sizes_ = new_size;
@@ -98,17 +99,28 @@ namespace at_npu
 
  void StorageDescHelper::SetDesc(at::Tensor &dst)
  {
-    dst.storage().unsafeGetStorageImpl()->npu_desc_ = SetDesc();
+    if (dst.storage().unsafeGetStorageImpl()->data() == nullptr) {
+      return;
+    }
+    *torch_npu::NPUTensorImpl::GetStorageInfo(dst) = SetDesc();
  }
 
-  void StorageDescHelper::SetDesc(at::Tensor &dst, c10::IntArrayRef size, c10::IntArrayRef strides)
-  {
-    dst.storage().unsafeGetStorageImpl()->npu_desc_ = SetDesc(size, strides);
+  void StorageDescHelper::SetDesc(at::Tensor& dst, c10::IntArrayRef size, c10::IntArrayRef strides)
+  {
+    if (dst.storage().unsafeGetStorageImpl()->data() == nullptr) {
+      return;
+    }
+
+    *torch_npu::NPUTensorImpl::GetStorageInfo(dst) = SetDesc(size, strides);
  }
 
  void StorageDescHelper::SetDesc(at::Tensor &dst, c10::IntArrayRef size, c10::IntArrayRef strides, aclFormat format)
  {
-    dst.storage().unsafeGetStorageImpl()->npu_desc_ = SetDesc(size, strides, format);
+    if (dst.storage().unsafeGetStorageImpl()->data() == nullptr) {
+      return;
+    }
+
+    *torch_npu::NPUTensorImpl::GetStorageInfo(dst) = SetDesc(size, strides, format);
  }
 
  void StorageDescHelper::CopyDesc(at::Tensor &dst, const at::Tensor &src)
@@ -118,36 +130,42 @@ namespace at_npu
 
  void StorageDescHelper::CopyDesc(at::Tensor &dst, const c10::Storage &src)
  {
-    CopyDesc(dst, src.unsafeGetStorageImpl()->npu_desc_);
+    CopyDesc(dst, *torch_npu::NPUTensorImpl::GetStorageInfo(*src.unsafeGetStorageImpl()));
  }
 
-  void StorageDescHelper::CopyDesc(const at::Tensor &dst, const c10::NPUStorageDesc &src_desc)
-  {
-    auto &dstDesc = dst.storage().unsafeGetStorageImpl()->npu_desc_;
-    dstDesc = src_desc;
+  void StorageDescHelper::CopyDesc(const at::Tensor& dst, const c10::npu::NPUCachingAllocator::NPUStorageInfo& src_desc) {
+    if (dst.storage().unsafeGetStorageImpl()->data() == nullptr) {
+      return;
+    }
+
+    auto* dstDesc = torch_npu::NPUTensorImpl::GetStorageInfo(dst);
+    *dstDesc = src_desc;
  }
 
-  void StorageDescHelper::ReflushDescBySelf(const at::Tensor &src)
-  {
-    auto &desc = src.storage().unsafeGetStorageImpl()->npu_desc_;
-    desc.base_sizes_ = src.sizes();
-    desc.storage_sizes_ = src.sizes();
-    desc.base_strides_ = src.strides();
+  void StorageDescHelper::ReflushDescBySelf(const at::Tensor& src)
+  {
+    if (src.storage().unsafeGetStorageImpl()->data() == nullptr) {
+      return;
+    }
+    auto* desc = torch_npu::NPUTensorImpl::GetStorageInfo(src);
+    desc->base_sizes_ = src.sizes();
+    desc->storage_sizes_ = src.sizes();
+    desc->base_strides_ = src.strides();
  }
 
-  c10::NPUStorageDesc StorageDescHelper::SetDesc()
+  c10::npu::NPUCachingAllocator::NPUStorageInfo StorageDescHelper::SetDesc()
  {
    return SetDesc({0}, {});
  }
 
-  c10::NPUStorageDesc StorageDescHelper::SetDesc(c10::IntArrayRef size, c10::IntArrayRef strides)
-  {
+  c10::npu::NPUCachingAllocator::NPUStorageInfo StorageDescHelper::SetDesc(c10::IntArrayRef size, c10::IntArrayRef strides)
+  {
    return SetDesc(size, strides, InferFormat::GuessBaseFormat(size));
  }
 
-  c10::NPUStorageDesc StorageDescHelper::SetDesc(c10::IntArrayRef size, c10::IntArrayRef strides, aclFormat format)
+  c10::npu::NPUCachingAllocator::NPUStorageInfo StorageDescHelper::SetDesc(c10::IntArrayRef size, c10::IntArrayRef strides, aclFormat format)
  {
-    struct c10::NPUStorageDesc npu_desc;
+    struct c10::npu::NPUCachingAllocator::NPUStorageInfo npu_desc;
    npu_desc.base_sizes_ = size;
    npu_desc.base_strides_ = strides;
    // guess ori format and npu format unit by size and dst format
@@ -162,7 +180,7 @@ namespace at_npu
    return npu_desc;
  }
 
-  int64_t StorageDescHelper::GetMemorySize(const c10::NPUStorageDesc &desc)
+  int64_t StorageDescHelper::GetMemorySize(const c10::npu::NPUCachingAllocator::NPUStorageInfo& desc)
  {
    auto physical_size = FormatHelper::GetStorageSizes(desc);
    return at::prod_intlist(physical_size);
@@ -170,8 +188,8 @@ namespace at_npu
 
  int64_t StorageDescHelper::GetMemorySize(const at::Tensor &dst)
  {
-    auto desc = dst.storage().unsafeGetStorageImpl()->npu_desc_;
-    return GetMemorySize(desc);
+    auto* desc = torch_npu::NPUTensorImpl::GetStorageInfo(dst);
+    return GetMemorySize(*desc);
  }
 
  int64_t StorageDescHelper::GetMemorySize(c10::IntArrayRef size, aclFormat format)
diff --git a/torch_npu/csrc/framework/StorageDescHelper.h b/torch_npu/csrc/framework/StorageDescHelper.h
index 02e416acc2..1765f36143 100644
--- a/torch_npu/csrc/framework/StorageDescHelper.h
+++ b/torch_npu/csrc/framework/StorageDescHelper.h
@@ -19,7 +19,7 @@
 #include
 
 #include "torch_npu/csrc/framework/utils/NPUDefinition.h"
-
+#include "torch_npu/csrc/core/tensor_impl.h"
 namespace at_npu
 {
  namespace native
@@ -35,7 +35,7 @@ namespace at_npu
    static bool OffsetAreMatch(const at::Tensor *tensor);
 
    // helper function of transdata op.
-    static bool IsSameDesc(const c10::NPUStorageDesc &a, const c10::NPUStorageDesc &b);
+    static bool IsSameDesc(const c10::npu::NPUCachingAllocator::NPUStorageInfo& a, const c10::npu::NPUCachingAllocator::NPUStorageInfo& b);
    static bool IsSameDesc(const at::Tensor &a, const at::Tensor &b);
 
    // calculate storage size need by npu memory
@@ -52,9 +52,9 @@ namespace at_npu
    static void CopyDesc(at::Tensor &dst, const at::Tensor &src);
    static void CopyDesc(at::Tensor &dst, const c10::Storage &src);
-    static void CopyDesc(const at::Tensor &dst, const c10::NPUStorageDesc &src_desc);
+    static void CopyDesc(const at::Tensor& dst, const c10::npu::NPUCachingAllocator::NPUStorageInfo& src_desc);
 
-    static void UpdateDesc(c10::NPUStorageDesc &npuDesc, c10::IntArrayRef &new_size);
+    static void UpdateDesc(c10::npu::NPUCachingAllocator::NPUStorageInfo& npuDesc, c10::IntArrayRef& new_size);
 
    static FormatShape ComputeStrideFromShape(const FormatShape &shape);
 
@@ -64,11 +64,11 @@ namespace at_npu
  private:
    // Get Part
    static bool IsSameSize(c10::SmallVector a, c10::IntArrayRef b);
-    static int64_t GetMemorySize(const c10::NPUStorageDesc &dst);
+    static int64_t GetMemorySize(const c10::npu::NPUCachingAllocator::NPUStorageInfo& dst);
    // Set Part
-    static c10::NPUStorageDesc SetDesc();
-    static c10::NPUStorageDesc SetDesc(c10::IntArrayRef size, c10::IntArrayRef strides);
-    static c10::NPUStorageDesc SetDesc(c10::IntArrayRef size, c10::IntArrayRef strides, aclFormat format);
+    static c10::npu::NPUCachingAllocator::NPUStorageInfo SetDesc();
+    static c10::npu::NPUCachingAllocator::NPUStorageInfo SetDesc(c10::IntArrayRef size, c10::IntArrayRef strides);
+    static c10::npu::NPUCachingAllocator::NPUStorageInfo SetDesc(c10::IntArrayRef size, c10::IntArrayRef strides, aclFormat format);
  };
 
 } // namespace native
diff --git a/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp b/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp
index 0854e27c09..cfee4e6557 100644
--- a/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp
+++ b/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp
@@ -46,7 +46,7 @@ namespace at_npu
    }
    if (at::prod_intlist(tensor.sizes()) <
-        at::prod_intlist(tensor.storage().get_npu_desc().base_sizes_))
+        at::prod_intlist(torch_npu::NPUTensorImpl::GetStorageInfo(tensor)->base_sizes_))
    {
      return {"slice", "select", "indexing"};
    }
@@ -61,7 +61,7 @@ namespace at_npu
    // 2. full memory copy: size match between src and self
    if (StorageDescHelper::OffsetAreMatch(&self) && self.is_contiguous() &&
        src.sizes().equals(self.sizes()) &&
-        self.sizes().equals(self.storage().get_npu_desc().base_sizes_))
+        self.sizes().equals(torch_npu::NPUTensorImpl::GetStorageInfo(self)->base_sizes_))
    {
      return true;
    }
@@ -112,7 +112,7 @@ namespace at_npu
    auto self = at::empty_with_format(
        src.sizes(),
        src.options(),
-        src.storage().get_npu_desc().npu_format_);
+        torch_npu::NPUTensorImpl::GetStorageInfo(src)->npu_format_);
    if (ContiguousOptimizeWithAnyFormat(self, src, optimizations))
    {
      return self;
diff --git a/torch_npu/csrc/framework/contiguous/ReshapeOpt.cpp b/torch_npu/csrc/framework/contiguous/ReshapeOpt.cpp
index db562ee712..3607128210 100644
--- a/torch_npu/csrc/framework/contiguous/ReshapeOpt.cpp
+++ b/torch_npu/csrc/framework/contiguous/ReshapeOpt.cpp
@@ -22,7 +22,7 @@ namespace at_npu
 
  bool can_use_memecpy_for_NZ_format(const at::Tensor &tensor)
  {
-    auto base_size = tensor.storage().get_npu_desc().base_sizes_;
+    auto base_size = torch_npu::NPUTensorImpl::GetStorageInfo(tensor)->base_sizes_;
    // Make sure that sizes of last 2 dims don't change
    if (tensor.size(-1) != base_size[base_size.size() - 1] ||
        tensor.size(-2) != base_size[base_size.size() - 2])
@@ -39,8 +39,8 @@ namespace at_npu
    {
      return false;
    }
-    auto srcNpuDesc = src.storage().get_npu_desc();
-    switch (srcNpuDesc.npu_format_)
+    auto* srcNpuDesc = torch_npu::NPUTensorImpl::GetStorageInfo(src);
+    switch(srcNpuDesc->npu_format_)
    {
    case ACL_FORMAT_FRACTAL_NZ:
      return can_use_memecpy_for_NZ_format(src);
@@ -48,12 +48,12 @@ namespace at_npu
    default:
      // For other format, make sure that copy the whole memory.
      // Moreover, storage size expanding caused by padding could be avoided
-      if (!(srcNpuDesc.base_sizes_ == array_to_small_vector(src.sizes())))
+      if (!(srcNpuDesc->base_sizes_ == array_to_small_vector(src.sizes())))
      {
        return false;
      }
      // Make sure no padding happens
-      if (src.numel() != at::prod_intlist(srcNpuDesc.storage_sizes_))
+      if (src.numel() != at::prod_intlist(srcNpuDesc->storage_sizes_))
      {
        return false;
      }
diff --git a/torch_npu/csrc/framework/contiguous/combined_opt.cpp b/torch_npu/csrc/framework/contiguous/combined_opt.cpp
index cf270817f8..cc9ae17e2f 100644
--- a/torch_npu/csrc/framework/contiguous/combined_opt.cpp
+++ b/torch_npu/csrc/framework/contiguous/combined_opt.cpp
@@ -18,6 +18,7 @@
 #include
 #include "torch_npu/csrc/framework/contiguous/ContiguousOpt.h"
 #include "torch_npu/csrc/framework/utils/KernelNpuOutputSize.h"
+#include "torch_npu/csrc/core/tensor_impl.h"
 
 namespace at_npu
 {
@@ -50,10 +51,10 @@ namespace at_npu
    {
      RECORD_FUNCTION("npuCombined", std::vector({src}));
      // Record src infos for recovering after trans-contiguous
-      const auto &src_npu_desc = src.storage().get_npu_desc();
+      const auto* src_npu_desc = torch_npu::NPUTensorImpl::GetStorageInfo(src);
 
      // Construct base tensor(contiguous)
-      at::Tensor base_tensor = at::empty(src_npu_desc.base_sizes_, src.options());
+      at::Tensor base_tensor = at::empty(src_npu_desc->base_sizes_, src.options());
      base_tensor.set_(src.storage());
 
      // Reconstruct combined discontiguous tensor ==trans==> contiguous tensor
@@ -61,7 +62,7 @@
          combined_to_contiguous(base_tensor, self, viewInfos, viewOffsets);
      // Recover modified tensor infos of src after trans-contiguous
-      StorageDescHelper::CopyDesc(base_tensor, src_npu_desc);
+      StorageDescHelper::CopyDesc(base_tensor, *src_npu_desc);
      return contiguousOrNot;
    }
    return false;
@@ -88,10 +89,10 @@ namespace at_npu
    {
      return false;
    }
-    auto npu_desc = tensor.storage().get_npu_desc();
+    auto* npu_desc = torch_npu::NPUTensorImpl::GetStorageInfo(tensor);
 
-    if ((at::prod_intlist(tensor.sizes()) != at::prod_intlist(npu_desc.base_sizes_)) ||
-        (tensor.storage_offset() != npu_desc.base_offset_))
+    if ((at::prod_intlist(tensor.sizes()) != at::prod_intlist(npu_desc->base_sizes_)) ||
+        (tensor.storage_offset() != npu_desc->base_offset_))
    {
      return false;
    }
@@ -153,7 +154,7 @@ namespace at_npu
      return false;
    }
    // Avoid combined-cases such as squeeze+indexing at the first axis.
-    if (tensor.strides()[0] != tensor.storage().get_npu_desc().base_strides_[0])
+    if(tensor.strides()[0] != torch_npu::NPUTensorImpl::GetStorageInfo(tensor)->base_strides_[0])
    {
      return false;
    }
@@ -166,7 +167,7 @@ namespace at_npu
    {
      // tensors with reduced numel will be taken into consideration.
      if (at::prod_intlist(tensor.sizes()) <
-          at::prod_intlist(tensor.storage().get_npu_desc().base_sizes_))
+          at::prod_intlist(torch_npu::NPUTensorImpl::GetStorageInfo(tensor)->base_sizes_))
      {
        for (auto i = 0; i < tensor.sizes().size() - 2; i++)
        {
@@ -192,8 +193,8 @@ namespace at_npu
      FormatShape &infer_stride,
      int64_t &infer_offset)
  {
-    auto base_sizes = src.storage().get_npu_desc().base_sizes_;
-    auto base_strides = src.storage().get_npu_desc().base_strides_;
+    auto base_sizes = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_sizes_;
+    auto base_strides = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_strides_;
    auto view_sizes = array_to_small_vector(src.sizes());
    auto view_strides = array_to_small_vector(src.strides());
 
@@ -343,7 +344,6 @@ namespace at_npu
    {
      return false;
    }
-    auto tensor_desc = tensor.storage().get_npu_desc();
    c10::SmallVector view_info_part;
    view_info_part.emplace_back(array_to_small_vector(tensor.sizes()));
    view_info_part.emplace_back(array_to_small_vector(tensor.strides()));
@@ -366,8 +366,8 @@ namespace at_npu
      return false;
    }
 
-    auto combined_base_sizes = src.storage().get_npu_desc().base_sizes_;
-    auto combined_base_strides = src.storage().get_npu_desc().base_strides_;
+    auto combined_base_sizes = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_sizes_;
+    auto combined_base_strides = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_strides_;
 
    // Key infos that should be inferred.
    FormatShape infer_size;
@@ -501,7 +501,7 @@ namespace at_npu
    auto contiguous_src = at::empty_with_format(
        src.sizes(),
        src.options(),
-        src.storage().get_npu_desc().npu_format_);
+        torch_npu::NPUTensorImpl::GetStorageInfo(src)->npu_format_);
    return (
        copy_optimize_contiguous_by_given_cases(
            src, contiguous_src, optimizations_first) &&
diff --git a/torch_npu/csrc/framework/contiguous/indexing_opt.cpp b/torch_npu/csrc/framework/contiguous/indexing_opt.cpp
index ff4dc0683a..ab7d0e3fad 100644
--- a/torch_npu/csrc/framework/contiguous/indexing_opt.cpp
+++ b/torch_npu/csrc/framework/contiguous/indexing_opt.cpp
@@ -15,6 +15,7 @@
 
 #include "torch_npu/csrc/framework/contiguous/ContiguousOpt.h"
 #include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+#include "torch_npu/csrc/core/tensor_impl.h"
 
 namespace at_npu
 {
@@ -50,20 +51,20 @@ namespace at_npu
    {
      return false;
    }
-    auto src_desc = src.storage().get_npu_desc();
-    if (src.numel() >= at::prod_intlist(src_desc.base_sizes_))
+    auto* src_desc = torch_npu::NPUTensorImpl::GetStorageInfo(src);
+    if (src.numel() >= at::prod_intlist(src_desc->base_sizes_))
    {
      return false;
    }
 
-    if (src.dim() != src_desc.base_sizes_.size() ||
-        src.strides().size() != src_desc.base_strides_.size())
+    if (src.dim() != src_desc->base_sizes_.size() ||
+        src.strides().size() != src_desc->base_strides_.size())
    {
      return false;
    }
 
-    auto base_size = src.storage().get_npu_desc().base_sizes_;
-    auto base_stride = src.storage().get_npu_desc().base_strides_;
+    auto base_size = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_sizes_;
+    auto base_stride = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_strides_;
 
    // Indexing info extraction part
    // Get step info(for indexing step at index axis should > 1)
@@ -140,7 +141,7 @@ namespace at_npu
      c10::SmallVector &step)
  {
-    auto base_size = src.storage().get_npu_desc().base_sizes_;
+    auto base_size = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_sizes_;
    // recover contiguous base tensor
    at::Tensor temp_src = at::empty(base_size, src.options());
diff --git a/torch_npu/csrc/framework/contiguous/permute_opt.cpp b/torch_npu/csrc/framework/contiguous/permute_opt.cpp
index 238017c21d..0506c82b46 100644
--- a/torch_npu/csrc/framework/contiguous/permute_opt.cpp
+++ b/torch_npu/csrc/framework/contiguous/permute_opt.cpp
@@ -16,6 +16,7 @@
 #include "torch_npu/csrc/framework/contiguous/ContiguousOpt.h"
 #include "torch_npu/csrc/framework/utils/KernelNpuOutputSize.h"
 #include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+#include "torch_npu/csrc/core/tensor_impl.h"
 
 namespace at_npu
 {
@@ -37,13 +38,12 @@ namespace at_npu
      // create contiguous tensor for npu transpose
      at::Tensor temp_src = at::empty(sizes, src.options());
      temp_src.set_(src.storage(), temp_src.storage_offset(), temp_src.sizes(), temp_src.strides());
-      auto npu_desc = temp_src.storage().unsafeGetStorageImpl()->npu_desc_;
-      temp_src.storage().unsafeGetStorageImpl()->npu_desc_.base_sizes_ = temp_src.sizes();
-      temp_src.storage().unsafeGetStorageImpl()->npu_desc_.base_strides_ = temp_src.strides();
-      temp_src.storage().unsafeGetStorageImpl()->npu_desc_.storage_sizes_ = temp_src.sizes();
+
+      auto npu_desc = *torch_npu::NPUTensorImpl::GetStorageInfo(temp_src);
+      StorageDescHelper::ReflushDescBySelf(temp_src);
 
      NPUNativeFunctions::npu_transpose_out(temp_src, perm, self);
-      temp_src.storage().unsafeGetStorageImpl()->npu_desc_ = npu_desc;
+      *torch_npu::NPUTensorImpl::GetStorageInfo(temp_src) = npu_desc;
      return true;
    }
    return false;
@@ -67,8 +67,8 @@ namespace at_npu
      return false;
    }
-    auto base_sizes = src.storage().get_npu_desc().base_sizes_;
-    auto base_strides = src.storage().get_npu_desc().base_strides_;
+    auto base_sizes = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_sizes_;
+    auto base_strides = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_strides_;
    auto view_sizes = array_to_small_vector(src.sizes());
    auto view_strides = array_to_small_vector(src.strides());
    c10::SmallVector indexes;
diff --git a/torch_npu/csrc/framework/contiguous/reshapeV2_opt.cpp b/torch_npu/csrc/framework/contiguous/reshapeV2_opt.cpp
index 8f2e9cd162..27c3c88b87 100644
--- a/torch_npu/csrc/framework/contiguous/reshapeV2_opt.cpp
+++ b/torch_npu/csrc/framework/contiguous/reshapeV2_opt.cpp
@@ -36,7 +36,7 @@ namespace at_npu
      copy_d2d_by_memcpy(
          self,
          src,
-          at::prod_intlist(self.storage().get_npu_desc().storage_sizes_));
+          at::prod_intlist(torch_npu::NPUTensorImpl::GetStorageInfo(self)->storage_sizes_));
      return true;
    }
    return false;
@@ -100,13 +100,13 @@ namespace at_npu
 
  bool can_use_memory_repoint(const at::Tensor &tensor)
  {
-    auto tensorNpuDesc = tensor.storage().get_npu_desc();
+    auto* tensorNpuDesc = torch_npu::NPUTensorImpl::GetStorageInfo(tensor);
    if (FormatHelper::IsBaseFormatType(tensor))
    {
      return true;
    }
 
-    if (tensorNpuDesc.npu_format_ == ACL_FORMAT_FRACTAL_NZ)
+    if (tensorNpuDesc->npu_format_ == ACL_FORMAT_FRACTAL_NZ)
    {
      // No padding
      if ((tensor.size(-1) % 16 == 0) && (tensor.size(-2) % 16 == 0))
diff --git a/torch_npu/csrc/framework/contiguous/reshape_opt.cpp b/torch_npu/csrc/framework/contiguous/reshape_opt.cpp
index 73bc89d1f9..789254d946 100644
--- a/torch_npu/csrc/framework/contiguous/reshape_opt.cpp
+++ b/torch_npu/csrc/framework/contiguous/reshape_opt.cpp
@@ -28,7 +28,7 @@ namespace at_npu
    if (check_reshape_match(src, self))
    {
      RECORD_FUNCTION("View_d2dCopyAsync", std::vector({src}));
-      copy_d2d_by_memcpy(self, src, at::prod_intlist(self.storage().get_npu_desc().storage_sizes_));
+      copy_d2d_by_memcpy(self, src, at::prod_intlist(torch_npu::NPUTensorImpl::GetStorageInfo(self)->storage_sizes_));
      return true;
    }
    return false;
diff --git a/torch_npu/csrc/framework/contiguous/select_opt.cpp b/torch_npu/csrc/framework/contiguous/select_opt.cpp
index b662ba85cc..9767a91ea3 100644
--- a/torch_npu/csrc/framework/contiguous/select_opt.cpp
+++ b/torch_npu/csrc/framework/contiguous/select_opt.cpp
@@ -15,6 +15,7 @@
 
 #include "torch_npu/csrc/framework/contiguous/ContiguousOpt.h"
 #include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+#include "torch_npu/csrc/core/tensor_impl.h"
 
 namespace at_npu
 {
@@ -59,8 +60,8 @@ namespace at_npu
      return false;
    }
    // base info and src info
-    auto base_size = src.storage().get_npu_desc().base_sizes_;
-    auto base_stride = src.storage().get_npu_desc().base_strides_;
+    auto base_size = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_sizes_;
+    auto base_stride = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_strides_;
    auto select_size = src.sizes();
    auto select_stride = src.strides();
 
@@ -145,7 +146,7 @@ namespace at_npu
      c10::SmallVector &start,
      c10::SmallVector &length)
  {
-    auto base_size = src.storage().get_npu_desc().base_sizes_;
+    auto base_size = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_sizes_;
    // Recover base tensor(necessary) a = b.select(1, 1)
    at::Tensor temp_src = at::empty(base_size, src.options());
diff --git a/torch_npu/csrc/framework/contiguous/slice_opt.cpp b/torch_npu/csrc/framework/contiguous/slice_opt.cpp
index 0eeea31d2c..876dd0890f 100644
--- a/torch_npu/csrc/framework/contiguous/slice_opt.cpp
+++ b/torch_npu/csrc/framework/contiguous/slice_opt.cpp
@@ -16,7 +16,7 @@
 #include "torch_npu/csrc/framework/contiguous/ContiguousOpt.h"
 #include "torch_npu/csrc/framework/utils/KernelNpuOutputSize.h"
 #include "torch_npu/csrc/aten/NPUNativeFunctions.h"
-
+#include "torch_npu/csrc/core/tensor_impl.h"
 namespace at_npu
 {
  namespace native
@@ -61,8 +61,8 @@ namespace at_npu
      return false;
    }
 
-    auto base_sizes = src.storage().get_npu_desc().base_sizes_;
-    auto base_strides = src.storage().get_npu_desc().base_strides_;
+    auto base_sizes = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_sizes_;
+    auto base_strides = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_strides_;
    auto view_sizes = array_to_small_vector(src.sizes());
    auto view_strides = array_to_small_vector(src.strides());
 
@@ -144,7 +144,7 @@ namespace at_npu
      const c10::SmallVector &size)
  {
    // create contiguous tensor for npu slice
-    auto temp_tensor_size = src.storage().unsafeGetStorageImpl()->npu_desc_.base_sizes_;
+    auto temp_tensor_size = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_sizes_;
    at::Tensor temp_src = at::empty(temp_tensor_size, src.options());
    temp_src.set_(src.storage(), temp_src.storage_offset(), temp_src.sizes(), temp_src.strides());
diff --git a/torch_npu/csrc/framework/contiguous/unfold_opt.cpp b/torch_npu/csrc/framework/contiguous/unfold_opt.cpp
index e0909846b7..02c43112f4 100644
--- a/torch_npu/csrc/framework/contiguous/unfold_opt.cpp
+++ b/torch_npu/csrc/framework/contiguous/unfold_opt.cpp
@@ -15,6 +15,7 @@
 
 #include "torch_npu/csrc/framework/contiguous/ContiguousOpt.h"
 #include "torch_npu/csrc/framework/utils/KernelNpuOutputSize.h"
+#include "torch_npu/csrc/core/tensor_impl.h"
 
 namespace at_npu
 {
@@ -60,8 +61,8 @@ namespace at_npu
      return false;
    }
 
-    auto base_sizes = src.storage().get_npu_desc().base_sizes_;
-    auto base_strides = src.storage().get_npu_desc().base_strides_;
+    auto base_sizes = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_sizes_;
+    auto base_strides = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_strides_;
    auto view_sizes = array_to_small_vector(src.sizes());
    auto view_strides = array_to_small_vector(src.strides());
 
@@ -126,7 +127,7 @@ namespace at_npu
      int64_t &fold_step)
  {
-    auto base_sizes = src.storage().get_npu_desc().base_sizes_;
+    auto base_sizes = torch_npu::NPUTensorImpl::GetStorageInfo(src)->base_sizes_;
    TORCH_CHECK(fold_size != 0, "size should not be 0");
    int64_t split_nums = base_sizes[fold_dimension] / fold_size;
-- 
Gitee
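The common thread of PATCH 1/6 above: every direct read of storage().unsafeGetStorageImpl()->npu_desc_ (or storage().get_npu_desc()) becomes a call to the static accessor added in torch_npu/csrc/core/tensor_impl.cpp, which resolves the descriptor from the caching allocator by the storage's data pointer. A minimal sketch of the resulting usage, assuming only the declarations shown in the diffs above (query_npu_format is an illustrative name, not part of the patch):

#include <ATen/ATen.h>
#include "third_party/acl/inc/acl/acl_base.h"   // aclFormat
#include "torch_npu/csrc/core/tensor_impl.h"    // NPUTensorImpl::GetStorageInfo

// Old: auto fmt = tensor.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_;
// New: look the descriptor up via the allocator-backed accessor.
aclFormat query_npu_format(const at::Tensor& tensor) {
  auto* info = torch_npu::NPUTensorImpl::GetStorageInfo(tensor);
  return info->npu_format_;
}

Because the accessor is keyed on the storage's data pointer rather than a member of the storage impl, the descriptor stays valid across views that share the same storage, which is why the contiguous-optimization passes above can read it through any aliasing tensor.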
From 0c256cdb50dff58177eaac21866dd4da8d11d45e Mon Sep 17 00:00:00 2001
From: "zhousinan@huawei.com"
Date: Tue, 25 Jan 2022 17:04:26 +0800
Subject: [PATCH 2/6] add torch_npu/csrc/framework/utils/CalcuOpUtil.cpp

---
 .../csrc/framework/utils/CalcuOpUtil.cpp      | 34 +++++-----
 torch_npu/csrc/framework/utils/NpuUtils.cpp   | 68 ++++++++-----------
 .../csrc/framework/utils/OpPreparation.cpp    |  9 +--
 3 files changed, 52 insertions(+), 59 deletions(-)

diff --git a/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp b/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp
index a847435adc..7a65518a77 100644
--- a/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp
+++ b/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp
@@ -27,6 +27,7 @@
 #include "torch_npu/csrc/framework/utils/NpuFuzzyBlacklist.h"
 #include "torch_npu/csrc/framework/interface/EnvVariables.h"
 #include "third_party/acl/inc/acl/acl_base.h"
+#include "torch_npu/csrc/core/tensor_impl.h"
 
 namespace at_npu
 {
@@ -191,8 +192,7 @@ namespace at_npu
  {
    if (NpuUtils::check_match(&tensor) || NpuUtils::check_5d_5d_match(tensor))
    {
-      auto tensor_desc = tensor.storage().unsafeGetStorageImpl()->npu_desc_;
-      return tensor_desc.npu_format_;
+      return torch_npu::NPUTensorImpl::GetStorageInfo(tensor)->npu_format_;
    }
    else
    {
@@ -260,7 +260,7 @@ namespace at_npu
      return false;
    }
    int64_t numel = 1;
-    auto storageSize = tensor.storage().get_npu_desc().storage_sizes_;
+    auto storageSize = torch_npu::NPUTensorImpl::GetStorageInfo(tensor)->storage_sizes_;
 
    for (int i = 0; i < storageSize.size(); i++)
    {
@@ -270,12 +270,12 @@ namespace at_npu
    int64_t dim1 = tensor.dim() - 1;
    int64_t dim2 = tensor.dim() - 2;
-    auto tensor_desc = tensor.storage().get_npu_desc();
+    auto* tensor_desc = torch_npu::NPUTensorImpl::GetStorageInfo(tensor);
    if (tensor.stride(dim2) == 1 && tensor.stride(dim1) == tensor.size(dim2) &&
-        tensor.size(dim1) == tensor_desc.base_sizes_[dim2] &&
-        tensor.size(dim2) == tensor_desc.base_sizes_[dim1] &&
-        tensor.numel() == numel &&
-        tensor_desc.base_sizes_.size() == tensor.dim())
+        tensor.size(dim1) == tensor_desc->base_sizes_[dim2] &&
+        tensor.size(dim2) == tensor_desc->base_sizes_[dim1] &&
+        tensor.numel() == numel &&
+        tensor_desc->base_sizes_.size() == tensor.dim())
    {
      return true;
    }
@@ -416,8 +416,8 @@ namespace at_npu
      {
        at::Tensor *aclInput = &input[i].tensor;
        c10::SmallVector dims;
-        dims = aclInput->storage().get_npu_desc().base_sizes_;
-        auto storageDims = aclInput->storage().get_npu_desc().storage_sizes_;
+        dims = torch_npu::NPUTensorImpl::GetStorageInfo(*aclInput)->base_sizes_;
+        auto storageDims = torch_npu::NPUTensorImpl::GetStorageInfo(*aclInput)->storage_sizes_;
        int64_t numel = 1;
        for (int j = 0; j < storageDims.size(); j++)
        {
@@ -428,9 +428,9 @@ namespace at_npu
            aclDataType,
            dims.size(),
            dims.data(),
-            aclInput->storage().get_npu_desc().origin_format_);
+            torch_npu::NPUTensorImpl::GetStorageInfo(*aclInput)->origin_format_);
        aclSetTensorFormat(
-            acl_tensor_desc, aclInput->storage().get_npu_desc().npu_format_);
+            acl_tensor_desc, torch_npu::NPUTensorImpl::GetStorageInfo(*aclInput)->npu_format_);
        aclSetTensorShape(
            acl_tensor_desc, storageDims.size(), storageDims.data());
        if (input[i].tensorDescName != "")
@@ -441,7 +441,7 @@ namespace at_npu
        aclDataInputBuffArr[i] = aclCreateDataBuffer(
            (void *)(aclInput->data_ptr()), aclInput->itemsize() * numel);
        inputDimsArr[i] = storageDims.size();
-        inputFormatsArr[i] = aclInput->storage().get_npu_desc().npu_format_;
+        inputFormatsArr[i] = torch_npu::NPUTensorImpl::GetStorageInfo(*aclInput)->npu_format_;
      }
      else if (
          input[i].tensorDescType ==
@@ -486,7 +486,7 @@ namespace at_npu
            aclOutput->scalar_type(), output[i].realDataType);
        auto dims = aclOutput->sizes();
-        auto storageDims = aclOutput->storage().get_npu_desc().storage_sizes_;
+        auto storageDims = torch_npu::NPUTensorImpl::GetStorageInfo(*aclOutput)->storage_sizes_;
        int64_t numel = 1;
        for (int j = 0; j < storageDims.size(); j++)
        {
@@ -497,16 +497,16 @@ namespace at_npu
            aclDataType,
            dims.size(),
            dims.data(),
-            aclOutput->storage().get_npu_desc().origin_format_);
+            torch_npu::NPUTensorImpl::GetStorageInfo(*aclOutput)->origin_format_);
        aclSetTensorFormat(
-            acl_tensor_desc, aclOutput->storage().get_npu_desc().npu_format_);
+            acl_tensor_desc, torch_npu::NPUTensorImpl::GetStorageInfo(*aclOutput)->npu_format_);
        aclSetTensorShape(
            acl_tensor_desc, storageDims.size(), storageDims.data());
        aclTensorOutputDescArr[i] = acl_tensor_desc;
        aclDataOutputBuffArr[i] = aclCreateDataBuffer(
            (void *)aclOutput->data_ptr(), aclOutput->itemsize() * numel);
        outputDimsArr[i] = storageDims.size();
aclOutput->storage().get_npu_desc().npu_format_; + outputFormatsArr[i] = torch_npu::NPUTensorImpl::GetStorageInfo(*aclOutput)->npu_format_; } params.input_num = inputNum; diff --git a/torch_npu/csrc/framework/utils/NpuUtils.cpp b/torch_npu/csrc/framework/utils/NpuUtils.cpp index a0efe852f7..df7e363511 100644 --- a/torch_npu/csrc/framework/utils/NpuUtils.cpp +++ b/torch_npu/csrc/framework/utils/NpuUtils.cpp @@ -87,15 +87,14 @@ namespace at_npu // (2) 4d format situation, only uncontiguous in Channel size // (3) size and start point must be 16*, make sure the memory be contiguous // std::cout<<"step in check5d5d Match."<npu_format_ != ACL_FORMAT_NC1HWC0) { return false; } @@ -129,9 +128,9 @@ namespace at_npu int64_t contiguous_len = 16; int64_t c0_len = 16; - for (int i = 2; i < npuDesc.base_sizes_.size(); i++) + for (int i = 2; i < npuDesc->base_sizes_.size(); i++) { - contiguous_len *= npuDesc.base_sizes_[i]; + contiguous_len *= npuDesc->base_sizes_[i]; } bool is_offset_match = (tensor.storage_offset() % contiguous_len == 0); bool is_length_match = (tensor.size(1) % c0_len == 0); @@ -151,12 +150,12 @@ namespace at_npu int64_t dim = 1; // 2. recovery the src tensor desc - const c10::NPUStorageDesc src_npuDesc = src.storage().get_npu_desc(); - src.set_(src.storage(), 0, src_npuDesc.base_sizes_, src_npuDesc.base_strides_); - at::Tensor src_tmp = src.reshape({src.size(0), src.size(1) / 16, src.size(2), src.size(3) * 16}); - src_tmp.storage().unsafeGetStorageImpl()->npu_desc_.base_sizes_ = src_tmp.sizes(); - src_tmp.storage().unsafeGetStorageImpl()->npu_desc_.base_strides_ = src_tmp.strides(); - src_tmp.storage().unsafeGetStorageImpl()->npu_desc_.storage_sizes_ = src_tmp.sizes(); + const auto* src_npuDesc = torch_npu::NPUTensorImpl::GetStorageInfo(src); + + src.set_(src.storage(), 0, src_npuDesc->base_sizes_, src_npuDesc->base_strides_); + at::Tensor src_tmp = src.reshape({src.size(0),src.size(1)/16,src.size(2),src.size(3)*16}); + StorageDescHelper::ReflushDescBySelf(src_tmp); + // std::cout << "src_tmp storage_offset(): " << src_tmp.storage_offset() << std::endl; // std::cout << "src_tmp sizes(): " << src_tmp.sizes() << std::endl; // std::cout << "src_tmp strides(): " << src_tmp.strides() << std::endl; @@ -192,23 +191,20 @@ namespace at_npu } void NpuUtils::RefreshFormat(const at::Tensor &tensor) { - auto &tensor_desc = tensor.storage().unsafeGetStorageImpl()->npu_desc_; - if (tensor_desc.storage_sizes_.size() == 4 && tensor_desc.npu_format_ == ACL_FORMAT_ND) - { - tensor_desc.npu_format_ = ACL_FORMAT_NCHW; - tensor_desc.origin_format_ = ACL_FORMAT_NCHW; - } - else if (tensor_desc.storage_sizes_.size() != 4 && tensor_desc.npu_format_ == ACL_FORMAT_NCHW) - { - tensor_desc.npu_format_ = ACL_FORMAT_ND; - tensor_desc.origin_format_ = ACL_FORMAT_ND; + auto* tensor_desc = torch_npu::NPUTensorImpl::GetStorageInfo(tensor); + if (tensor_desc->storage_sizes_.size() == 4 && tensor_desc->npu_format_ == ACL_FORMAT_ND) { + tensor_desc->npu_format_ = ACL_FORMAT_NCHW; + tensor_desc->origin_format_ = ACL_FORMAT_NCHW; + } else if (tensor_desc->storage_sizes_.size() != 4 && tensor_desc->npu_format_ == ACL_FORMAT_NCHW) { + tensor_desc->npu_format_ = ACL_FORMAT_ND; + tensor_desc->origin_format_ = ACL_FORMAT_ND; } } at::Tensor deal_with_5d_5d_match(const at::Tensor &src) { - auto src_desc = src.storage().unsafeGetStorageImpl()->npu_desc_; - at::Tensor src_new = at::empty_with_format(src_desc.base_sizes_, src.options(), ACL_FORMAT_NC1HWC0); + auto* src_desc = torch_npu::NPUTensorImpl::GetStorageInfo(src); + 
diff --git a/torch_npu/csrc/framework/utils/OpPreparation.cpp b/torch_npu/csrc/framework/utils/OpPreparation.cpp
index bf70fb7277..cc09d545a6 100644
--- a/torch_npu/csrc/framework/utils/OpPreparation.cpp
+++ b/torch_npu/csrc/framework/utils/OpPreparation.cpp
@@ -17,6 +17,7 @@
 #include "torch_npu/csrc/framework/FormatHelper.h"
 #include "torch_npu/csrc/framework/InferFormat.h"
 #include "torch_npu/csrc/framework/utils/CalcuOpUtil.h"
+#include "torch_npu/csrc/core/tensor_impl.h"

 namespace at_npu
 {
@@ -189,15 +190,15 @@ namespace at_npu

     at::Tensor OpPreparation::CastBackToOriFormat(const at::Tensor &tensor)
     {
-      auto &tensor_desc = tensor.storage().unsafeGetStorageImpl()->npu_desc_;
-      auto ret = NPUNativeFunctions::npu_format_cast(tensor, tensor_desc.origin_format_);
+      auto* tensor_desc = torch_npu::NPUTensorImpl::GetStorageInfo(tensor);
+      auto ret = NPUNativeFunctions::npu_format_cast(tensor, tensor_desc->origin_format_);
       return ret;
     }

     at::Tensor &OpPreparation::CastBackToOriFormat(at::Tensor &tensor)
     {
-      auto &tensor_desc = tensor.storage().unsafeGetStorageImpl()->npu_desc_;
-      tensor.npu_format_cast_(tensor_desc.origin_format_);
+      auto* tensor_desc = torch_npu::NPUTensorImpl::GetStorageInfo(tensor);
+      tensor.npu_format_cast_(tensor_desc->origin_format_);
       return tensor;
     }
-- 
Gitee
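NOTE (review aid): a second cleanup recurs through this patch. Every site that hand-reset base_sizes_, base_strides_, and storage_sizes_ after a reshape now calls the existing StorageDescHelper::ReflushDescBySelf helper instead. A minimal sketch of what such a refresh amounts to, assuming the descriptor should simply mirror the tensor's current metadata (all type names below are stand-ins):

    #include <vector>

    struct Desc {                        // stands in for NPUStorageInfo
      std::vector<long> base_sizes_, base_strides_, storage_sizes_;
    };

    struct Tensor {                      // minimal stand-in for at::Tensor
      std::vector<long> sizes, strides;
      Desc desc;
    };

    // Sketch of ReflushDescBySelf: derive the descriptor from the tensor
    // itself, replacing the three hand-written assignments at each call site.
    void reflush_desc_by_self(Tensor& t) {
      t.desc.base_sizes_ = t.sizes;      // base view shape = current shape
      t.desc.base_strides_ = t.strides;  // base strides = current strides
      t.desc.storage_sizes_ = t.sizes;   // physical shape = current shape
    }

Centralizing the refresh keeps the three fields from drifting out of sync when a new call site forgets one of them.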
From bb578c99cfc5f33dc5b47fc90307845bae4b0956 Mon Sep 17 00:00:00 2001
From: "zhousinan@huawei.com" <zhousinan@huawei.com>
Date: Fri, 28 Jan 2022 15:14:43 +0800
Subject: [PATCH 3/6] add torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp

---
 torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp | 4 ++--
 torch_npu/csrc/framework/OpParamMaker.h                | 2 +-
 torch_npu/csrc/framework/StorageDescHelper.cpp         | 9 +++++----
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp
index 900c664fc5..38c30fdf9e 100644
--- a/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp
@@ -66,9 +66,9 @@ namespace at_npu
         outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
     // use 5HD in Relu
-    if ((torch_npu::NPUTensorImpl::GetStorageInfo(grad_output)->npu_format_ == 
+    if ((torch_npu::NPUTensorImpl::GetStorageInfo(grad_output)->npu_format_ ==
          ACL_FORMAT_NCHW) &&
-        (torch_npu::NPUTensorImpl::GetStorageInfo(self)->npu_format_ == 
+        (torch_npu::NPUTensorImpl::GetStorageInfo(self)->npu_format_ ==
          ACL_FORMAT_NC1HWC0))
     {
       at::Tensor grad_output_5HD =
diff --git a/torch_npu/csrc/framework/OpParamMaker.h b/torch_npu/csrc/framework/OpParamMaker.h
index 7163626aaf..fea67d7497 100644
--- a/torch_npu/csrc/framework/OpParamMaker.h
+++ b/torch_npu/csrc/framework/OpParamMaker.h
@@ -17,13 +17,13 @@
 #define __PULGIN_NATIVE_UTILS_OP_PARAM_MAKER__

 #include <ATen/ATen.h>
+#include <c10/npu/NPUCachingAllocator.h>

 #include "third_party/acl/inc/acl/acl_base.h"
 #include "torch_npu/csrc/framework/interface/AclOpCompileInterface.h"
 #include "torch_npu/csrc/framework/NPUDefine.h"
 #include "torch_npu/csrc/framework/interface/Graph.h"
 #include "torch_npu/csrc/core/tensor_impl.h"
-#include "c10/npu/NPUCachingAllocator.h"

 namespace at_npu
 {
diff --git a/torch_npu/csrc/framework/StorageDescHelper.cpp b/torch_npu/csrc/framework/StorageDescHelper.cpp
index a43e24d28b..e438cdb7e1 100644
--- a/torch_npu/csrc/framework/StorageDescHelper.cpp
+++ b/torch_npu/csrc/framework/StorageDescHelper.cpp
@@ -106,7 +106,7 @@ namespace at_npu
    }

    void StorageDescHelper::SetDesc(at::Tensor& dst, c10::IntArrayRef size, c10::IntArrayRef strides)
-    { 
+    {
      if (dst.storage().unsafeGetStorageImpl()->data() == nullptr) {
        return;
      }
@@ -116,7 +116,7 @@ namespace at_npu

    void StorageDescHelper::SetDesc(at::Tensor &dst, c10::IntArrayRef size, c10::IntArrayRef strides, aclFormat format)
    {
-      if (dst.storage().unsafeGetStorageImpl()->data() == nullptr) { 
+      if (dst.storage().unsafeGetStorageImpl()->data() == nullptr) {
        return;
      }
@@ -133,7 +133,8 @@ namespace at_npu
      CopyDesc(dst, *torch_npu::NPUTensorImpl::GetStorageInfo(*src.unsafeGetStorageImpl()));
    }

-    void StorageDescHelper::CopyDesc(const at::Tensor& dst, const c10::npu::NPUCachingAllocator::NPUStorageInfo& src_desc) {
+    void StorageDescHelper::CopyDesc(const at::Tensor& dst, const c10::npu::NPUCachingAllocator::NPUStorageInfo& src_desc)
+    {
      if (dst.storage().unsafeGetStorageImpl()->data() == nullptr) {
        return;
      }
@@ -143,7 +144,7 @@ namespace at_npu
    }

    void StorageDescHelper::ReflushDescBySelf(const at::Tensor& src)
-    { 
+    {
      if (src.storage().unsafeGetStorageImpl()->data() == nullptr) {
        return;
      }
-- 
Gitee
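NOTE (review aid): the ThresholdBackwardKernelNpu hunk in patch 3 is whitespace-only, but it sits on the interesting branch: when the saved input is already NC1HWC0 ("5HD") while the incoming gradient is plain NCHW, the gradient is promoted so the backward Relu runs entirely in 5HD. Also note that the two angle-bracket include lines in the OpParamMaker.h hunk lost their targets in extraction; they are reconstructed above as <ATen/ATen.h> and <c10/npu/NPUCachingAllocator.h> and should be read as best-effort guesses. Schematically, the promotion decision looks like this (the enum and function are placeholders; the real format codes come from ACL):

    // Placeholder model of the "use 5HD in Relu" branch condition.
    enum class Format { NCHW, NC1HWC0 };

    bool should_promote_grad_to_5hd(Format grad_output_fmt, Format self_fmt) {
      // Promote only on the exact NCHW-gradient / 5HD-input mismatch;
      // every other combination takes the ordinary path.
      return grad_output_fmt == Format::NCHW && self_fmt == Format::NC1HWC0;
    }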
"torch_npu/csrc/framework/interface/Graph.h" #include "torch_npu/csrc/core/tensor_impl.h" -#include "c10/npu/NPUCachingAllocator.h" namespace at_npu { diff --git a/torch_npu/csrc/framework/StorageDescHelper.cpp b/torch_npu/csrc/framework/StorageDescHelper.cpp index a43e24d28b..e438cdb7e1 100644 --- a/torch_npu/csrc/framework/StorageDescHelper.cpp +++ b/torch_npu/csrc/framework/StorageDescHelper.cpp @@ -106,7 +106,7 @@ namespace at_npu } void StorageDescHelper::SetDesc(at::Tensor& dst, c10::IntArrayRef size, c10::IntArrayRef strides) - { + { if (dst.storage().unsafeGetStorageImpl()->data() == nullptr) { return; } @@ -116,7 +116,7 @@ namespace at_npu void StorageDescHelper::SetDesc(at::Tensor &dst, c10::IntArrayRef size, c10::IntArrayRef strides, aclFormat format) { - if (dst.storage().unsafeGetStorageImpl()->data() == nullptr) { + if (dst.storage().unsafeGetStorageImpl()->data() == nullptr) { return; } @@ -133,7 +133,8 @@ namespace at_npu CopyDesc(dst, *torch_npu::NPUTensorImpl::GetStorageInfo(*src.unsafeGetStorageImpl())); } - void StorageDescHelper::CopyDesc(const at::Tensor& dst, const c10::npu::NPUCachingAllocator::NPUStorageInfo& src_desc) { + void StorageDescHelper::CopyDesc(const at::Tensor& dst, const c10::npu::NPUCachingAllocator::NPUStorageInfo& src_desc) + { if (dst.storage().unsafeGetStorageImpl()->data() == nullptr) { return; } @@ -143,7 +144,7 @@ namespace at_npu } void StorageDescHelper::ReflushDescBySelf(const at::Tensor& src) - { + { if (src.storage().unsafeGetStorageImpl()->data() == nullptr) { return; } -- Gitee From b4966d023d890f2b9867e4a6279af0e4dbc44ccc Mon Sep 17 00:00:00 2001 From: "zhousinan@huawei.com" Date: Sat, 29 Jan 2022 10:07:04 +0800 Subject: [PATCH 4/6] add torch_npu/csrc/framework/OpParamMaker.h --- torch_npu/csrc/framework/OpParamMaker.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torch_npu/csrc/framework/OpParamMaker.h b/torch_npu/csrc/framework/OpParamMaker.h index e2b18e9309..37ab2b2bb3 100644 --- a/torch_npu/csrc/framework/OpParamMaker.h +++ b/torch_npu/csrc/framework/OpParamMaker.h @@ -21,6 +21,7 @@ #include "third_party/acl/inc/acl/acl_base.h" #include "torch_npu/csrc/framework/interface/AclOpCompileInterface.h" +#include "torch_npu/csrc/framework/NPUDefine.h" #include "torch_npu/csrc/core/tensor_impl.h" namespace at_npu @@ -66,7 +67,7 @@ namespace at_npu AclTensorDescMaker() {} ~AclTensorDescMaker() = default; - AclTensorDescMaker &Create(aclDataType dataType, c10::NPUStorageDesc storageDesc) + AclTensorDescMaker &Create(aclDataType dataType, c10::npu::NPUCachingAllocator::NPUStorageDesc storageDesc) { auto dims = storageDesc.base_sizes_; auto format = storageDesc.origin_format_; -- Gitee From 8ff1f92cb8ded363abeb6161c55219c9b6669c88 Mon Sep 17 00:00:00 2001 From: "zhousinan@huawei.com" Date: Sat, 29 Jan 2022 10:11:21 +0800 Subject: [PATCH 5/6] add torch_npu/csrc/framework/OpParamMaker.h --- torch_npu/csrc/framework/OpParamMaker.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_npu/csrc/framework/OpParamMaker.h b/torch_npu/csrc/framework/OpParamMaker.h index 37ab2b2bb3..545d658b19 100644 --- a/torch_npu/csrc/framework/OpParamMaker.h +++ b/torch_npu/csrc/framework/OpParamMaker.h @@ -67,7 +67,7 @@ namespace at_npu AclTensorDescMaker() {} ~AclTensorDescMaker() = default; - AclTensorDescMaker &Create(aclDataType dataType, c10::npu::NPUCachingAllocator::NPUStorageDesc storageDesc) + AclTensorDescMaker &Create(aclDataType dataType, c10::npu::NPUCachingAllocator::NPUStorageInfo storageDesc) { auto dims = 
From 27d25fadb09f91f1fa281f7415806ff22f3acfb3 Mon Sep 17 00:00:00 2001
From: "zhousinan@huawei.com" <zhousinan@huawei.com>
Date: Mon, 7 Feb 2022 15:12:22 +0800
Subject: [PATCH 6/6] add torch_npu/csrc/aten/common/TensorShape.cpp

---
 torch_npu/csrc/aten/common/TensorShape.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/torch_npu/csrc/aten/common/TensorShape.cpp b/torch_npu/csrc/aten/common/TensorShape.cpp
index 4b87048538..ccb29842e4 100644
--- a/torch_npu/csrc/aten/common/TensorShape.cpp
+++ b/torch_npu/csrc/aten/common/TensorShape.cpp
@@ -33,6 +33,7 @@
 #include "torch_npu/csrc/framework/InferFormat.h"
 #include "torch_npu/csrc/aten/common/FormatCastHelper.h"
 #include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+#include "torch_npu/csrc/core/tensor_impl.h"

 namespace at_npu {
 namespace native {
@@ -52,9 +53,8 @@ at::Tensor alias_with_sizes_and_strides_npu(
     impl->set_sizes_and_strides(sizes, strides);
     self_ = at::Tensor(std::move(impl));
   } else {
-    auto impl = c10::make_intrusive<at::TensorImpl>(
+    auto impl = c10::make_intrusive<torch_npu::NPUTensorImpl>(
        c10::Storage(self.storage()),
-       self.key_set(),
        self.dtype());
     impl->set_storage_offset(self.storage_offset());
     impl->set_sizes_and_strides(sizes, strides);
@@ -91,9 +91,8 @@ at::Tensor NPUNativeFunctions::as_strided(
     dst = FormatCastHelper::ApplyBaseFormatTensorBy(dst);
   }
   auto storage_offset = storage_offset_.value_or(dst.storage_offset());
-  auto result = at::detail::make_tensor<at::TensorImpl>(
+  auto result = at::detail::make_tensor<torch_npu::NPUTensorImpl>(
      c10::Storage(dst.storage()),
-     dst.key_set(),
      dst.dtype());
   at::native::setStrided(result, size, stride, storage_offset);
   return result;
-- 
Gitee
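NOTE (review aid): patch 6 drops the explicit key_set() argument when building the aliased tensor, which only works if torch_npu::NPUTensorImpl installs its own dispatch keys in its constructor. The template arguments in the two rewritten calls were lost in extraction and are reconstructed above (at::TensorImpl before, torch_npu::NPUTensorImpl after); treat that as an editorial assumption. A self-contained model of the ownership shift follows, with every type a stand-in rather than the real c10 API:

    #include <cassert>
    #include <memory>

    enum class Key { CPU, NPU };

    struct BaseImpl {                       // stands in for at::TensorImpl
      Key key;
      explicit BaseImpl(Key k) : key(k) {}  // caller must thread the key through
    };

    struct NpuImpl : BaseImpl {             // stands in for torch_npu::NPUTensorImpl
      NpuImpl() : BaseImpl(Key::NPU) {}     // fixes its own key; callers pass nothing
    };

    int main() {
      auto old_style = std::make_shared<BaseImpl>(Key::NPU);  // key chosen at call site
      auto new_style = std::make_shared<NpuImpl>();           // key owned by the impl
      assert(old_style->key == new_style->key);
      return 0;
    }

Moving the key into the impl means no call site can accidentally alias an NPU storage under the wrong dispatch key.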