From 413e33e6086f19b6ef576e3d092d709227ebebb8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=B0=A4=E5=AE=89=E5=8D=87?= <youansheng@huawei.com>
Date: Fri, 11 Feb 2022 11:50:55 +0800
Subject: [PATCH 01/12] Add License for version file.

---
 torch_npu/version.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/torch_npu/version.py b/torch_npu/version.py
index 32fb6cf7bf..8790208db4 100644
--- a/torch_npu/version.py
+++ b/torch_npu/version.py
@@ -1 +1,16 @@
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 __version__ = "1.8.1rc1"
-- 
Gitee


From 83bb19062c26ed22fce1cebc77ad2b89fd142a00 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=B0=A4=E5=AE=89=E5=8D=87?= <youansheng@huawei.com>
Date: Sat, 12 Feb 2022 11:12:53 +0800
Subject: [PATCH 02/12] Fix calling custom ops.

---
 .../PyTorch Operator Development Guide.md             |  6 +++---
 ...45\274\200\345\217\221\346\214\207\345\215\227.md" |  6 +++---
 torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp    |  4 ++--
 torch_npu/csrc/aten/common/TensorFactories.cpp        | 11 ++++++-----
 torch_npu/csrc/aten/ops/AddKernelNpu.cpp              |  6 +++---
 torch_npu/csrc/aten/ops/AnyKernelNpu.cpp              |  4 ++--
 torch_npu/csrc/aten/ops/BmmKernelNpu.cpp              |  4 ++--
 torch_npu/csrc/aten/ops/BroadcastKernelNpu.cpp        |  2 +-
 torch_npu/csrc/aten/ops/DivKernelNpu.cpp              |  4 ++--
 torch_npu/csrc/aten/ops/EqKernelNpu.cpp               |  8 ++++----
 torch_npu/csrc/aten/ops/GtKernelNpu.cpp               |  8 ++++----
 torch_npu/csrc/aten/ops/LtKernelNpu.cpp               |  8 ++++----
 torch_npu/csrc/aten/ops/MeanKernelNpu.cpp             |  2 +-
 torch_npu/csrc/aten/ops/MmKernelNpu.cpp               |  4 ++--
 torch_npu/csrc/aten/ops/MulKernelNpu.cpp              |  4 ++--
 torch_npu/csrc/aten/ops/NegKernelNpu.cpp              |  2 +-
 torch_npu/csrc/aten/ops/NormalKernelNpu.cpp           |  2 +-
 torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp         |  2 +-
 torch_npu/csrc/aten/ops/ReluKernelNpu.cpp             |  2 +-
 torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp        |  2 +-
 torch_npu/csrc/aten/ops/SubKernelNpu.cpp              |  4 ++--
 torch_npu/csrc/aten/ops/SumKernelNpu.cpp              |  2 +-
 .../csrc/aten/ops/ThresholdBackwardKernelNpu.cpp      |  2 +-
 torch_npu/csrc/aten/ops/TopKKernelNpu.cpp             |  8 ++++----
 torch_npu/csrc/aten/ops/TransposeKernelNpu.cpp        |  2 +-
 torch_npu/csrc/aten/ops/ZerosLikeKernelNpu.cpp        |  9 +++------
 .../aten/ops/convolution/Conv2dBackwardKernelNpu.cpp  |  8 ++++----
 .../aten/ops/convolution/Conv3dBackwardKernelNpu.cpp  |  6 +++---
 .../aten/ops/convolution/ConvTranspose2dKernelNpu.cpp |  2 +-
 .../csrc/aten/ops/loss/NLLLossBackwardKernelNpu.cpp   |  2 +-
 torch_npu/csrc/aten/ops/loss/NLLLossKernelNpu.cpp     |  4 ++--
 torch_npu/csrc/distributed/reducer.cpp                |  7 ++++---
 torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp |  2 +-
 torch_npu/csrc/framework/contiguous/combined_opt.cpp  |  4 +++-
 torch_npu/csrc/framework/utils/NpuUtils.cpp           |  6 +++---
 torch_npu/csrc/framework/utils/OpPreparation.cpp      |  6 ++++--
 36 files changed, 84 insertions(+), 81 deletions(-)

diff --git a/docs/en/PyTorch Operator Development Guide/PyTorch Operator Development Guide.md b/docs/en/PyTorch Operator Development Guide/PyTorch Operator Development Guide.md
index 48ddd58898..698b2ac1c7 100644
--- a/docs/en/PyTorch Operator Development Guide/PyTorch Operator Development Guide.md	
+++ b/docs/en/PyTorch Operator Development Guide/PyTorch Operator Development Guide.md	
@@ -515,7 +515,7 @@ The following uses the torch.add\(\) operator as an example to describe how to a
             Scalar other_c1_offset(
                 other.storage_offset() / (other.size(2) * other.size(3) * c0_len));
             Scalar stride_len(self.size(1) / c0_len);
-            Tensor result = at::npu_stride_add(
+            Tensor result = NPUNativeFunctions::npu_stride_add(
                 self_use, other_use, self_c1_offset, other_c1_offset, stride_len);
             return result;
           }
@@ -524,7 +524,7 @@ The following uses the torch.add\(\) operator as an example to describe how to a
           auto outputSize = broadcast_ops_npu_output_size(self, other);
         
           // construct the output tensor of the NPU
-          Tensor result = at::empty_with_format(
+          Tensor result = NPUNativeFunctions::empty_with_format(
               outputSize,
               outputTensor.options(),
               CalcuOpUtil::get_tensor_npu_format(outputTensor));
@@ -541,7 +541,7 @@ The following uses the torch.add\(\) operator as an example to describe how to a
           // calculate the output size
           auto outputSize = input_same_output_size(self);
           // construct the output tensor of the NPU
-          Tensor result = at::empty_with_format(
+          Tensor result = NPUNativeFunctions::empty_with_format(
               outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
         
           // calculate the output result of the NPU
diff --git "a/docs/zh/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227.md" "b/docs/zh/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227.md"
index f48ac35865..d38aa6efd3 100644
--- "a/docs/zh/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227.md"
+++ "b/docs/zh/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227.md"
@@ -515,7 +515,7 @@ PyTorch算子开发包含TBE算子开发和PyTorch框架下的算子适配。
             Scalar other_c1_offset(
                 other.storage_offset() / (other.size(2) * other.size(3) * c0_len));
             Scalar stride_len(self.size(1) / c0_len);
-            Tensor result = at::npu_stride_add(
+            Tensor result = NPUNativeFunctions::npu_stride_add(
                 self_use, other_use, self_c1_offset, other_c1_offset, stride_len);
             return result;
           }
@@ -524,7 +524,7 @@ PyTorch算子开发包含TBE算子开发和PyTorch框架下的算子适配。
           auto outputSize = broadcast_ops_npu_output_size(self, other);
         
           // construct the output tensor of the NPU
-          Tensor result = at::empty_with_format(
+          Tensor result = NPUNativeFunctions::empty_with_format(
               outputSize,
               outputTensor.options(),
               CalcuOpUtil::get_tensor_npu_format(outputTensor));
@@ -541,7 +541,7 @@ PyTorch算子开发包含TBE算子开发和PyTorch框架下的算子适配。
           // calculate the output size
           auto outputSize = input_same_output_size(self);
           // construct the output tensor of the NPU
-          Tensor result = at::empty_with_format(
+          Tensor result = NPUNativeFunctions::empty_with_format(
               outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
         
           // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp b/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp
index 2139cfdb4a..506686f3c6 100644
--- a/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp
+++ b/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp
@@ -76,7 +76,7 @@ at::Tensor NPUNativeFunctions::npu_format_cast(
   TORCH_CHECK(src.scalar_type() == at::ScalarType::Float || src.scalar_type() == at::ScalarType::Half,
       "can not cast format when src is not float32 or float16");
 
-  at::Tensor dst = at::empty_with_format(
+  at::Tensor dst = NPUNativeFunctions::empty_with_format(
       src_desc.base_sizes_, src.options(), acl_format);
 
   // calculate the output result of the NPU
@@ -105,7 +105,7 @@ at::Tensor& NPUNativeFunctions::npu_format_cast_(
   TORCH_CHECK(src.scalar_type() == at::ScalarType::Float || src.scalar_type() == at::ScalarType::Half,
       "can not cast format when src is not float32 or float16");
 
-  at::Tensor dst = at::empty_with_format(
+  at::Tensor dst = NPUNativeFunctions::empty_with_format(
       src_desc.base_sizes_, src.options(), acl_format);
 
   // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/common/TensorFactories.cpp b/torch_npu/csrc/aten/common/TensorFactories.cpp
index 3d95b195a1..fdb4634709 100644
--- a/torch_npu/csrc/aten/common/TensorFactories.cpp
+++ b/torch_npu/csrc/aten/common/TensorFactories.cpp
@@ -30,11 +30,12 @@
 #include <ATen/NamedTensorUtils.h>
 #include <c10/util/Exception.h>
 #include <c10/npu/NPUCachingAllocator.h>
+#include <ATen/record_function.h>
+
 #include "torch_npu/csrc/aten/common/ResizeNpu.h"
 #include "torch_npu/csrc/framework/StorageDescHelper.h"
 #include "torch_npu/csrc/framework/InferFormat.h"
 #include "torch_npu/csrc/aten/common/InnerNpuNativeFunction.h"
-#include <ATen/record_function.h>
 #include "torch_npu/csrc/framework/utils/OpAdapter.h"
 #include "torch_npu/csrc/aten/NPUNativeFunctions.h"
 #include "torch_npu/csrc/core/tensor_impl.h"
@@ -230,7 +231,7 @@ namespace at_npu
         {
           auto npu_format =
               self.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_;
-          result = at::empty_with_format(self.sizes(), self.options(), npu_format);
+          result = NPUNativeFunctions::empty_with_format(self.sizes(), self.options(), npu_format);
         }
       }
 
@@ -347,7 +348,7 @@ namespace at_npu
       options.layout(layout_opt);
       options.pinned_memory(pin_memory_opt);
       at::Tensor result =
-          at::empty_with_format(size, options, dst_format);
+          NPUNativeFunctions::empty_with_format(size, options, dst_format);
       if (names.has_value())
       {
         internal_set_names_inplace(result, names);
@@ -361,7 +362,7 @@ namespace at_npu
                                      int64_t dst_format)
     {
       at::Tensor result =
-          at::empty_with_format(size, options, dst_format);
+          NPUNativeFunctions::empty_with_format(size, options, dst_format);
       if (names.has_value())
       {
         internal_set_names_inplace(result, names);
@@ -376,7 +377,7 @@ namespace at_npu
                                           int64_t dst_format)
     {
       at::Tensor result =
-          at::empty_with_format(size, options, dst_format);
+          NPUNativeFunctions::empty_with_format(size, options, dst_format);
       if (names.has_value())
       {
         internal_set_names_inplace(result, names);
diff --git a/torch_npu/csrc/aten/ops/AddKernelNpu.cpp b/torch_npu/csrc/aten/ops/AddKernelNpu.cpp
index 363599e38a..08793a06e7 100644
--- a/torch_npu/csrc/aten/ops/AddKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/AddKernelNpu.cpp
@@ -170,7 +170,7 @@ namespace at_npu
       else
       {
         c10::NPUStorageDesc src_desc = src.storage().unsafeGetStorageImpl()->npu_desc_;
-        at::Tensor src_new = at::empty_with_format(
+        at::Tensor src_new = NPUNativeFunctions::empty_with_format(
             src_desc.base_sizes_, src.options(), ACL_FORMAT_NC1HWC0);
         src_new.set_(
             src.storage(),
@@ -206,7 +206,7 @@ namespace at_npu
       auto outputSize = broadcast_ops_npu_output_size(self, other);
 
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = NPUNativeFunctions::empty_with_format(
           outputSize,
           outputTensor.options(),
           CalcuOpUtil::get_tensor_npu_format(outputTensor));
@@ -223,7 +223,7 @@ namespace at_npu
       // calculate the output size
       auto outputSize = input_same_output_size(self);
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = NPUNativeFunctions::empty_with_format(
           outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
 
       // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/AnyKernelNpu.cpp b/torch_npu/csrc/aten/ops/AnyKernelNpu.cpp
index 7cdac57d78..63b1271f93 100644
--- a/torch_npu/csrc/aten/ops/AnyKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/AnyKernelNpu.cpp
@@ -70,7 +70,7 @@ at::Tensor NPUNativeFunctions::any(const at::Tensor& self, int64_t dim, bool kee
   auto outputSize = reduce_ops_npu_output_size(self, dims, keepdim);
 
   // construct the output tensor of the NPU
-  at::Tensor result = at::empty_with_format(
+  at::Tensor result = NPUNativeFunctions::empty_with_format(
       outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
 
   // calculate the output result of the NPU  
@@ -88,7 +88,7 @@ at::Tensor NPUNativeFunctions::any(const at::Tensor& self) {
   // when self's dim = 0, convert [1] tensor and reduce it
   if (self.dim() == 0) {
       at::Tensor self_tmp = self;
-      self_tmp = at::empty_with_format(
+      self_tmp = NPUNativeFunctions::empty_with_format(
           {1}, 
           self.options().dtype(at::ScalarType::Float), 
           CalcuOpUtil::get_tensor_npu_format(self))
diff --git a/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp b/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp
index 38aceb87fd..c93fbe3cc8 100644
--- a/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp
@@ -73,9 +73,9 @@ at::Tensor NPUNativeFunctions::bmm(const at::Tensor& self, const at::Tensor& mat
   // 检查是否指定mm输出为NCHW。待NLP模型总体策略制定后删去
   if ((self.scalar_type() == at::ScalarType::Float || self.scalar_type() == at::ScalarType::Half) &&
       !c10::npu::OptionsManager::CheckSwitchMMOutputEnable()) {
-    result = at::empty_with_format(outputSize, self.options(), ACL_FORMAT_FRACTAL_NZ);
+    result = NPUNativeFunctions::empty_with_format(outputSize, self.options(), ACL_FORMAT_FRACTAL_NZ);
   } else {
-    result = at::empty_with_format(outputSize, self.options(), ACL_FORMAT_ND);
+    result = NPUNativeFunctions::empty_with_format(outputSize, self.options(), ACL_FORMAT_ND);
   }
 
   // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/BroadcastKernelNpu.cpp b/torch_npu/csrc/aten/ops/BroadcastKernelNpu.cpp
index 79c6fdae29..5903959bab 100644
--- a/torch_npu/csrc/aten/ops/BroadcastKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/BroadcastKernelNpu.cpp
@@ -53,7 +53,7 @@ namespace at_npu
         input = input.to(at::kInt);
       }
 
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = NPUNativeFunctions::empty_with_format(
           size,
           input.options(),
           CalcuOpUtil::get_tensor_npu_format(self));
diff --git a/torch_npu/csrc/aten/ops/DivKernelNpu.cpp b/torch_npu/csrc/aten/ops/DivKernelNpu.cpp
index a033c8c511..76865efccb 100644
--- a/torch_npu/csrc/aten/ops/DivKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/DivKernelNpu.cpp
@@ -85,7 +85,7 @@ namespace at_npu
       auto outputSize = broadcast_ops_npu_output_size(self, other);
 
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = NPUNativeFunctions::empty_with_format(
           outputSize,
           outputTensor.options(),
           CalcuOpUtil::get_tensor_npu_format(outputTensor));
@@ -102,7 +102,7 @@ namespace at_npu
       auto outputSize = input_same_output_size(self);
 
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = NPUNativeFunctions::empty_with_format(
           outputSize,
           self.options(),
           CalcuOpUtil::get_tensor_npu_format(self));
diff --git a/torch_npu/csrc/aten/ops/EqKernelNpu.cpp b/torch_npu/csrc/aten/ops/EqKernelNpu.cpp
index 3dff364228..90b74193dc 100644
--- a/torch_npu/csrc/aten/ops/EqKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/EqKernelNpu.cpp
@@ -95,7 +95,7 @@ namespace at_npu
       auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther);
 
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = NPUNativeFunctions::empty_with_format(
           outputSize,
           formatCastOfSelf.options().dtype(at::kBool),
           ACL_FORMAT_ND);
@@ -113,7 +113,7 @@ namespace at_npu
       auto outputSize = input_same_output_size(formatCastOfSelf);
 
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = NPUNativeFunctions::empty_with_format(
           outputSize,
           formatCastOfSelf.options().dtype(at::kBool),
           ACL_FORMAT_ND);
@@ -131,7 +131,7 @@ namespace at_npu
       c10::SmallVector<at::Tensor, N> outputs = {self};
       CalcuOpUtil::check_memory_over_laps(inputs, outputs);
 
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = NPUNativeFunctions::empty_with_format(
           self.sizes(),
           self.options().dtype(c10::ScalarType::Byte),
           CalcuOpUtil::get_tensor_npu_format(self));
@@ -159,7 +159,7 @@ namespace at_npu
       c10::SmallVector<at::Tensor, N> outputs = {self};
       CalcuOpUtil::check_memory_over_laps(inputs, outputs);
 
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = NPUNativeFunctions::empty_with_format(
           self.sizes(),
           self.options().dtype(c10::ScalarType::Byte),
           CalcuOpUtil::get_tensor_npu_format(self));
diff --git a/torch_npu/csrc/aten/ops/GtKernelNpu.cpp b/torch_npu/csrc/aten/ops/GtKernelNpu.cpp
index 50d4c3fbcd..d27161e162 100644
--- a/torch_npu/csrc/aten/ops/GtKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/GtKernelNpu.cpp
@@ -104,7 +104,7 @@ namespace at_npu
       auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther);
 
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = NPUNativeFunctions::empty_with_format(
           outputSize,
           formatCastOfSelf.options().dtype(at::kBool),
           ACL_FORMAT_ND);
@@ -121,7 +121,7 @@ namespace at_npu
       auto outputSize = input_same_output_size(formatCastOfSelf);
 
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = NPUNativeFunctions::empty_with_format(
           outputSize,
           formatCastOfSelf.options().dtype(at::kBool),
           ACL_FORMAT_ND);
@@ -139,7 +139,7 @@ namespace at_npu
       c10::SmallVector<at::Tensor, N> outputs = {self};
       CalcuOpUtil::check_memory_over_laps(inputs, outputs);
 
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = NPUNativeFunctions::empty_with_format(
           self.sizes(),
           self.options().dtype(at::ScalarType::Byte),
           CalcuOpUtil::get_tensor_npu_format(self));
@@ -167,7 +167,7 @@ namespace at_npu
       c10::SmallVector<at::Tensor, N> outputs = {self};
       CalcuOpUtil::check_memory_over_laps(inputs, outputs);
 
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = NPUNativeFunctions::empty_with_format(
           self.sizes(),
           self.options().dtype(at::ScalarType::Byte),
           CalcuOpUtil::get_tensor_npu_format(self));
diff --git a/torch_npu/csrc/aten/ops/LtKernelNpu.cpp b/torch_npu/csrc/aten/ops/LtKernelNpu.cpp
index 29cf56e55e..7ccd7c64a9 100644
--- a/torch_npu/csrc/aten/ops/LtKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/LtKernelNpu.cpp
@@ -100,7 +100,7 @@ namespace at_npu
       auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther);
 
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = NPUNativeFunctions::empty_with_format(
           outputSize,
           formatCastOfSelf.options().dtype(at::kBool));
 
@@ -116,7 +116,7 @@ namespace at_npu
       auto outputSize = input_same_output_size(formatCastOfSelf);
 
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = NPUNativeFunctions::empty_with_format(
           outputSize,
           formatCastOfSelf.options().dtype(at::kBool));
 
@@ -133,7 +133,7 @@ namespace at_npu
       c10::SmallVector<at::Tensor, N> outputs = {self};
       CalcuOpUtil::check_memory_over_laps(inputs, outputs);
 
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = NPUNativeFunctions::empty_with_format(
           self.sizes(),
           self.options().dtype(at::ScalarType::Byte),
           CalcuOpUtil::get_tensor_npu_format(self));
@@ -161,7 +161,7 @@ namespace at_npu
       c10::SmallVector<at::Tensor, N> outputs = {self};
       CalcuOpUtil::check_memory_over_laps(inputs, outputs);
 
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = NPUNativeFunctions::empty_with_format(
           self.sizes(),
           self.options().dtype(at::ScalarType::Byte),
           CalcuOpUtil::get_tensor_npu_format(self));
diff --git a/torch_npu/csrc/aten/ops/MeanKernelNpu.cpp b/torch_npu/csrc/aten/ops/MeanKernelNpu.cpp
index 0f6e2b6875..54a5768348 100644
--- a/torch_npu/csrc/aten/ops/MeanKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/MeanKernelNpu.cpp
@@ -142,7 +142,7 @@ namespace at_npu
       }
 
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = NPUNativeFunctions::empty_with_format(
           outputSize, self.options().dtype(dstType), npu_format);
 
       // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/MmKernelNpu.cpp b/torch_npu/csrc/aten/ops/MmKernelNpu.cpp
index e836bee17c..19aa9ccd83 100644
--- a/torch_npu/csrc/aten/ops/MmKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/MmKernelNpu.cpp
@@ -183,12 +183,12 @@ Return:
 
       if ((self.scalar_type() == at::ScalarType::Half) && !c10::npu::OptionsManager::CheckSwitchMMOutputEnable())
       {
-        result = at::empty_with_format(
+        result = NPUNativeFunctions::empty_with_format(
             outputSize, self.options(), ACL_FORMAT_FRACTAL_NZ);
       }
       else
       {
-        result = at::empty_with_format(outputSize, self.options());
+        result = NPUNativeFunctions::empty_with_format(outputSize, self.options());
       }
 
       // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/MulKernelNpu.cpp b/torch_npu/csrc/aten/ops/MulKernelNpu.cpp
index 7e6403e1fc..c945f876a4 100644
--- a/torch_npu/csrc/aten/ops/MulKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/MulKernelNpu.cpp
@@ -114,7 +114,7 @@ namespace at_npu
       auto outputSize = broadcast_ops_npu_output_size(selfCast, otherCast);
 
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = NPUNativeFunctions::empty_with_format(
           outputSize,
           outputTensor.options(),
           CalcuOpUtil::get_tensor_npu_format(outputTensor));
@@ -136,7 +136,7 @@ namespace at_npu
       auto outputSize = input_same_output_size(self);
 
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = NPUNativeFunctions::empty_with_format(
           outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
 
       // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/NegKernelNpu.cpp b/torch_npu/csrc/aten/ops/NegKernelNpu.cpp
index a04ed1ff90..204a62f337 100644
--- a/torch_npu/csrc/aten/ops/NegKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/NegKernelNpu.cpp
@@ -50,7 +50,7 @@ namespace at_npu
     at::Tensor NPUNativeFunctions::neg(const at::Tensor &self)
     {
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = NPUNativeFunctions::empty_with_format(
           self.sizes(), self.options(), CalcuOpUtil::get_tensor_npu_format(self));
 
       // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/NormalKernelNpu.cpp b/torch_npu/csrc/aten/ops/NormalKernelNpu.cpp
index 5135cd4a27..65063a67b8 100644
--- a/torch_npu/csrc/aten/ops/NormalKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/NormalKernelNpu.cpp
@@ -182,7 +182,7 @@ namespace at_npu
         c10::optional<bool> pin_memory_opt)
     {
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = NPUNativeFunctions::empty_with_format(
           size, dtype_opt, layout_opt, device_opt, pin_memory_opt, ACL_FORMAT_ND);
 
       // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp b/torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp
index 12dd1e7542..a4edb6cad0 100644
--- a/torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp
@@ -45,7 +45,7 @@ namespace at_npu
 
       auto outputSize = input_same_output_size(self);
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(outputSize,
+      at::Tensor result = NPUNativeFunctions::empty_with_format(outputSize,
                                                 dtype_opt,
                                                 layout_opt,
                                                 device_opt,
diff --git a/torch_npu/csrc/aten/ops/ReluKernelNpu.cpp b/torch_npu/csrc/aten/ops/ReluKernelNpu.cpp
index d8ab551733..410f091e60 100644
--- a/torch_npu/csrc/aten/ops/ReluKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/ReluKernelNpu.cpp
@@ -65,7 +65,7 @@ namespace at_npu
       auto outputSize = input_same_output_size(self);
 
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = NPUNativeFunctions::empty_with_format(
           outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
 
       // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp b/torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp
index 79783b2f4c..ca6f03c9c7 100644
--- a/torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp
@@ -119,7 +119,7 @@ namespace at_npu
       outputSize[1] = c1_len.toInt() * 16;
 
       // construct the output at::Tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = NPUNativeFunctions::empty_with_format(
           outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
 
       // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/SubKernelNpu.cpp b/torch_npu/csrc/aten/ops/SubKernelNpu.cpp
index fe90315466..914664cd96 100644
--- a/torch_npu/csrc/aten/ops/SubKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/SubKernelNpu.cpp
@@ -103,7 +103,7 @@ namespace at_npu
       auto outputSize = broadcast_ops_npu_output_size(self, other);
 
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = NPUNativeFunctions::empty_with_format(
           outputSize,
           outputTensor.options(),
           CalcuOpUtil::get_tensor_npu_format(outputTensor));
@@ -120,7 +120,7 @@ namespace at_npu
       auto outputSize = input_same_output_size(self);
 
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = NPUNativeFunctions::empty_with_format(
           outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
 
       // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/SumKernelNpu.cpp b/torch_npu/csrc/aten/ops/SumKernelNpu.cpp
index 48129f2a6b..92012d2da5 100644
--- a/torch_npu/csrc/aten/ops/SumKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/SumKernelNpu.cpp
@@ -197,7 +197,7 @@ namespace at_npu
       }
 
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = NPUNativeFunctions::empty_with_format(
           outputSize, self.options().dtype(dstType), npu_format);
 
       // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp
index 537bcc2444..3a4bdabee3 100644
--- a/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp
@@ -62,7 +62,7 @@ namespace at_npu
       auto outputSize = input_same_output_size(self);
 
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = NPUNativeFunctions::empty_with_format(
           outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
 
       // use 5HD in Relu
diff --git a/torch_npu/csrc/aten/ops/TopKKernelNpu.cpp b/torch_npu/csrc/aten/ops/TopKKernelNpu.cpp
index 0bddf27eae..3f34261d8d 100644
--- a/torch_npu/csrc/aten/ops/TopKKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/TopKKernelNpu.cpp
@@ -219,11 +219,11 @@ namespace at_npu
         // construct the output tensor of the NPU
         at::Tensor transposeSelf = NPUNativeFunctions::npu_transpose(self, perm);
         auto outputSize = transpose_npu_output_size(values, perm);
-        at::Tensor transposeValue = at::empty_with_format(
+        at::Tensor transposeValue = NPUNativeFunctions::empty_with_format(
             outputSize,
             values.options(),
             CalcuOpUtil::get_tensor_npu_format(values));
-        at::Tensor transposeIndices = at::empty_with_format(
+        at::Tensor transposeIndices = NPUNativeFunctions::empty_with_format(
             outputSize,
             indices.options(),
             CalcuOpUtil::get_tensor_npu_format(indices));
@@ -290,9 +290,9 @@ namespace at_npu
       // calculate the output size
       auto outputSize = topk_npu_output_size(selfCp, k, dim, largest, sorted);
       // construct the output tensor of the NPU
-      at::Tensor values = at::empty_with_format(
+      at::Tensor values = NPUNativeFunctions::empty_with_format(
           outputSize, selfCp.options(), CalcuOpUtil::get_tensor_npu_format(selfCp));
-      at::Tensor indices = at::empty_with_format(
+      at::Tensor indices = NPUNativeFunctions::empty_with_format(
           outputSize, selfCp.options().dtype(at::kInt), ACL_FORMAT_ND);
 
       // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/TransposeKernelNpu.cpp b/torch_npu/csrc/aten/ops/TransposeKernelNpu.cpp
index fcda11d5bc..085c49f0fe 100644
--- a/torch_npu/csrc/aten/ops/TransposeKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/TransposeKernelNpu.cpp
@@ -76,7 +76,7 @@ namespace at_npu
     {
       RECORD_FUNCTION("transpose_to_contiguous", vector<c10::IValue>({self}));
       int64_t self_format = CalcuOpUtil::get_tensor_npu_format(self);
-      at::Tensor result = at::empty_with_format(self.sizes(), self.options(), self_format);
+      at::Tensor result = NPUNativeFunctions::empty_with_format(self.sizes(), self.options(), self_format);
 
       // obtain the transpose axises
       at::IntArrayRef dim;
diff --git a/torch_npu/csrc/aten/ops/ZerosLikeKernelNpu.cpp b/torch_npu/csrc/aten/ops/ZerosLikeKernelNpu.cpp
index 18ec3038d1..2f4775751d 100644
--- a/torch_npu/csrc/aten/ops/ZerosLikeKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/ZerosLikeKernelNpu.cpp
@@ -56,12 +56,9 @@ namespace at_npu
 
       auto outputSize = input_same_output_size(self);
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(outputSize,
-                                                dtype_opt,
-                                                layout_opt,
-                                                device_opt,
-                                                pin_memory_opt,
-                                                CalcuOpUtil::get_tensor_npu_format(self));
+      at::Tensor result = NPUNativeFunctions::empty_with_format(
+        outputSize, dtype_opt, layout_opt, device_opt, pin_memory_opt,
+        CalcuOpUtil::get_tensor_npu_format(self));
 
       // calculate the output result of the NPU
       return result.zero_();
diff --git a/torch_npu/csrc/aten/ops/convolution/Conv2dBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/convolution/Conv2dBackwardKernelNpu.cpp
index dcddfa5d65..f6eb56eca5 100644
--- a/torch_npu/csrc/aten/ops/convolution/Conv2dBackwardKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/convolution/Conv2dBackwardKernelNpu.cpp
@@ -213,7 +213,7 @@ tuple<at::Tensor, at::Tensor, at::Tensor> NPUNativeFunctions::npu_conv2d_backwar
   at::Tensor gradBias;
   // construct the output tensor of the NPU
   if (grad_input_mask[0]) {
-    gradInput = at::empty_with_format(
+    gradInput = NPUNativeFunctions::empty_with_format(
         std::get<0>(outputSizes), input.options(), ACL_FORMAT_NC1HWC0);
   }
 
@@ -221,12 +221,12 @@ tuple<at::Tensor, at::Tensor, at::Tensor> NPUNativeFunctions::npu_conv2d_backwar
     // For group conv2d: keep consistent with weight to avoid allreduce accuracy problem.
     // For more info: https://gitee.com/ascend/pytorch-develop/pulls/2255
     if (groups > 1) {
-      gradWeight = at::empty_with_format(
+      gradWeight = NPUNativeFunctions::empty_with_format(
           std::get<1>(outputSizes),
           weight.options().dtype(at::kFloat),
           ACL_FORMAT_NCHW);      
     } else {
-      gradWeight = at::empty_with_format(
+      gradWeight = NPUNativeFunctions::empty_with_format(
           std::get<1>(outputSizes),
           weight.options().dtype(at::kFloat),
           ACL_FORMAT_FRACTAL_Z);      
@@ -234,7 +234,7 @@ tuple<at::Tensor, at::Tensor, at::Tensor> NPUNativeFunctions::npu_conv2d_backwar
   }
 
   if (grad_input_mask[2]) {
-    gradBias = at::empty_with_format(
+    gradBias = NPUNativeFunctions::empty_with_format(
         std::get<2>(outputSizes), grad.options(), ACL_FORMAT_NCHW);
   }
 
diff --git a/torch_npu/csrc/aten/ops/convolution/Conv3dBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/convolution/Conv3dBackwardKernelNpu.cpp
index 48be4e0d61..f9ade9488c 100644
--- a/torch_npu/csrc/aten/ops/convolution/Conv3dBackwardKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/convolution/Conv3dBackwardKernelNpu.cpp
@@ -107,7 +107,7 @@ tuple<at::Tensor, at::Tensor, at::Tensor> NPUNativeFunctions::npu_conv3d_backwar
 
   if (grad_input_mask[0]) {
     // format should be NDC1HWC0
-    gradInput = at::empty_with_format(
+    gradInput = NPUNativeFunctions::empty_with_format(
         input.sizes(), input.options(), ACL_FORMAT_NDC1HWC0);
 
     conv3d_backward_inputmask(
@@ -116,7 +116,7 @@ tuple<at::Tensor, at::Tensor, at::Tensor> NPUNativeFunctions::npu_conv3d_backwar
 
   if (grad_input_mask[1]) {
     // format should be FRACTAL_Z_3D
-    gradWeight = at::empty_with_format(
+    gradWeight = NPUNativeFunctions::empty_with_format(
         weight.sizes(), weight.options().dtype(at::kFloat), ACL_FRACTAL_Z_3D);
 
     conv3d_backward_weightmask(
@@ -125,7 +125,7 @@ tuple<at::Tensor, at::Tensor, at::Tensor> NPUNativeFunctions::npu_conv3d_backwar
 
   if (grad_input_mask[2]) {
     // format should be NCHW, gradias.size = grad.size(1)
-    gradBias = at::empty_with_format(
+    gradBias = NPUNativeFunctions::empty_with_format(
         {grad.size(1)}, grad.options(), ACL_FORMAT_NCHW);
 
     conv3d_backward_biasmask(
diff --git a/torch_npu/csrc/aten/ops/convolution/ConvTranspose2dKernelNpu.cpp b/torch_npu/csrc/aten/ops/convolution/ConvTranspose2dKernelNpu.cpp
index c6e2f78378..8e60437207 100644
--- a/torch_npu/csrc/aten/ops/convolution/ConvTranspose2dKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/convolution/ConvTranspose2dKernelNpu.cpp
@@ -81,7 +81,7 @@ at::Tensor NPUNativeFunctions::npu_conv_transpose2d(
 
   // construct the output tensor of the NPU
   at::Tensor result =
-      at::empty_with_format(outputSize, input.options(), ACL_FORMAT_NC1HWC0);
+      NPUNativeFunctions::empty_with_format(outputSize, input.options(), ACL_FORMAT_NC1HWC0);
 
   // calculate the output result of the NPU
   conv_transpose2d_out_npu(
diff --git a/torch_npu/csrc/aten/ops/loss/NLLLossBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/loss/NLLLossBackwardKernelNpu.cpp
index 5e31d05824..e49f4ba62e 100644
--- a/torch_npu/csrc/aten/ops/loss/NLLLossBackwardKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/loss/NLLLossBackwardKernelNpu.cpp
@@ -91,7 +91,7 @@ at::Tensor NPUNativeFunctions::nll_loss_backward(
   auto outputSize = input_same_output_size(self);
 
   // construct the output tensor of the NPU
-  at::Tensor grad_input = at::empty_with_format(
+  at::Tensor grad_input = NPUNativeFunctions::empty_with_format(
       outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
 
   // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/loss/NLLLossKernelNpu.cpp b/torch_npu/csrc/aten/ops/loss/NLLLossKernelNpu.cpp
index cfda03265f..418b2b296a 100644
--- a/torch_npu/csrc/aten/ops/loss/NLLLossKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/loss/NLLLossKernelNpu.cpp
@@ -96,11 +96,11 @@ tuple<at::Tensor, at::Tensor> NPUNativeFunctions::nll_loss_forward(
       outputSize, totalWeightSize);
 
   // construct the output tensor of the NPU
-  at::Tensor result = at::empty_with_format(
+  at::Tensor result = NPUNativeFunctions::empty_with_format(
       std::get<0>(outputSizes),
       self.options(),
       CalcuOpUtil::get_tensor_npu_format(self));
-  at::Tensor total_weight = at::empty_with_format(
+  at::Tensor total_weight = NPUNativeFunctions::empty_with_format(
       std::get<1>(outputSizes),
       self.options(),
       CalcuOpUtil::get_tensor_npu_format(self));
diff --git a/torch_npu/csrc/distributed/reducer.cpp b/torch_npu/csrc/distributed/reducer.cpp
index b31bd008d0..5c2b3b4ab7 100644
--- a/torch_npu/csrc/distributed/reducer.cpp
+++ b/torch_npu/csrc/distributed/reducer.cpp
@@ -31,6 +31,7 @@
 #include <torch/csrc/utils/memory.h>
 
 #include "torch_npu/csrc/distributed/reducer.hpp"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
 
 namespace c10d_npu {
 namespace {
@@ -1072,9 +1073,9 @@ void Reducer::copy_bucket_to_grad(
       if (!grad.defined()) {
         // Creates grad according to the "Gradient Layout Contract"
         // (see torch/csrc/grad/AccumulateGrad.h)
-        grad = at::empty_with_format(variable.sizes(),
-                                     bucket_view.options(),
-                                     variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_);
+        grad = NPUNativeFunctions::empty_with_format(
+          variable.sizes(), bucket_view.options(),
+          variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_);
         grad.copy_memory_(bucket_view, true);
       } else {
         grad.copy_memory_(bucket_view, true);
diff --git a/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp b/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp
index 0854e27c09..9f918437bb 100644
--- a/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp
+++ b/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp
@@ -109,7 +109,7 @@ namespace at_npu
         const at::Tensor &src,
         const std::vector<string> &optimizations)
     {
-      auto self = at::empty_with_format(
+      auto self = NPUNativeFunctions::empty_with_format(
           src.sizes(),
           src.options(),
           src.storage().get_npu_desc().npu_format_);
diff --git a/torch_npu/csrc/framework/contiguous/combined_opt.cpp b/torch_npu/csrc/framework/contiguous/combined_opt.cpp
index cf270817f8..a0228c78ca 100644
--- a/torch_npu/csrc/framework/contiguous/combined_opt.cpp
+++ b/torch_npu/csrc/framework/contiguous/combined_opt.cpp
@@ -16,8 +16,10 @@
 #include <map>
 #include <ATen/quantized/QTensorImpl.h>
 #include <ATen/NamedTensorUtils.h>
+
 #include "torch_npu/csrc/framework/contiguous/ContiguousOpt.h"
 #include "torch_npu/csrc/framework/utils/KernelNpuOutputSize.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
 
 namespace at_npu
 {
@@ -498,7 +500,7 @@ namespace at_npu
           {
             // case 2: The first tensor is discontiguous-type,
             // conduct the standard optimization procedure.
-            auto contiguous_src = at::empty_with_format(
+            auto contiguous_src = NPUNativeFunctions::empty_with_format(
                 src.sizes(),
                 src.options(),
                 src.storage().get_npu_desc().npu_format_);
diff --git a/torch_npu/csrc/framework/utils/NpuUtils.cpp b/torch_npu/csrc/framework/utils/NpuUtils.cpp
index a0efe852f7..b1c3a10a77 100644
--- a/torch_npu/csrc/framework/utils/NpuUtils.cpp
+++ b/torch_npu/csrc/framework/utils/NpuUtils.cpp
@@ -16,7 +16,6 @@
 
 #include <mutex>
 #include <set>
-#include <c10/npu/register/OptionRegister.h>
 
 #include "torch_npu/csrc/framework/utils/NpuUtils.h"
 #include "torch_npu/csrc/framework/utils/CalcuOpUtil.h"
@@ -25,6 +24,7 @@
 #include "torch_npu/csrc/framework/utils/KernelNpuOutputSize.h"
 #include "torch_npu/csrc/framework/contiguous/ContiguousOpt.h"
 #include "torch_npu/csrc/framework/interface/EnvVariables.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
 
 namespace at_npu
 {
@@ -165,7 +165,7 @@ namespace at_npu
       // 3. get output size
       auto outputSize = index_select_npu_output_size(src_tmp, dim, index);
       int64_t npu_format = CalcuOpUtil::get_tensor_npu_format(src_tmp);
-      at::Tensor result = at::empty_with_format(outputSize, src_tmp.options(), npu_format);
+      at::Tensor result = NPUNativeFunctions::empty_with_format(outputSize, src_tmp.options(), npu_format);
       // std::cout << "npu_format: " << npu_format << std::endl;
 
       // 4. get input and output
@@ -208,7 +208,7 @@ namespace at_npu
     at::Tensor deal_with_5d_5d_match(const at::Tensor &src)
     {
       auto src_desc = src.storage().unsafeGetStorageImpl()->npu_desc_;
-      at::Tensor src_new = at::empty_with_format(src_desc.base_sizes_, src.options(), ACL_FORMAT_NC1HWC0);
+      at::Tensor src_new = NPUNativeFunctions::empty_with_format(src_desc.base_sizes_, src.options(), ACL_FORMAT_NC1HWC0);
       c10::npu::NPUStream copy_stream = c10::npu::getCurrentNPUStream();
       int64_t numel = src_new.numel();
       aclError error = aclrtMemcpyAsync(
diff --git a/torch_npu/csrc/framework/utils/OpPreparation.cpp b/torch_npu/csrc/framework/utils/OpPreparation.cpp
index bf70fb7277..be278c096f 100644
--- a/torch_npu/csrc/framework/utils/OpPreparation.cpp
+++ b/torch_npu/csrc/framework/utils/OpPreparation.cpp
@@ -17,6 +17,8 @@
 #include "torch_npu/csrc/framework/FormatHelper.h"
 #include "torch_npu/csrc/framework/InferFormat.h"
 #include "torch_npu/csrc/framework/utils/CalcuOpUtil.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+
 
 namespace at_npu
 {
@@ -234,13 +236,13 @@ namespace at_npu
     at::Tensor OpPreparation::ApplyTensorWithFormat(c10::IntArrayRef sizes, const c10::TensorOptions &options, int64_t format)
     {
       auto fixFormat = InferFormat::GuessStorageFormat(sizes, (aclFormat)format);
-      return at::empty_with_format(sizes, options, fixFormat);
+      return NPUNativeFunctions::empty_with_format(sizes, options, fixFormat);
     }
 
     at::Tensor OpPreparation::ApplyTensorWithSizes(c10::IntArrayRef sizes, const c10::TensorOptions &options)
     {
       auto format = InferFormat::GuessBaseFormat(sizes);
-      return at::empty_with_format(sizes, options, format);
+      return NPUNativeFunctions::empty_with_format(sizes, options, format);
     }
 
     void OpPreparation::CheckMemory(const std::initializer_list<at::Tensor> &inputs, const std::initializer_list<at::Tensor> &outputs)
-- 
Gitee


From 6c29a7d5c7eaa021e9c223c7dc3e75d50589fcf9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=B0=A4=E5=AE=89=E5=8D=87?= <youansheng@huawei.com>
Date: Sat, 12 Feb 2022 11:37:56 +0800
Subject: [PATCH 03/12] Update npu_format_cast.

---
 torch_npu/csrc/aten/common/CopyKernel.cpp            | 6 +++---
 torch_npu/csrc/aten/common/NpuFastReshape.cpp        | 2 +-
 torch_npu/csrc/aten/common/ResizeNpu.cpp             | 2 +-
 torch_npu/csrc/aten/ops/NormalKernelNpu.cpp          | 2 +-
 torch_npu/csrc/aten/ops/SoftmaxBackwardKernelNpu.cpp | 2 +-
 torch_npu/csrc/distributed/Init.cpp                  | 2 +-
 torch_npu/csrc/distributed/reducer.cpp               | 2 +-
 torch_npu/csrc/framework/utils/NpuUtils.cpp          | 2 +-
 torch_npu/csrc/framework/utils/OpPreparation.cpp     | 4 ++--
 torch_npu/testing/util_test.py                       | 5 +++--
 10 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/torch_npu/csrc/aten/common/CopyKernel.cpp b/torch_npu/csrc/aten/common/CopyKernel.cpp
index 5025e3accd..bb983a63d3 100644
--- a/torch_npu/csrc/aten/common/CopyKernel.cpp
+++ b/torch_npu/csrc/aten/common/CopyKernel.cpp
@@ -188,7 +188,7 @@ void copy_d2d_dtype_format(at::Tensor& self, const at::Tensor& src, bool non_blo
     at::Tensor src_4D = FormatCastHelper::ApplyBaseFormatTensorBy(src);
     at::Tensor dst_4D = FormatCastHelper::ApplyBaseFormatTensorBy(self);
     copy_d2d_dtype_baseformat(dst_4D, src_4D, non_blocking);
-    self.npu_format_cast_(dst_4D);
+    NPUNativeFunctions::npu_format_cast_(self, dst_4D);
     return;
   }
   copy_d2d_dtype_baseformat(self, src, non_blocking);
@@ -312,7 +312,7 @@ void copy_h2d(at::Tensor& self, const at::Tensor& src, bool non_blocking) {
   if (!FormatHelper::IsBaseFormatType(self)) {
     at::Tensor dst = OpPreparation::ApplyTensor(self);
     copy_h2d_baseformat(dst, src, non_blocking, true);
-    self.npu_format_cast_(dst);
+    NPUNativeFunctions::npu_format_cast_(self, dst);
     return;
   }
   copy_h2d_baseformat(self, src, non_blocking);
@@ -363,7 +363,7 @@ void copy_d2d_dtype(at::Tensor& self, const at::Tensor& src, bool non_blocking)
     }
     at::Tensor dst_4D = FormatCastHelper::ApplyBaseFormatTensorBy(self);
     copy_d2d_dtype_baseformat(dst_4D, src_4D, non_blocking);
-    self.npu_format_cast_(dst_4D);
+    NPUNativeFunctions::npu_format_cast_(self, dst_4D);
     return;
   }
   copy_d2d_dtype_format(self, src, non_blocking);
diff --git a/torch_npu/csrc/aten/common/NpuFastReshape.cpp b/torch_npu/csrc/aten/common/NpuFastReshape.cpp
index e81d4f0c29..9fc817bfe3 100644
--- a/torch_npu/csrc/aten/common/NpuFastReshape.cpp
+++ b/torch_npu/csrc/aten/common/NpuFastReshape.cpp
@@ -45,7 +45,7 @@ void npu_fast_reshape_(at::Tensor& tensor) {
   // refresh matadata to input tensor
   StorageDescHelper::ReflushDescBySelf(tensor);
   auto base_format = InferFormat::GuessBaseFormat(tensor.sizes());
-  tensor.npu_format_cast_(base_format);
+  NPUNativeFunctions::npu_format_cast_(tensor, base_format);
 }
 } // namespace native
 } // namespace at_npu
diff --git a/torch_npu/csrc/aten/common/ResizeNpu.cpp b/torch_npu/csrc/aten/common/ResizeNpu.cpp
index e05736bf2e..35faadb6f2 100644
--- a/torch_npu/csrc/aten/common/ResizeNpu.cpp
+++ b/torch_npu/csrc/aten/common/ResizeNpu.cpp
@@ -31,7 +31,7 @@ at::Tensor& NPUNativeFunctions::resize_(
   // because of resize _impl_npu_ only support at base format, so
   // no need to reflush NpuStorageDesc here.
   if (!FormatHelper::IsBaseFormatType(self)) {
-    self.npu_format_cast_(FormatHelper::GetBaseFormat(self));
+    NPUNativeFunctions::npu_format_cast_(self, FormatHelper::GetBaseFormat(self));
   }
   auto* self_ = self.unsafeGetTensorImpl();
   resize_impl_npu_(self_, size, /*strides=*/c10::nullopt);
diff --git a/torch_npu/csrc/aten/ops/NormalKernelNpu.cpp b/torch_npu/csrc/aten/ops/NormalKernelNpu.cpp
index 65063a67b8..e8bfa33690 100644
--- a/torch_npu/csrc/aten/ops/NormalKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/NormalKernelNpu.cpp
@@ -118,7 +118,7 @@ namespace at_npu
       TORCH_CHECK(std > 0.0, "normal_ expects std > 0.0, but found std=", std);
 
       // the op of PTNormalFloatFloat only support format of ND
-      at::Tensor formatCastOfResult = result.npu_format_cast(ACL_FORMAT_ND);
+      at::Tensor formatCastOfResult = NPUNativeFunctions::npu_format_cast(result, ACL_FORMAT_ND);
       if (formatCastOfResult.scalar_type() == at::ScalarType::Half)
       {
         formatCastOfResult = formatCastOfResult.to(at::ScalarType::Float);
diff --git a/torch_npu/csrc/aten/ops/SoftmaxBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/SoftmaxBackwardKernelNpu.cpp
index cdbd32be1d..968e2a6419 100644
--- a/torch_npu/csrc/aten/ops/SoftmaxBackwardKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/SoftmaxBackwardKernelNpu.cpp
@@ -55,7 +55,7 @@ namespace at_npu
       // output'format must be same with grad_output
       if (CalcuOpUtil::get_tensor_npu_format(output) != CalcuOpUtil::get_tensor_npu_format(grad_output))
       {
-        output.npu_format_cast_(CalcuOpUtil::get_tensor_npu_format(grad_output));
+        NPUNativeFunctions::npu_format_cast_(output, CalcuOpUtil::get_tensor_npu_format(grad_output));
       }
 
       // construct the output tensor of the NPU
diff --git a/torch_npu/csrc/distributed/Init.cpp b/torch_npu/csrc/distributed/Init.cpp
index 951ba59607..621ce5962a 100644
--- a/torch_npu/csrc/distributed/Init.cpp
+++ b/torch_npu/csrc/distributed/Init.cpp
@@ -48,7 +48,7 @@ class BroadcastWork {
 public:
   inline std::vector<at::Tensor> cast_tensors(at::TensorList tensors) {
     static auto cast_back_to_ori_format = [](const at::Tensor &t) { 
-      return t.npu_format_cast(t.storage().unsafeGetStorageImpl()->npu_desc_.origin_format_); 
+      return NPUNativeFunctions::npu_format_cast(t, t.storage().unsafeGetStorageImpl()->npu_desc_.origin_format_); 
       };
     return c10::fmap(tensors, cast_back_to_ori_format);
   }
diff --git a/torch_npu/csrc/distributed/reducer.cpp b/torch_npu/csrc/distributed/reducer.cpp
index 5c2b3b4ab7..f9fe523862 100644
--- a/torch_npu/csrc/distributed/reducer.cpp
+++ b/torch_npu/csrc/distributed/reducer.cpp
@@ -442,7 +442,7 @@ void Reducer::mark_variable_ready_dense(VariableIndex index) {
         // make sure grad has the same format as variable
         if (grad.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_ !=
               variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_) {
-          grad = grad.npu_format_cast(
+          grad = NPUNativeFunctions::npu_format_cast(grad,
               variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_);
         }
         this->copy_grad_to_bucket(grad, bucket_view);
diff --git a/torch_npu/csrc/framework/utils/NpuUtils.cpp b/torch_npu/csrc/framework/utils/NpuUtils.cpp
index b1c3a10a77..f10640a1ec 100644
--- a/torch_npu/csrc/framework/utils/NpuUtils.cpp
+++ b/torch_npu/csrc/framework/utils/NpuUtils.cpp
@@ -252,7 +252,7 @@ namespace at_npu
       // a temporary tensor, which always monopolizes its own storage.
       if (numelEq && (!FormatHelper::IsBaseFormatType(src)))
       {
-        at::Tensor tempTensor = at::npu_format_cast(src, FormatHelper::GetBaseFormat(src));
+        at::Tensor tempTensor = NPUNativeFunctions::npu_format_cast(src, FormatHelper::GetBaseFormat(src));
         auto &temp_desc =
             tempTensor.storage().unsafeGetStorageImpl()->npu_desc_;
         temp_desc.base_sizes_ = tempTensor.sizes();
diff --git a/torch_npu/csrc/framework/utils/OpPreparation.cpp b/torch_npu/csrc/framework/utils/OpPreparation.cpp
index be278c096f..1001fa0167 100644
--- a/torch_npu/csrc/framework/utils/OpPreparation.cpp
+++ b/torch_npu/csrc/framework/utils/OpPreparation.cpp
@@ -178,13 +178,13 @@ namespace at_npu
         if (output.scalar_type() == at::ScalarType::Float || output.scalar_type() == at::ScalarType::Half)
         {
           TORCH_CHECK(!is_read_write, "can not cast format when output is input");
-          output.npu_format_cast_(format);
+          NPUNativeFunctions::npu_format_cast_(output, format);
         }
         else
         {
           TORCH_CHECK(FormatHelper::IsBaseFormatType(output) && FormatHelper::IsBaseFormatType(static_cast<aclFormat>(format)),
                       "can not cast format to un-base format when output has bool dtype");
-          output.npu_format_cast_(format);
+          NPUNativeFunctions::npu_format_cast_(output, format);
         }
       }
     }
diff --git a/torch_npu/testing/util_test.py b/torch_npu/testing/util_test.py
index a460af4ae3..835814c30d 100644
--- a/torch_npu/testing/util_test.py
+++ b/torch_npu/testing/util_test.py
@@ -15,6 +15,7 @@
 # limitations under the License.
 
 import torch
+import torch_npu
 import numpy as np
 import os
 
@@ -42,7 +43,7 @@ def create_common_tensor(item, minValue, maxValue):
     cpu_input = torch.from_numpy(input1)
     npu_input = torch.from_numpy(input1).to(npu_device)
     if npu_format != -1:
-        npu_input = npu_input.npu_format_cast(npu_format)
+        npu_input = torch_npu.npu_format_cast(npu_input, npu_format)
     return cpu_input, npu_input
 
 
@@ -125,5 +126,5 @@ def create_dtype_tensor(shape, dtype, npu_format=-1, min_value=-5, max_value=5,
     cpu_input = torch.from_numpy(x)
     npu_input = torch.from_numpy(x).to(npu_device)
     if npu_format != -1 and (dtype in [torch.float, torch.half]):
-        npu_input = npu_input.npu_format_cast(npu_format)
+        npu_input = torch_npu.npu_format_cast(npu_input, npu_format)
     return cpu_input, npu_input
\ No newline at end of file
-- 
Gitee


From 88ca0d29ab0e2f0506507e56e0076a07e1c4e0d4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=B0=A4=E5=AE=89=E5=8D=87?= <youansheng@huawei.com>
Date: Sat, 12 Feb 2022 11:41:16 +0800
Subject: [PATCH 04/12] Update copy_memory_.

---
 torch_npu/csrc/aten/common/FormatCastHelper.cpp | 2 +-
 torch_npu/csrc/distributed/reducer.cpp          | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/torch_npu/csrc/aten/common/FormatCastHelper.cpp b/torch_npu/csrc/aten/common/FormatCastHelper.cpp
index aa98978e8a..2ee080d215 100644
--- a/torch_npu/csrc/aten/common/FormatCastHelper.cpp
+++ b/torch_npu/csrc/aten/common/FormatCastHelper.cpp
@@ -28,7 +28,7 @@ bool FormatCastHelper::IsSameGroupType(const at::Tensor& src, const at::Tensor&
 
 void FormatCastHelper::base_format_cast_nocheck(const at::Tensor& dst, const at::Tensor& src) {
   dst.set_(dst.storage(), src.storage_offset(), src.sizes(), src.strides());
-  dst.copy_memory_(src, true);
+  NPUNativeFunctions::copy_memory_(dst, src, true);
 }
 
 void FormatCastHelper::format_cast_as_base_format(const at::Tensor& src, aclFormat format) {
diff --git a/torch_npu/csrc/distributed/reducer.cpp b/torch_npu/csrc/distributed/reducer.cpp
index f9fe523862..4a29c1ce45 100644
--- a/torch_npu/csrc/distributed/reducer.cpp
+++ b/torch_npu/csrc/distributed/reducer.cpp
@@ -407,9 +407,9 @@ void Reducer::copy_grad_to_bucket(
   if (comm_hook_ == nullptr) {
     // imitates wrapped_scalar_tensor in ATen/native/BinaryOps.cpp
     // Divides while copying into the bucket view.
-    bucket_view.copy_memory_(grad.mul(float(1.) / divFactor_), true);
+    NPUNativeFunctions::copy_memory_(bucket_view, grad.mul(float(1.) / divFactor_), true);
   } else {
-    bucket_view.copy_memory_(grad, true);
+    NPUNativeFunctions::copy_memory_(bucket_view, grad, true);
   }
 }
 
@@ -1076,9 +1076,9 @@ void Reducer::copy_bucket_to_grad(
         grad = NPUNativeFunctions::empty_with_format(
           variable.sizes(), bucket_view.options(),
           variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_);
-        grad.copy_memory_(bucket_view, true);
+        NPUNativeFunctions::copy_memory_(grad, bucket_view, true);
       } else {
-        grad.copy_memory_(bucket_view, true);
+        NPUNativeFunctions::copy_memory_(grad, bucket_view, true);
       }
       // The grad is modified and needs to be written back.
       return true;
-- 
Gitee


From 4e1d35f0abc69814d3537647dc6b4285e7406df6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=B0=A4=E5=AE=89=E5=8D=87?= <youansheng@huawei.com>
Date: Sat, 12 Feb 2022 15:57:44 +0800
Subject: [PATCH 05/12] Replace empty_with_format with ApplyTensor.

---
 .../PyTorch Operator Development Guide.md           |  5 +++--
 ...\274\200\345\217\221\346\214\207\345\215\227.md" |  4 ++--
 torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp  |  4 ++--
 torch_npu/csrc/aten/common/TensorFactories.cpp      | 10 +++++-----
 torch_npu/csrc/aten/ops/AddKernelNpu.cpp            |  6 +++---
 torch_npu/csrc/aten/ops/AnyKernelNpu.cpp            |  4 ++--
 torch_npu/csrc/aten/ops/BmmKernelNpu.cpp            |  4 ++--
 torch_npu/csrc/aten/ops/BroadcastKernelNpu.cpp      |  2 +-
 torch_npu/csrc/aten/ops/DivKernelNpu.cpp            |  4 ++--
 torch_npu/csrc/aten/ops/EqKernelNpu.cpp             |  8 ++++----
 torch_npu/csrc/aten/ops/GtKernelNpu.cpp             |  8 ++++----
 torch_npu/csrc/aten/ops/LtKernelNpu.cpp             |  8 ++++----
 torch_npu/csrc/aten/ops/MeanKernelNpu.cpp           |  2 +-
 torch_npu/csrc/aten/ops/MmKernelNpu.cpp             |  4 ++--
 torch_npu/csrc/aten/ops/MulKernelNpu.cpp            |  4 ++--
 torch_npu/csrc/aten/ops/NegKernelNpu.cpp            |  2 +-
 torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp       |  9 +++------
 torch_npu/csrc/aten/ops/ReluKernelNpu.cpp           |  2 +-
 torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp      |  2 +-
 torch_npu/csrc/aten/ops/SubKernelNpu.cpp            |  4 ++--
 torch_npu/csrc/aten/ops/SumKernelNpu.cpp            |  2 +-
 .../csrc/aten/ops/ThresholdBackwardKernelNpu.cpp    |  2 +-
 torch_npu/csrc/aten/ops/TopKKernelNpu.cpp           |  8 ++++----
 torch_npu/csrc/aten/ops/TransposeKernelNpu.cpp      |  2 +-
 .../ops/convolution/Conv2dBackwardKernelNpu.cpp     |  8 ++++----
 .../ops/convolution/Conv3dBackwardKernelNpu.cpp     |  6 +++---
 .../ops/convolution/ConvTranspose2dKernelNpu.cpp    |  2 +-
 .../csrc/aten/ops/loss/NLLLossBackwardKernelNpu.cpp |  2 +-
 torch_npu/csrc/aten/ops/loss/NLLLossKernelNpu.cpp   |  4 ++--
 torch_npu/csrc/distributed/Init.cpp                 |  3 ++-
 torch_npu/csrc/distributed/reducer.cpp              | 13 +++++++------
 .../csrc/framework/contiguous/ContiguousOpt.cpp     |  3 ++-
 .../csrc/framework/contiguous/combined_opt.cpp      |  6 +++---
 torch_npu/csrc/framework/utils/NpuUtils.cpp         |  5 +++--
 torch_npu/csrc/framework/utils/OpPreparation.cpp    |  8 ++++++--
 35 files changed, 88 insertions(+), 82 deletions(-)

diff --git a/docs/en/PyTorch Operator Development Guide/PyTorch Operator Development Guide.md b/docs/en/PyTorch Operator Development Guide/PyTorch Operator Development Guide.md
index 698b2ac1c7..9ca63313af 100644
--- a/docs/en/PyTorch Operator Development Guide/PyTorch Operator Development Guide.md	
+++ b/docs/en/PyTorch Operator Development Guide/PyTorch Operator Development Guide.md	
@@ -524,7 +524,8 @@ The following uses the torch.add\(\) operator as an example to describe how to a
           auto outputSize = broadcast_ops_npu_output_size(self, other);
         
           // construct the output tensor of the NPU
-          Tensor result = NPUNativeFunctions::empty_with_format(
+          // allocate through OpPreparation instead of calling empty_with_format directly
+          Tensor result = OpPreparation::ApplyTensorWithFormat(
               outputSize,
               outputTensor.options(),
               CalcuOpUtil::get_tensor_npu_format(outputTensor));
@@ -541,7 +542,7 @@ The following uses the torch.add\(\) operator as an example to describe how to a
           // calculate the output size
           auto outputSize = input_same_output_size(self);
           // construct the output tensor of the NPU
-          Tensor result = NPUNativeFunctions::empty_with_format(
+          Tensor result = OpPreparation::ApplyTensorWithFormat(
               outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
         
           // calculate the output result of the NPU
diff --git "a/docs/zh/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227.md" "b/docs/zh/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227.md"
index d38aa6efd3..e2f6a2c9fa 100644
--- "a/docs/zh/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227.md"
+++ "b/docs/zh/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227.md"
@@ -524,7 +524,7 @@ PyTorch算子开发包含TBE算子开发和PyTorch框架下的算子适配。
           auto outputSize = broadcast_ops_npu_output_size(self, other);
         
           // construct the output tensor of the NPU
-          Tensor result = NPUNativeFunctions::empty_with_format(
+          Tensor result = OpPreparation::ApplyTensorWithFormat(
               outputSize,
               outputTensor.options(),
               CalcuOpUtil::get_tensor_npu_format(outputTensor));
@@ -541,7 +541,7 @@ PyTorch算子开发包含TBE算子开发和PyTorch框架下的算子适配。
           // calculate the output size
           auto outputSize = input_same_output_size(self);
           // construct the output tensor of the NPU
-          Tensor result = NPUNativeFunctions::empty_with_format(
+          Tensor result = OpPreparation::ApplyTensorWithFormat(
               outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
         
           // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp b/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp
index 506686f3c6..c518156b0a 100644
--- a/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp
+++ b/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp
@@ -76,7 +76,7 @@ at::Tensor NPUNativeFunctions::npu_format_cast(
   TORCH_CHECK(src.scalar_type() == at::ScalarType::Float || src.scalar_type() == at::ScalarType::Half,
       "can not cast format when src is not float32 or float16");
 
-  at::Tensor dst = NPUNativeFunctions::empty_with_format(
+  at::Tensor dst = OpPreparation::ApplyTensorWithFormat(
       src_desc.base_sizes_, src.options(), acl_format);
 
   // calculate the output result of the NPU
@@ -105,7 +105,7 @@ at::Tensor& NPUNativeFunctions::npu_format_cast_(
   TORCH_CHECK(src.scalar_type() == at::ScalarType::Float || src.scalar_type() == at::ScalarType::Half,
       "can not cast format when src is not float32 or float16");
 
-  at::Tensor dst = NPUNativeFunctions::empty_with_format(
+  at::Tensor dst = OpPreparation::ApplyTensorWithFormat(
       src_desc.base_sizes_, src.options(), acl_format);
 
   // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/common/TensorFactories.cpp b/torch_npu/csrc/aten/common/TensorFactories.cpp
index fdb4634709..ac32cb49bb 100644
--- a/torch_npu/csrc/aten/common/TensorFactories.cpp
+++ b/torch_npu/csrc/aten/common/TensorFactories.cpp
@@ -231,7 +231,7 @@ namespace at_npu
         {
           auto npu_format =
               self.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_;
-          result = NPUNativeFunctions::empty_with_format(self.sizes(), self.options(), npu_format);
+          result = OpPreparation::ApplyTensorWithFormat(self.sizes(), self.options(), npu_format);
         }
       }
 
@@ -348,7 +348,7 @@ namespace at_npu
       options.layout(layout_opt);
       options.pinned_memory(pin_memory_opt);
       at::Tensor result =
-          NPUNativeFunctions::empty_with_format(size, options, dst_format);
+          OpPreparation::ApplyTensorWithFormat(size, options, dst_format);
       if (names.has_value())
       {
         internal_set_names_inplace(result, names);
@@ -362,7 +362,7 @@ namespace at_npu
                                      int64_t dst_format)
     {
       at::Tensor result =
-          NPUNativeFunctions::empty_with_format(size, options, dst_format);
+          OpPreparation::ApplyTensorWithFormat(size, options, dst_format);
       if (names.has_value())
       {
         internal_set_names_inplace(result, names);
@@ -377,7 +377,7 @@ namespace at_npu
                                           int64_t dst_format)
     {
       at::Tensor result =
-          NPUNativeFunctions::empty_with_format(size, options, dst_format);
+          OpPreparation::ApplyTensorWithFormat(size, options, dst_format);
       if (names.has_value())
       {
         internal_set_names_inplace(result, names);
@@ -628,7 +628,7 @@ namespace at_npu
       AT_ASSERT(result.is_contiguous());
       AT_DISPATCH_ALL_TYPES_AND_COMPLEX(result.scalar_type(), "tensor_npu", [&]
                                         { std::copy(
-                                            values.begin(), values.end(), result.template data_ptr<scalar_t>()); });
+                                              values.begin(), values.end(), result.template data_ptr<scalar_t>()); });
       return result;
     }
 
diff --git a/torch_npu/csrc/aten/ops/AddKernelNpu.cpp b/torch_npu/csrc/aten/ops/AddKernelNpu.cpp
index 08793a06e7..4d4fa126fb 100644
--- a/torch_npu/csrc/aten/ops/AddKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/AddKernelNpu.cpp
@@ -170,7 +170,7 @@ namespace at_npu
       else
       {
         c10::NPUStorageDesc src_desc = src.storage().unsafeGetStorageImpl()->npu_desc_;
-        at::Tensor src_new = NPUNativeFunctions::empty_with_format(
+        at::Tensor src_new = OpPreparation::ApplyTensorWithFormat(
             src_desc.base_sizes_, src.options(), ACL_FORMAT_NC1HWC0);
         src_new.set_(
             src.storage(),
@@ -206,7 +206,7 @@ namespace at_npu
       auto outputSize = broadcast_ops_npu_output_size(self, other);
 
       // construct the output tensor of the NPU
-      at::Tensor result = NPUNativeFunctions::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           outputSize,
           outputTensor.options(),
           CalcuOpUtil::get_tensor_npu_format(outputTensor));
@@ -223,7 +223,7 @@ namespace at_npu
       // calculate the output size
       auto outputSize = input_same_output_size(self);
       // construct the output tensor of the NPU
-      at::Tensor result = NPUNativeFunctions::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
 
       // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/AnyKernelNpu.cpp b/torch_npu/csrc/aten/ops/AnyKernelNpu.cpp
index 63b1271f93..c7a287bdab 100644
--- a/torch_npu/csrc/aten/ops/AnyKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/AnyKernelNpu.cpp
@@ -70,7 +70,7 @@ at::Tensor NPUNativeFunctions::any(const at::Tensor& self, int64_t dim, bool kee
   auto outputSize = reduce_ops_npu_output_size(self, dims, keepdim);
 
   // construct the output tensor of the NPU
-  at::Tensor result = NPUNativeFunctions::empty_with_format(
+  at::Tensor result = OpPreparation::ApplyTensorWithFormat(
       outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
 
   // calculate the output result of the NPU  
@@ -88,7 +88,7 @@ at::Tensor NPUNativeFunctions::any(const at::Tensor& self) {
   // when self's dim = 0, convert [1] tensor and reduce it
   if (self.dim() == 0) {
       at::Tensor self_tmp = self;
-      self_tmp = NPUNativeFunctions::empty_with_format(
+      self_tmp = OpPreparation::ApplyTensorWithFormat(
           {1}, 
           self.options().dtype(at::ScalarType::Float), 
           CalcuOpUtil::get_tensor_npu_format(self))
diff --git a/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp b/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp
index c93fbe3cc8..d1219aec6a 100644
--- a/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp
@@ -73,9 +73,9 @@ at::Tensor NPUNativeFunctions::bmm(const at::Tensor& self, const at::Tensor& mat
   // 检查是否指定mm输出为NCHW。待NLP模型总体策略制定后删去
   if ((self.scalar_type() == at::ScalarType::Float || self.scalar_type() == at::ScalarType::Half) &&
       !c10::npu::OptionsManager::CheckSwitchMMOutputEnable()) {
-    result = NPUNativeFunctions::empty_with_format(outputSize, self.options(), ACL_FORMAT_FRACTAL_NZ);
+    result = OpPreparation::ApplyTensorWithFormat(outputSize, self.options(), ACL_FORMAT_FRACTAL_NZ);
   } else {
-    result = NPUNativeFunctions::empty_with_format(outputSize, self.options(), ACL_FORMAT_ND);
+    result = OpPreparation::ApplyTensorWithFormat(outputSize, self.options(), ACL_FORMAT_ND);
   }
 
   // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/BroadcastKernelNpu.cpp b/torch_npu/csrc/aten/ops/BroadcastKernelNpu.cpp
index 5903959bab..f63a18da30 100644
--- a/torch_npu/csrc/aten/ops/BroadcastKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/BroadcastKernelNpu.cpp
@@ -53,7 +53,7 @@ namespace at_npu
         input = input.to(at::kInt);
       }
 
-      at::Tensor result = NPUNativeFunctions::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           size,
           input.options(),
           CalcuOpUtil::get_tensor_npu_format(self));
diff --git a/torch_npu/csrc/aten/ops/DivKernelNpu.cpp b/torch_npu/csrc/aten/ops/DivKernelNpu.cpp
index 76865efccb..b7ca932113 100644
--- a/torch_npu/csrc/aten/ops/DivKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/DivKernelNpu.cpp
@@ -85,7 +85,7 @@ namespace at_npu
       auto outputSize = broadcast_ops_npu_output_size(self, other);
 
       // construct the output tensor of the NPU
-      at::Tensor result = NPUNativeFunctions::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           outputSize,
           outputTensor.options(),
           CalcuOpUtil::get_tensor_npu_format(outputTensor));
@@ -102,7 +102,7 @@ namespace at_npu
       auto outputSize = input_same_output_size(self);
 
       // construct the output tensor of the NPU
-      at::Tensor result = NPUNativeFunctions::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           outputSize,
           self.options(),
           CalcuOpUtil::get_tensor_npu_format(self));
diff --git a/torch_npu/csrc/aten/ops/EqKernelNpu.cpp b/torch_npu/csrc/aten/ops/EqKernelNpu.cpp
index 90b74193dc..fa27bc376a 100644
--- a/torch_npu/csrc/aten/ops/EqKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/EqKernelNpu.cpp
@@ -95,7 +95,7 @@ namespace at_npu
       auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther);
 
       // construct the output tensor of the NPU
-      at::Tensor result = NPUNativeFunctions::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           outputSize,
           formatCastOfSelf.options().dtype(at::kBool),
           ACL_FORMAT_ND);
@@ -113,7 +113,7 @@ namespace at_npu
       auto outputSize = input_same_output_size(formatCastOfSelf);
 
       // construct the output tensor of the NPU
-      at::Tensor result = NPUNativeFunctions::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           outputSize,
           formatCastOfSelf.options().dtype(at::kBool),
           ACL_FORMAT_ND);
@@ -131,7 +131,7 @@ namespace at_npu
       c10::SmallVector<at::Tensor, N> outputs = {self};
       CalcuOpUtil::check_memory_over_laps(inputs, outputs);
 
-      at::Tensor result = NPUNativeFunctions::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           self.sizes(),
           self.options().dtype(c10::ScalarType::Byte),
           CalcuOpUtil::get_tensor_npu_format(self));
@@ -159,7 +159,7 @@ namespace at_npu
       c10::SmallVector<at::Tensor, N> outputs = {self};
       CalcuOpUtil::check_memory_over_laps(inputs, outputs);
 
-      at::Tensor result = NPUNativeFunctions::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           self.sizes(),
           self.options().dtype(c10::ScalarType::Byte),
           CalcuOpUtil::get_tensor_npu_format(self));
diff --git a/torch_npu/csrc/aten/ops/GtKernelNpu.cpp b/torch_npu/csrc/aten/ops/GtKernelNpu.cpp
index d27161e162..4b453091b6 100644
--- a/torch_npu/csrc/aten/ops/GtKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/GtKernelNpu.cpp
@@ -104,7 +104,7 @@ namespace at_npu
       auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther);
 
       // construct the output tensor of the NPU
-      at::Tensor result = NPUNativeFunctions::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           outputSize,
           formatCastOfSelf.options().dtype(at::kBool),
           ACL_FORMAT_ND);
@@ -121,7 +121,7 @@ namespace at_npu
       auto outputSize = input_same_output_size(formatCastOfSelf);
 
       // construct the output tensor of the NPU
-      at::Tensor result = NPUNativeFunctions::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           outputSize,
           formatCastOfSelf.options().dtype(at::kBool),
           ACL_FORMAT_ND);
@@ -139,7 +139,7 @@ namespace at_npu
       c10::SmallVector<at::Tensor, N> outputs = {self};
       CalcuOpUtil::check_memory_over_laps(inputs, outputs);
 
-      at::Tensor result = NPUNativeFunctions::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           self.sizes(),
           self.options().dtype(at::ScalarType::Byte),
           CalcuOpUtil::get_tensor_npu_format(self));
@@ -167,7 +167,7 @@ namespace at_npu
       c10::SmallVector<at::Tensor, N> outputs = {self};
       CalcuOpUtil::check_memory_over_laps(inputs, outputs);
 
-      at::Tensor result = NPUNativeFunctions::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           self.sizes(),
           self.options().dtype(at::ScalarType::Byte),
           CalcuOpUtil::get_tensor_npu_format(self));
diff --git a/torch_npu/csrc/aten/ops/LtKernelNpu.cpp b/torch_npu/csrc/aten/ops/LtKernelNpu.cpp
index 7ccd7c64a9..c7733a3ff5 100644
--- a/torch_npu/csrc/aten/ops/LtKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/LtKernelNpu.cpp
@@ -100,7 +100,7 @@ namespace at_npu
       auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther);
 
       // construct the output tensor of the NPU
-      at::Tensor result = NPUNativeFunctions::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensor(
           outputSize,
           formatCastOfSelf.options().dtype(at::kBool));
 
@@ -116,7 +116,7 @@ namespace at_npu
       auto outputSize = input_same_output_size(formatCastOfSelf);
 
       // construct the output tensor of the NPU
-      at::Tensor result = NPUNativeFunctions::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensor(
           outputSize,
           formatCastOfSelf.options().dtype(at::kBool));
 
@@ -133,7 +133,7 @@ namespace at_npu
       c10::SmallVector<at::Tensor, N> outputs = {self};
       CalcuOpUtil::check_memory_over_laps(inputs, outputs);
 
-      at::Tensor result = NPUNativeFunctions::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           self.sizes(),
           self.options().dtype(at::ScalarType::Byte),
           CalcuOpUtil::get_tensor_npu_format(self));
@@ -161,7 +161,7 @@ namespace at_npu
       c10::SmallVector<at::Tensor, N> outputs = {self};
       CalcuOpUtil::check_memory_over_laps(inputs, outputs);
 
-      at::Tensor result = NPUNativeFunctions::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           self.sizes(),
           self.options().dtype(at::ScalarType::Byte),
           CalcuOpUtil::get_tensor_npu_format(self));
diff --git a/torch_npu/csrc/aten/ops/MeanKernelNpu.cpp b/torch_npu/csrc/aten/ops/MeanKernelNpu.cpp
index 54a5768348..8ca851a36d 100644
--- a/torch_npu/csrc/aten/ops/MeanKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/MeanKernelNpu.cpp
@@ -142,7 +142,7 @@ namespace at_npu
       }
 
       // construct the output tensor of the NPU
-      at::Tensor result = NPUNativeFunctions::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           outputSize, self.options().dtype(dstType), npu_format);
 
       // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/MmKernelNpu.cpp b/torch_npu/csrc/aten/ops/MmKernelNpu.cpp
index 19aa9ccd83..0dafb37712 100644
--- a/torch_npu/csrc/aten/ops/MmKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/MmKernelNpu.cpp
@@ -183,12 +183,12 @@ Return:
 
       if ((self.scalar_type() == at::ScalarType::Half) && !c10::npu::OptionsManager::CheckSwitchMMOutputEnable())
       {
-        result = NPUNativeFunctions::empty_with_format(
+        result = OpPreparation::ApplyTensorWithFormat(
             outputSize, self.options(), ACL_FORMAT_FRACTAL_NZ);
       }
       else
       {
-        result = NPUNativeFunctions::empty_with_format(outputSize, self.options());
+        result = OpPreparation::ApplyTensor(outputSize, self.options());
       }
 
       // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/MulKernelNpu.cpp b/torch_npu/csrc/aten/ops/MulKernelNpu.cpp
index c945f876a4..e6428df775 100644
--- a/torch_npu/csrc/aten/ops/MulKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/MulKernelNpu.cpp
@@ -114,7 +114,7 @@ namespace at_npu
       auto outputSize = broadcast_ops_npu_output_size(selfCast, otherCast);
 
       // construct the output tensor of the NPU
-      at::Tensor result = NPUNativeFunctions::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           outputSize,
           outputTensor.options(),
           CalcuOpUtil::get_tensor_npu_format(outputTensor));
@@ -136,7 +136,7 @@ namespace at_npu
       auto outputSize = input_same_output_size(self);
 
       // construct the output tensor of the NPU
-      at::Tensor result = NPUNativeFunctions::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
 
       // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/NegKernelNpu.cpp b/torch_npu/csrc/aten/ops/NegKernelNpu.cpp
index 204a62f337..a3ae15a640 100644
--- a/torch_npu/csrc/aten/ops/NegKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/NegKernelNpu.cpp
@@ -50,7 +50,7 @@ namespace at_npu
     at::Tensor NPUNativeFunctions::neg(const at::Tensor &self)
     {
       // construct the output tensor of the NPU
-      at::Tensor result = NPUNativeFunctions::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           self.sizes(), self.options(), CalcuOpUtil::get_tensor_npu_format(self));
 
       // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp b/torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp
index a4edb6cad0..8343864e43 100644
--- a/torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp
@@ -45,12 +45,9 @@ namespace at_npu
 
       auto outputSize = input_same_output_size(self);
       // construct the output tensor of the NPU
-      at::Tensor result = NPUNativeFunctions::empty_with_format(outputSize,
-                                                dtype_opt,
-                                                layout_opt,
-                                                device_opt,
-                                                pin_memory_opt,
-                                                CalcuOpUtil::get_tensor_npu_format(self));
+      at::Tensor result = NPUNativeFunctions::empty_with_format(
+        outputSize, dtype_opt, layout_opt, device_opt, pin_memory_opt,
+        CalcuOpUtil::get_tensor_npu_format(self));
       // calculate the output result of the NPUc
       return NPUNativeFunctions::one_(result);
     }
diff --git a/torch_npu/csrc/aten/ops/ReluKernelNpu.cpp b/torch_npu/csrc/aten/ops/ReluKernelNpu.cpp
index 410f091e60..adf42ed6b9 100644
--- a/torch_npu/csrc/aten/ops/ReluKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/ReluKernelNpu.cpp
@@ -65,7 +65,7 @@ namespace at_npu
       auto outputSize = input_same_output_size(self);
 
       // construct the output tensor of the NPU
-      at::Tensor result = NPUNativeFunctions::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
 
       // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp b/torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp
index ca6f03c9c7..9256fcc6c9 100644
--- a/torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp
@@ -119,7 +119,7 @@ namespace at_npu
       outputSize[1] = c1_len.toInt() * 16;
 
       // construct the output at::Tensor of the NPU
-      at::Tensor result = NPUNativeFunctions::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
 
       // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/SubKernelNpu.cpp b/torch_npu/csrc/aten/ops/SubKernelNpu.cpp
index 914664cd96..33ac4018ee 100644
--- a/torch_npu/csrc/aten/ops/SubKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/SubKernelNpu.cpp
@@ -103,7 +103,7 @@ namespace at_npu
       auto outputSize = broadcast_ops_npu_output_size(self, other);
 
       // construct the output tensor of the NPU
-      at::Tensor result = NPUNativeFunctions::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           outputSize,
           outputTensor.options(),
           CalcuOpUtil::get_tensor_npu_format(outputTensor));
@@ -120,7 +120,7 @@ namespace at_npu
       auto outputSize = input_same_output_size(self);
 
       // construct the output tensor of the NPU
-      at::Tensor result = NPUNativeFunctions::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
 
       // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/SumKernelNpu.cpp b/torch_npu/csrc/aten/ops/SumKernelNpu.cpp
index 92012d2da5..23819bc687 100644
--- a/torch_npu/csrc/aten/ops/SumKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/SumKernelNpu.cpp
@@ -197,7 +197,7 @@ namespace at_npu
       }
 
       // construct the output tensor of the NPU
-      at::Tensor result = NPUNativeFunctions::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           outputSize, self.options().dtype(dstType), npu_format);
 
       // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp
index 3a4bdabee3..0bc0118a9c 100644
--- a/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp
@@ -62,7 +62,7 @@ namespace at_npu
       auto outputSize = input_same_output_size(self);
 
       // construct the output tensor of the NPU
-      at::Tensor result = NPUNativeFunctions::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
 
       // use 5HD in Relu
diff --git a/torch_npu/csrc/aten/ops/TopKKernelNpu.cpp b/torch_npu/csrc/aten/ops/TopKKernelNpu.cpp
index 3f34261d8d..9a4bd8fb5a 100644
--- a/torch_npu/csrc/aten/ops/TopKKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/TopKKernelNpu.cpp
@@ -219,11 +219,11 @@ namespace at_npu
         // construct the output tensor of the NPU
         at::Tensor transposeSelf = NPUNativeFunctions::npu_transpose(self, perm);
         auto outputSize = transpose_npu_output_size(values, perm);
-        at::Tensor transposeValue = NPUNativeFunctions::empty_with_format(
+        at::Tensor transposeValue = OpPreparation::ApplyTensorWithFormat(
             outputSize,
             values.options(),
             CalcuOpUtil::get_tensor_npu_format(values));
-        at::Tensor transposeIndices = NPUNativeFunctions::empty_with_format(
+        at::Tensor transposeIndices = OpPreparation::ApplyTensorWithFormat(
             outputSize,
             indices.options(),
             CalcuOpUtil::get_tensor_npu_format(indices));
@@ -290,9 +290,9 @@ namespace at_npu
       // calculate the output size
       auto outputSize = topk_npu_output_size(selfCp, k, dim, largest, sorted);
       // construct the output tensor of the NPU
-      at::Tensor values = NPUNativeFunctions::empty_with_format(
+      at::Tensor values = OpPreparation::ApplyTensorWithFormat(
           outputSize, selfCp.options(), CalcuOpUtil::get_tensor_npu_format(selfCp));
-      at::Tensor indices = NPUNativeFunctions::empty_with_format(
+      at::Tensor indices = OpPreparation::ApplyTensorWithFormat(
           outputSize, selfCp.options().dtype(at::kInt), ACL_FORMAT_ND);
 
       // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/TransposeKernelNpu.cpp b/torch_npu/csrc/aten/ops/TransposeKernelNpu.cpp
index 085c49f0fe..40631abc0b 100644
--- a/torch_npu/csrc/aten/ops/TransposeKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/TransposeKernelNpu.cpp
@@ -76,7 +76,7 @@ namespace at_npu
     {
       RECORD_FUNCTION("transpose_to_contiguous", vector<c10::IValue>({self}));
       int64_t self_format = CalcuOpUtil::get_tensor_npu_format(self);
-      at::Tensor result = NPUNativeFunctions::empty_with_format(self.sizes(), self.options(), self_format);
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(self.sizes(), self.options(), self_format);
 
       // obtain the transpose axises
       at::IntArrayRef dim;
diff --git a/torch_npu/csrc/aten/ops/convolution/Conv2dBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/convolution/Conv2dBackwardKernelNpu.cpp
index f6eb56eca5..b93bf1710d 100644
--- a/torch_npu/csrc/aten/ops/convolution/Conv2dBackwardKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/convolution/Conv2dBackwardKernelNpu.cpp
@@ -213,7 +213,7 @@ tuple<at::Tensor, at::Tensor, at::Tensor> NPUNativeFunctions::npu_conv2d_backwar
   at::Tensor gradBias;
   // construct the output tensor of the NPU
   if (grad_input_mask[0]) {
-    gradInput = NPUNativeFunctions::empty_with_format(
+    gradInput = OpPreparation::ApplyTensorWithFormat(
         std::get<0>(outputSizes), input.options(), ACL_FORMAT_NC1HWC0);
   }
 
@@ -221,12 +221,12 @@ tuple<at::Tensor, at::Tensor, at::Tensor> NPUNativeFunctions::npu_conv2d_backwar
     // For group conv2d: keep consistent with weight to avoid allreduce accuracy problem.
     // For more info: https://gitee.com/ascend/pytorch-develop/pulls/2255
     if (groups > 1) {
-      gradWeight = NPUNativeFunctions::empty_with_format(
+      gradWeight = OpPreparation::ApplyTensorWithFormat(
           std::get<1>(outputSizes),
           weight.options().dtype(at::kFloat),
           ACL_FORMAT_NCHW);      
     } else {
-      gradWeight = NPUNativeFunctions::empty_with_format(
+      gradWeight = OpPreparation::ApplyTensorWithFormat(
           std::get<1>(outputSizes),
           weight.options().dtype(at::kFloat),
           ACL_FORMAT_FRACTAL_Z);      
@@ -234,7 +234,7 @@ tuple<at::Tensor, at::Tensor, at::Tensor> NPUNativeFunctions::npu_conv2d_backwar
   }
 
   if (grad_input_mask[2]) {
-    gradBias = NPUNativeFunctions::empty_with_format(
+    gradBias = OpPreparation::ApplyTensorWithFormat(
         std::get<2>(outputSizes), grad.options(), ACL_FORMAT_NCHW);
   }
 
diff --git a/torch_npu/csrc/aten/ops/convolution/Conv3dBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/convolution/Conv3dBackwardKernelNpu.cpp
index f9ade9488c..b38ef864a0 100644
--- a/torch_npu/csrc/aten/ops/convolution/Conv3dBackwardKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/convolution/Conv3dBackwardKernelNpu.cpp
@@ -107,7 +107,7 @@ tuple<at::Tensor, at::Tensor, at::Tensor> NPUNativeFunctions::npu_conv3d_backwar
 
   if (grad_input_mask[0]) {
     // format should be NDC1HWC0
-    gradInput = NPUNativeFunctions::empty_with_format(
+    gradInput = OpPreparation::ApplyTensorWithFormat(
         input.sizes(), input.options(), ACL_FORMAT_NDC1HWC0);
 
     conv3d_backward_inputmask(
@@ -116,7 +116,7 @@ tuple<at::Tensor, at::Tensor, at::Tensor> NPUNativeFunctions::npu_conv3d_backwar
 
   if (grad_input_mask[1]) {
     // format should be FRACTAL_Z_3D
-    gradWeight = NPUNativeFunctions::empty_with_format(
+    gradWeight = OpPreparation::ApplyTensorWithFormat(
         weight.sizes(), weight.options().dtype(at::kFloat), ACL_FRACTAL_Z_3D);
 
     conv3d_backward_weightmask(
@@ -125,7 +125,7 @@ tuple<at::Tensor, at::Tensor, at::Tensor> NPUNativeFunctions::npu_conv3d_backwar
 
   if (grad_input_mask[2]) {
     // format should be NCHW, gradias.size = grad.size(1)
-    gradBias = NPUNativeFunctions::empty_with_format(
+    gradBias = OpPreparation::ApplyTensorWithFormat(
         {grad.size(1)}, grad.options(), ACL_FORMAT_NCHW);
 
     conv3d_backward_biasmask(
diff --git a/torch_npu/csrc/aten/ops/convolution/ConvTranspose2dKernelNpu.cpp b/torch_npu/csrc/aten/ops/convolution/ConvTranspose2dKernelNpu.cpp
index 8e60437207..a0a32368fb 100644
--- a/torch_npu/csrc/aten/ops/convolution/ConvTranspose2dKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/convolution/ConvTranspose2dKernelNpu.cpp
@@ -81,7 +81,7 @@ at::Tensor NPUNativeFunctions::npu_conv_transpose2d(
 
   // construct the output tensor of the NPU
   at::Tensor result =
-      NPUNativeFunctions::empty_with_format(outputSize, input.options(), ACL_FORMAT_NC1HWC0);
+      OpPreparation::ApplyTensorWithFormat(outputSize, input.options(), ACL_FORMAT_NC1HWC0);
 
   // calculate the output result of the NPU
   conv_transpose2d_out_npu(
diff --git a/torch_npu/csrc/aten/ops/loss/NLLLossBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/loss/NLLLossBackwardKernelNpu.cpp
index e49f4ba62e..bf6da61f93 100644
--- a/torch_npu/csrc/aten/ops/loss/NLLLossBackwardKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/loss/NLLLossBackwardKernelNpu.cpp
@@ -91,7 +91,7 @@ at::Tensor NPUNativeFunctions::nll_loss_backward(
   auto outputSize = input_same_output_size(self);
 
   // construct the output tensor of the NPU
-  at::Tensor grad_input = NPUNativeFunctions::empty_with_format(
+  at::Tensor grad_input = OpPreparation::ApplyTensorWithFormat(
       outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
 
   // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/loss/NLLLossKernelNpu.cpp b/torch_npu/csrc/aten/ops/loss/NLLLossKernelNpu.cpp
index 418b2b296a..f274745813 100644
--- a/torch_npu/csrc/aten/ops/loss/NLLLossKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/loss/NLLLossKernelNpu.cpp
@@ -96,11 +96,11 @@ tuple<at::Tensor, at::Tensor> NPUNativeFunctions::nll_loss_forward(
       outputSize, totalWeightSize);
 
   // construct the output tensor of the NPU
-  at::Tensor result = NPUNativeFunctions::empty_with_format(
+  at::Tensor result = OpPreparation::ApplyTensorWithFormat(
       std::get<0>(outputSizes),
       self.options(),
       CalcuOpUtil::get_tensor_npu_format(self));
-  at::Tensor total_weight = NPUNativeFunctions::empty_with_format(
+  at::Tensor total_weight = OpPreparation::ApplyTensorWithFormat(
       std::get<1>(outputSizes),
       self.options(),
       CalcuOpUtil::get_tensor_npu_format(self));
diff --git a/torch_npu/csrc/distributed/Init.cpp b/torch_npu/csrc/distributed/Init.cpp
index 621ce5962a..e8717648c5 100644
--- a/torch_npu/csrc/distributed/Init.cpp
+++ b/torch_npu/csrc/distributed/Init.cpp
@@ -33,6 +33,7 @@
 #include "torch_npu/csrc/distributed/ProcessGroupHCCL.hpp"
 #include "torch_npu/csrc/distributed/Init.h"
 #include "torch_npu/csrc/distributed/reducer.hpp"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
 
 
 namespace torch_npu {
@@ -48,7 +49,7 @@ class BroadcastWork {
 public:
   inline std::vector<at::Tensor> cast_tensors(at::TensorList tensors) {
     static auto cast_back_to_ori_format = [](const at::Tensor &t) { 
-      return NPUNativeFunctions::npu_format_cast(t, t.storage().unsafeGetStorageImpl()->npu_desc_.origin_format_); 
+      return at_npu::native::NPUNativeFunctions::npu_format_cast(t, t.storage().unsafeGetStorageImpl()->npu_desc_.origin_format_); 
       };
     return c10::fmap(tensors, cast_back_to_ori_format);
   }
diff --git a/torch_npu/csrc/distributed/reducer.cpp b/torch_npu/csrc/distributed/reducer.cpp
index 4a29c1ce45..5e9721944b 100644
--- a/torch_npu/csrc/distributed/reducer.cpp
+++ b/torch_npu/csrc/distributed/reducer.cpp
@@ -32,6 +32,7 @@
 
 #include "torch_npu/csrc/distributed/reducer.hpp"
 #include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+#include "torch_npu/csrc/framework/utils/OpPreparation.h"
 
 namespace c10d_npu {
 namespace {
@@ -407,9 +408,9 @@ void Reducer::copy_grad_to_bucket(
   if (comm_hook_ == nullptr) {
     // imitates wrapped_scalar_tensor in ATen/native/BinaryOps.cpp
     // Divides while copying into the bucket view.
-    NPUNativeFunctions::copy_memory_(bucket_view, grad.mul(float(1.) / divFactor_), true);
+    at_npu::native::NPUNativeFunctions::copy_memory_(bucket_view, grad.mul(float(1.) / divFactor_), true);
   } else {
-    NPUNativeFunctions::copy_memory_(bucket_view, grad, true);
+    at_npu::native::NPUNativeFunctions::copy_memory_(bucket_view, grad, true);
   }
 }
 
@@ -442,7 +443,7 @@ void Reducer::mark_variable_ready_dense(VariableIndex index) {
         // make sure grad has the same format as variable
         if (grad.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_ !=
               variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_) {
-          grad = NPUNativeFunctions::npu_format_cast(grad,
+          grad = at_npu::native::NPUNativeFunctions::npu_format_cast(grad,
               variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_);
         }
         this->copy_grad_to_bucket(grad, bucket_view);
@@ -1073,12 +1074,12 @@ void Reducer::copy_bucket_to_grad(
       if (!grad.defined()) {
         // Creates grad according to the "Gradient Layout Contract"
         // (see torch/csrc/grad/AccumulateGrad.h)
-        grad = NPUNativeFunctions::empty_with_format(
+        grad = OpPreparation::ApplyTensorWithFormat(
           variable.sizes(), bucket_view.options(),
           variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_);
-        NPUNativeFunctions::copy_memory_(grad, bucket_view, true);
+        at_npu::native::NPUNativeFunctions::copy_memory_(grad, bucket_view, true);
       } else {
-        NPUNativeFunctions::copy_memory_(grad, bucket_view, true);
+        at_npu::native::NPUNativeFunctions::copy_memory_(grad, bucket_view, true);
       }
       // The grad is modified and needs to be written back.
       return true;
diff --git a/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp b/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp
index 9f918437bb..efc9671da6 100644
--- a/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp
+++ b/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
 
 #include "torch_npu/csrc/framework/contiguous/ContiguousOpt.h"
+#include "torch_npu/csrc/framework/utils/OpPreparation.h"
 
 namespace at_npu
 {
@@ -109,7 +110,7 @@ namespace at_npu
         const at::Tensor &src,
         const std::vector<string> &optimizations)
     {
-      auto self = NPUNativeFunctions::empty_with_format(
+      auto self = OpPreparation::ApplyTensorWithFormat(
           src.sizes(),
           src.options(),
           src.storage().get_npu_desc().npu_format_);
diff --git a/torch_npu/csrc/framework/contiguous/combined_opt.cpp b/torch_npu/csrc/framework/contiguous/combined_opt.cpp
index a0228c78ca..7614fb0075 100644
--- a/torch_npu/csrc/framework/contiguous/combined_opt.cpp
+++ b/torch_npu/csrc/framework/contiguous/combined_opt.cpp
@@ -19,7 +19,7 @@
 
 #include "torch_npu/csrc/framework/contiguous/ContiguousOpt.h"
 #include "torch_npu/csrc/framework/utils/KernelNpuOutputSize.h"
-#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+#include "torch_npu/csrc/framework/utils/OpPreparation.h"
 
 namespace at_npu
 {
@@ -389,7 +389,7 @@ namespace at_npu
         // baseInfo = inferred info(infer_size, infer_stride, infer_offset)
         // If the first inferred tensor can be optimized, store its info.
         if (can_infer_view_tensor(
-            src, temp_src, infer_size, infer_stride, infer_offset) &&
+                src, temp_src, infer_size, infer_stride, infer_offset) &&
             emplace_info(
                 temp_src, view_infos, view_offsets, infer_offset, max_len))
         {
@@ -500,7 +500,7 @@ namespace at_npu
           {
             // case 2: The first tensor is discontiguous-type,
             // conduct the standard optimization procedure.
-            auto contiguous_src = NPUNativeFunctions::empty_with_format(
+            auto contiguous_src = OpPreparation::ApplyTensorWithFormat(
                 src.sizes(),
                 src.options(),
                 src.storage().get_npu_desc().npu_format_);
diff --git a/torch_npu/csrc/framework/utils/NpuUtils.cpp b/torch_npu/csrc/framework/utils/NpuUtils.cpp
index f10640a1ec..07bda7c6a6 100644
--- a/torch_npu/csrc/framework/utils/NpuUtils.cpp
+++ b/torch_npu/csrc/framework/utils/NpuUtils.cpp
@@ -25,6 +25,7 @@
 #include "torch_npu/csrc/framework/contiguous/ContiguousOpt.h"
 #include "torch_npu/csrc/framework/interface/EnvVariables.h"
 #include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+#include "torch_npu/csrc/framework/utils/OpPreparation.h"
 
 namespace at_npu
 {
@@ -165,7 +166,7 @@ namespace at_npu
       // 3. get output size
       auto outputSize = index_select_npu_output_size(src_tmp, dim, index);
       int64_t npu_format = CalcuOpUtil::get_tensor_npu_format(src_tmp);
-      at::Tensor result = NPUNativeFunctions::empty_with_format(outputSize, src_tmp.options(), npu_format);
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(outputSize, src_tmp.options(), npu_format);
       // std::cout << "npu_format: " << npu_format << std::endl;
 
       // 4. get input and output
@@ -208,7 +209,7 @@ namespace at_npu
     at::Tensor deal_with_5d_5d_match(const at::Tensor &src)
     {
       auto src_desc = src.storage().unsafeGetStorageImpl()->npu_desc_;
-      at::Tensor src_new = NPUNativeFunctions::empty_with_format(src_desc.base_sizes_, src.options(), ACL_FORMAT_NC1HWC0);
+      at::Tensor src_new = OpPreparation::ApplyTensorWithFormat(src_desc.base_sizes_, src.options(), ACL_FORMAT_NC1HWC0);
       c10::npu::NPUStream copy_stream = c10::npu::getCurrentNPUStream();
       int64_t numel = src_new.numel();
       aclError error = aclrtMemcpyAsync(
diff --git a/torch_npu/csrc/framework/utils/OpPreparation.cpp b/torch_npu/csrc/framework/utils/OpPreparation.cpp
index 1001fa0167..d17c46d3b0 100644
--- a/torch_npu/csrc/framework/utils/OpPreparation.cpp
+++ b/torch_npu/csrc/framework/utils/OpPreparation.cpp
@@ -236,13 +236,17 @@ namespace at_npu
     at::Tensor OpPreparation::ApplyTensorWithFormat(c10::IntArrayRef sizes, const c10::TensorOptions &options, int64_t format)
     {
       auto fixFormat = InferFormat::GuessStorageFormat(sizes, (aclFormat)format);
-      return NPUNativeFunctions::empty_with_format(sizes, options, fixFormat);
+      return NPUNativeFunctions::empty_with_format(
+        sizes, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(),
+        options.device_opt(), options.pinned_memory_opt(), fixFormat);
     }
 
     at::Tensor OpPreparation::ApplyTensorWithSizes(c10::IntArrayRef sizes, const c10::TensorOptions &options)
     {
       auto format = InferFormat::GuessBaseFormat(sizes);
-      return NPUNativeFunctions::empty_with_format(sizes, options, format);
+      return NPUNativeFunctions::empty_with_format(
+        sizes, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(),
+        options.device_opt(), options.pinned_memory_opt(), fixFormat);
     }
 
     void OpPreparation::CheckMemory(const std::initializer_list<at::Tensor> &inputs, const std::initializer_list<at::Tensor> &outputs)
-- 
Gitee


From c374454b4b6a738049f4831a10723f738b6cf192 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=B0=A4=E5=AE=89=E5=8D=87?= <youansheng@huawei.com>
Date: Sat, 12 Feb 2022 18:22:07 +0800
Subject: [PATCH 06/12] Fix modification for const vector.

---
 torch_npu/csrc/aten/common/CopyKernel.cpp            | 4 ++--
 torch_npu/csrc/aten/common/FormatCastHelper.cpp      | 2 +-
 torch_npu/csrc/aten/common/FormatCastHelper.h        | 2 +-
 torch_npu/csrc/aten/common/NpuFastReshape.cpp        | 1 +
 torch_npu/csrc/aten/ops/LtKernelNpu.cpp              | 4 ++--
 torch_npu/csrc/aten/ops/MmKernelNpu.cpp              | 2 +-
 torch_npu/csrc/aten/ops/ReluKernelNpu.cpp            | 2 +-
 torch_npu/csrc/aten/ops/SoftmaxBackwardKernelNpu.cpp | 7 ++++---
 torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp       | 1 +
 torch_npu/csrc/distributed/reducer.cpp               | 2 +-
 torch_npu/csrc/framework/utils/OpPreparation.cpp     | 4 ++--
 11 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/torch_npu/csrc/aten/common/CopyKernel.cpp b/torch_npu/csrc/aten/common/CopyKernel.cpp
index bb983a63d3..171ab3f9c9 100644
--- a/torch_npu/csrc/aten/common/CopyKernel.cpp
+++ b/torch_npu/csrc/aten/common/CopyKernel.cpp
@@ -312,7 +312,7 @@ void copy_h2d(at::Tensor& self, const at::Tensor& src, bool non_blocking) {
   if (!FormatHelper::IsBaseFormatType(self)) {
     at::Tensor dst = OpPreparation::ApplyTensor(self);
     copy_h2d_baseformat(dst, src, non_blocking, true);
-    NPUNativeFunctions::npu_format_cast_(dst);
+    NPUNativeFunctions::npu_format_cast_(self, dst);
     return;
   }
   copy_h2d_baseformat(self, src, non_blocking);
@@ -363,7 +363,7 @@ void copy_d2d_dtype(at::Tensor& self, const at::Tensor& src, bool non_blocking)
     }
     at::Tensor dst_4D = FormatCastHelper::ApplyBaseFormatTensorBy(self);
     copy_d2d_dtype_baseformat(dst_4D, src_4D, non_blocking);
-    NPUNativeFunctions::npu_format_cast_(dst_4D);
+    NPUNativeFunctions::npu_format_cast_(self, dst_4D);
     return;
   }
   copy_d2d_dtype_format(self, src, non_blocking);
diff --git a/torch_npu/csrc/aten/common/FormatCastHelper.cpp b/torch_npu/csrc/aten/common/FormatCastHelper.cpp
index 2ee080d215..13d82c3f5f 100644
--- a/torch_npu/csrc/aten/common/FormatCastHelper.cpp
+++ b/torch_npu/csrc/aten/common/FormatCastHelper.cpp
@@ -26,7 +26,7 @@ bool FormatCastHelper::IsSameGroupType(const at::Tensor& src, const at::Tensor&
   return FormatHelper::GetBaseFormat(src_format) == FormatHelper::GetBaseFormat(dst_format);
 }
 
-void FormatCastHelper::base_format_cast_nocheck(const at::Tensor& dst, const at::Tensor& src) {
+void FormatCastHelper::base_format_cast_nocheck(at::Tensor& dst, const at::Tensor& src) {
   dst.set_(dst.storage(), src.storage_offset(), src.sizes(), src.strides());
   NPUNativeFunctions::copy_memory_(dst, src, true);
 }
diff --git a/torch_npu/csrc/aten/common/FormatCastHelper.h b/torch_npu/csrc/aten/common/FormatCastHelper.h
index 91e9b78182..ea2b6ab507 100644
--- a/torch_npu/csrc/aten/common/FormatCastHelper.h
+++ b/torch_npu/csrc/aten/common/FormatCastHelper.h
@@ -33,7 +33,7 @@ public:
   static at::Tensor& CovertSelfToBaseFormat(at::Tensor& src);
 private:
   // help function of format_cast_between_group
-  static void base_format_cast_nocheck(const at::Tensor& dst, const at::Tensor& src);
+  static void base_format_cast_nocheck(at::Tensor& dst, const at::Tensor& src);
 }; // class FormatCastHelper
 
 } // namespace native
diff --git a/torch_npu/csrc/aten/common/NpuFastReshape.cpp b/torch_npu/csrc/aten/common/NpuFastReshape.cpp
index 9fc817bfe3..e7df99dc6a 100644
--- a/torch_npu/csrc/aten/common/NpuFastReshape.cpp
+++ b/torch_npu/csrc/aten/common/NpuFastReshape.cpp
@@ -17,6 +17,7 @@
 #include "torch_npu/csrc/framework/FormatHelper.h"
 #include "torch_npu/csrc/framework/InferFormat.h"
 #include "torch_npu/csrc/framework/StorageDescHelper.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
 
 namespace at_npu {
 namespace native {
diff --git a/torch_npu/csrc/aten/ops/LtKernelNpu.cpp b/torch_npu/csrc/aten/ops/LtKernelNpu.cpp
index c7733a3ff5..1503d75a52 100644
--- a/torch_npu/csrc/aten/ops/LtKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/LtKernelNpu.cpp
@@ -100,7 +100,7 @@ namespace at_npu
       auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther);
 
       // construct the output tensor of the NPU
-      at::Tensor result = OpPreparation::ApplyTensor(
+      at::Tensor result = OpPreparation::ApplyTensorWithSizes(
           outputSize,
           formatCastOfSelf.options().dtype(at::kBool));
 
@@ -116,7 +116,7 @@ namespace at_npu
       auto outputSize = input_same_output_size(formatCastOfSelf);
 
       // construct the output tensor of the NPU
-      at::Tensor result = OpPreparation::ApplyTensor(
+      at::Tensor result = OpPreparation::ApplyTensorWithSizes(
           outputSize,
           formatCastOfSelf.options().dtype(at::kBool));
 
diff --git a/torch_npu/csrc/aten/ops/MmKernelNpu.cpp b/torch_npu/csrc/aten/ops/MmKernelNpu.cpp
index 0dafb37712..8ef6a11497 100644
--- a/torch_npu/csrc/aten/ops/MmKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/MmKernelNpu.cpp
@@ -188,7 +188,7 @@ Return:
       }
       else
       {
-        result = OpPreparation::ApplyTensor(outputSize, self.options());
+        result = OpPreparation::ApplyTensorWithSizes(outputSize, self.options());
       }
 
       // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/ReluKernelNpu.cpp b/torch_npu/csrc/aten/ops/ReluKernelNpu.cpp
index adf42ed6b9..1f4331abc0 100644
--- a/torch_npu/csrc/aten/ops/ReluKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/ReluKernelNpu.cpp
@@ -17,7 +17,7 @@
 #include "torch_npu/csrc/framework/utils/CalcuOpUtil.h"
 #include "torch_npu/csrc/framework/utils/KernelNpuOutputSize.h"
 #include "torch_npu/csrc/framework/utils/NpuUtils.h"
-
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
 #include "torch_npu/csrc/aten/NPUNativeFunctions.h"
 
 namespace at_npu
diff --git a/torch_npu/csrc/aten/ops/SoftmaxBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/SoftmaxBackwardKernelNpu.cpp
index 968e2a6419..bb0050fe18 100644
--- a/torch_npu/csrc/aten/ops/SoftmaxBackwardKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/SoftmaxBackwardKernelNpu.cpp
@@ -52,17 +52,18 @@ namespace at_npu
       // calculate the output size
       auto outputSize = input_same_output_size(grad_output);
 
+      at::Tensor tmp_output = output;
       // output'format must be same with grad_output
-      if (CalcuOpUtil::get_tensor_npu_format(output) != CalcuOpUtil::get_tensor_npu_format(grad_output))
+      if (CalcuOpUtil::get_tensor_npu_format(tmp_output) != CalcuOpUtil::get_tensor_npu_format(grad_output))
       {
-        NPUNativeFunctions::npu_format_cast_(output, CalcuOpUtil::get_tensor_npu_format(grad_output));
+        NPUNativeFunctions::npu_format_cast_(tmp_output, CalcuOpUtil::get_tensor_npu_format(grad_output));
       }
 
       // construct the output tensor of the NPU
       at::Tensor grad_input = OpPreparation::ApplyTensor(grad_output, outputSize);
 
       // calculate the output result of the NPU
-      softmax_backward_out_npu(grad_input, grad_output, output, dim, self);
+      softmax_backward_out_npu(grad_input, grad_output, tmp_output, dim, self);
 
       return grad_input;
     }
diff --git a/torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp b/torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp
index 9256fcc6c9..eb85dfefab 100644
--- a/torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp
@@ -17,6 +17,7 @@
 #include "torch_npu/csrc/framework/utils/CalcuOpUtil.h"
 #include "torch_npu/csrc/framework/utils/KernelNpuOutputSize.h"
 #include "torch_npu/csrc/framework/utils/NpuUtils.h"
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
 #include "torch_npu/csrc/aten/NPUNativeFunctions.h"
 
 namespace at_npu
diff --git a/torch_npu/csrc/distributed/reducer.cpp b/torch_npu/csrc/distributed/reducer.cpp
index 5e9721944b..4f05619d18 100644
--- a/torch_npu/csrc/distributed/reducer.cpp
+++ b/torch_npu/csrc/distributed/reducer.cpp
@@ -1074,7 +1074,7 @@ void Reducer::copy_bucket_to_grad(
       if (!grad.defined()) {
         // Creates grad according to the "Gradient Layout Contract"
         // (see torch/csrc/grad/AccumulateGrad.h)
-        grad = OpPreparation::ApplyTensorWithFormat(
+        grad = at_npu::native::OpPreparation::ApplyTensorWithFormat(
           variable.sizes(), bucket_view.options(),
           variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_);
         at_npu::native::NPUNativeFunctions::copy_memory_(grad, bucket_view, true);
diff --git a/torch_npu/csrc/framework/utils/OpPreparation.cpp b/torch_npu/csrc/framework/utils/OpPreparation.cpp
index d17c46d3b0..e0ade98c71 100644
--- a/torch_npu/csrc/framework/utils/OpPreparation.cpp
+++ b/torch_npu/csrc/framework/utils/OpPreparation.cpp
@@ -199,7 +199,7 @@ namespace at_npu
     at::Tensor &OpPreparation::CastBackToOriFormat(at::Tensor &tensor)
     {
       auto &tensor_desc = tensor.storage().unsafeGetStorageImpl()->npu_desc_;
-      tensor.npu_format_cast_(tensor_desc.origin_format_);
+      NPUNativeFunctions::npu_format_cast_(tensor, tensor_desc.origin_format_);
       return tensor;
     }
 
@@ -246,7 +246,7 @@ namespace at_npu
       auto format = InferFormat::GuessBaseFormat(sizes);
       return NPUNativeFunctions::empty_with_format(
         sizes, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(),
-        options.device_opt(), options.pinned_memory_opt(), fixFormat);
+        options.device_opt(), options.pinned_memory_opt(), format);
     }
 
     void OpPreparation::CheckMemory(const std::initializer_list<at::Tensor> &inputs, const std::initializer_list<at::Tensor> &outputs)
-- 
Gitee


From c4b424d00e4c0c636ab45841ebc87bcb701c0760 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=B0=A4=E5=AE=89=E5=8D=87?= <youansheng@huawei.com>
Date: Mon, 14 Feb 2022 10:13:41 +0800
Subject: [PATCH 07/12] Add Module & LayerNorm.

---
 torch_npu/__init__.py       |  26 +++++----
 torch_npu/utils/__init__.py |  36 +++++++++++++
 torch_npu/utils/module.py   | 105 ++++++++++++++++++++++++++++++++++++
 3 files changed, 157 insertions(+), 10 deletions(-)
 create mode 100644 torch_npu/utils/__init__.py
 create mode 100644 torch_npu/utils/module.py

diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py
index cedd54404f..5423b568e9 100644
--- a/torch_npu/__init__.py
+++ b/torch_npu/__init__.py
@@ -24,6 +24,8 @@ import torch_npu.npu.amp
 import torch_npu.distributed
 import torch_npu._C
 
+from torch_npu.utils import nn_monkey_patches
+
 from .version import __version__ as __version__
 
 __all__ = []
@@ -35,16 +37,20 @@ for name in dir(torch_npu._C._VariableFunctions):
     globals()[name] = getattr(torch_npu._C._VariableFunctions, name)
     __all__.append(name)
 
+all_monkey_patches = [
+    ["npu", torch_npu.npu],
+    ["npu.amp", torch_npu.npu.amp],
+    ["autograd.profiler", torch_npu.npu.profiler],
+    ["distributed", torch_npu.distributed],
+    ["distributed.distributed_c10d", torch_npu.distributed.distributed_c10d],
+    ["nn.parallel.distributed._get_default_group", torch_npu.distributed.distributed_c10d._get_default_group]
+]
+
+all_monkey_patches += nn_monkey_patches
+
 
-def _apply_patches():
-    monkey_patches = [
-        ["npu", torch_npu.npu],
-        ["npu.amp", torch_npu.npu.amp],
-        ["autograd.profiler", torch_npu.npu.profiler],
-        ["distributed", torch_npu.distributed],
-        ["distributed.distributed_c10d", torch_npu.distributed.distributed_c10d],
-        ["nn.parallel.distributed._get_default_group", torch_npu.distributed.distributed_c10d._get_default_group]
-    ]
+def _apply_patches(monkey_patches):
+    
     def _getattr(module_list, root_module=torch):
         if len(module_list) <= 1:
             return root_module
@@ -76,7 +82,7 @@ def _apply_patches():
             setattr(dest_module, attr, getattr(patch, attr))
 
 # Apply monkey-patches.
-_apply_patches()
+_apply_patches(all_monkey_patches)
 
 # NPU exit, need to synchronize devices
 def _npu_shutdown():
diff --git a/torch_npu/utils/__init__.py b/torch_npu/utils/__init__.py
new file mode 100644
index 0000000000..ed6f2abac7
--- /dev/null
+++ b/torch_npu/utils/__init__.py
@@ -0,0 +1,36 @@
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .module import LayerNorm, Module
+
+
+def _get_monkey_patches():
+    nn_modules = ["activation", "adaptive", "batchnorm", "channelshuffle", "container",
+                  "conv", "distance", "dropout", "flatten", "fold", "instancenorm",
+                  "linear", "loss", "module", "normalization", "padding", "pixelshuffle",
+                  "pooling", "rnn", "sparse", "transformer", "upsampling"]
+    _monkey_patches = []
+    for module_name in nn_modules:
+        _monkey_patches.append([f"nn.modules.{module_name}.Module", Module])
+
+    _monkey_patches.append(["nn.Module", Module])
+    _monkey_patches.append(["nn.modules.Module", Module])
+    _monkey_patches.append(["nn.modules.normalization.LayerNorm", LayerNorm])
+    _monkey_patches.append(["nn.modules.LayerNorm", LayerNorm])
+    _monkey_patches.append(["nn.LayerNorm", LayerNorm])
+    return _monkey_patches
+
+
+nn_monkey_patches = _get_monkey_patches()
diff --git a/torch_npu/utils/module.py b/torch_npu/utils/module.py
new file mode 100644
index 0000000000..c1918ab4c7
--- /dev/null
+++ b/torch_npu/utils/module.py
@@ -0,0 +1,105 @@
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import torch
+import torch_npu
+
+
+class Module(torch.nn.Module):
+
+    def npu(self, device=None):
+        r"""Moves all model parameters and buffers to the npu.
+
+        This also makes associated parameters and buffers different objects. So
+        it should be called before constructing optimizer if the module will
+        live on npu while being optimized.
+
+        Arguments:
+            device (int, optional): if specified, all parameters will be
+                copied to that device
+
+        Returns:
+            Module: self
+        """
+        if device is None:
+            device = torch.device("npu")
+        if torch.npu.is_available():
+            with torch.no_grad():
+                self.cast_weight(device)
+        return self._apply(lambda t: t.npu(device))
+
+
+    def to(self, *args, **kwargs):
+        super(Module, self).to(*args, **args)
+        device, _, _, _ = torch._C._nn._parse_to(*args, **kwargs)
+        if torch.npu.is_available():
+            with torch.no_grad():
+                self.cast_weight(device)
+
+    def cast_weight(self, device):
+        if device is None:
+            return
+
+        if "npu" not in str(device):
+            return
+
+        current_class = self.__class__
+        if issubclass(current_class, torch.nn.Linear):
+            self.weight.data = self.weight.data.to(device)
+            self.weight.data = torch_npu.npu_format_cast(self.weight.data, 29) #ACL_FORMAT_FRACTAL_NZ
+        elif issubclass(current_class, (torch.nn.BatchNorm2d, torch.nn.BatchNorm1d)):
+            if self.affine == True:
+                self.weight.data = self.weight.data.to(device)
+                self.weight.data = torch_npu.npu_format_cast(self.weight.data, 3)  #ACL_FORMAT_NC1HWC0
+                self.bias.data = self.bias.data.to(device)
+                self.bias.data = torch_npu.npu_format_cast(self.bias.data, 3)
+            self.running_mean.data = self.running_mean.data.to(device)
+            self.running_mean.data = torch_npu.npu_format_cast(self.running_mean.data, 3)
+            self.running_var.data = self.running_var.data.to(device)
+            self.running_var.data = torch_npu.npu_format_cast(self.running_var.data, 3)
+        elif issubclass(current_class, torch.nn.Conv2d):
+            if (self.in_channels == self.groups and self.groups > 1 and self.weight.size(0) % self.in_channels == 0):
+                return
+            self.weight.data = self.weight.data.to(device)
+            self.weight.data = torch_npu.npu_format_cast(self.weight.data, 4)  #ACL_FORMAT_FRACTAL_Z
+        elif issubclass(current_class, torch.nn.Conv3d):
+            self.weight.data = self.weight.data.to(device)
+            self.weight.data = torch_npu.npu_format_cast(self.weight.data.half(), 33).float()  #ACL_FRACTAL_Z_3D
+        elif ("MultiheadAttention" in str(current_class)):
+            if hasattr(self,"q_proj_weight") and self.q_proj_weight is not None and \
+               hasattr(self,"k_proj_weight") and self.k_proj_weight is not None and \
+               hasattr(self,"v_proj_weight") and self.v_proj_weight is not None:
+                self.q_proj_weight.data = self.q_proj_weight.data.to(device)
+                self.q_proj_weight.data = torch_npu.npu_format_cast(self.q_proj_weight.data, 29)
+                self.k_proj_weight.data = self.k_proj_weight.data.to(device)
+                self.k_proj_weight.data = torch_npu.npu_format_cast(self.k_proj_weight.data, 29)
+                self.v_proj_weight.data = self.v_proj_weight.data.to(device)
+                self.v_proj_weight.data = torch_npu.npu_format_cast(self.v_proj_weight.data, 29)
+
+        if self.children() is not None:
+            for sub_module in self.children():
+                if isinstance(sub_module, Module):
+                    sub_module.cast_weight(device)
+
+
+class LayerNorm(torch.nn.LayerNorm):
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        if self.training:
+            return torch.nn.functional.layer_norm(
+                input, self.normalized_shape, self.weight, self.bias, self.eps)
+        else:
+            return torch_npu.npu_layer_norm_eval(input, self.normalized_shape, self.weight, self.bias, self.eps)
-- 
Gitee


From 6ebe3ad2cfba72ecabe099317552046030ec18d1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=B0=A4=E5=AE=89=E5=8D=87?= <youansheng@huawei.com>
Date: Mon, 14 Feb 2022 11:04:01 +0800
Subject: [PATCH 08/12] Add module patch.

---
 torch_npu/__init__.py       |   3 +-
 torch_npu/utils/__init__.py |  11 +--
 torch_npu/utils/module.py   | 173 ++++++++++++++++++++----------------
 3 files changed, 101 insertions(+), 86 deletions(-)

diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py
index 5423b568e9..57b6be74d6 100644
--- a/torch_npu/__init__.py
+++ b/torch_npu/__init__.py
@@ -24,7 +24,7 @@ import torch_npu.npu.amp
 import torch_npu.distributed
 import torch_npu._C
 
-from torch_npu.utils import nn_monkey_patches
+from torch_npu.utils import nn_monkey_patches, apply_module_patch
 
 from .version import __version__ as __version__
 
@@ -83,6 +83,7 @@ def _apply_patches(monkey_patches):
 
 # Apply monkey-patches.
 _apply_patches(all_monkey_patches)
+apply_module_patch()
 
 # NPU exit, need to synchronize devices
 def _npu_shutdown():
diff --git a/torch_npu/utils/__init__.py b/torch_npu/utils/__init__.py
index ed6f2abac7..092f9cebfb 100644
--- a/torch_npu/utils/__init__.py
+++ b/torch_npu/utils/__init__.py
@@ -13,20 +13,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .module import LayerNorm, Module
+from .module import LayerNorm, apply_module_patch
 
 
 def _get_monkey_patches():
-    nn_modules = ["activation", "adaptive", "batchnorm", "channelshuffle", "container",
-                  "conv", "distance", "dropout", "flatten", "fold", "instancenorm",
-                  "linear", "loss", "module", "normalization", "padding", "pixelshuffle",
-                  "pooling", "rnn", "sparse", "transformer", "upsampling"]
     _monkey_patches = []
-    for module_name in nn_modules:
-        _monkey_patches.append([f"nn.modules.{module_name}.Module", Module])
-
-    _monkey_patches.append(["nn.Module", Module])
-    _monkey_patches.append(["nn.modules.Module", Module])
     _monkey_patches.append(["nn.modules.normalization.LayerNorm", LayerNorm])
     _monkey_patches.append(["nn.modules.LayerNorm", LayerNorm])
     _monkey_patches.append(["nn.LayerNorm", LayerNorm])
diff --git a/torch_npu/utils/module.py b/torch_npu/utils/module.py
index c1918ab4c7..5620d851c2 100644
--- a/torch_npu/utils/module.py
+++ b/torch_npu/utils/module.py
@@ -13,86 +13,109 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
+import warnings
 import torch
 import torch_npu
 
 
-class Module(torch.nn.Module):
-
-    def npu(self, device=None):
-        r"""Moves all model parameters and buffers to the npu.
-
-        This also makes associated parameters and buffers different objects. So
-        it should be called before constructing optimizer if the module will
-        live on npu while being optimized.
-
-        Arguments:
-            device (int, optional): if specified, all parameters will be
-                copied to that device
-
-        Returns:
-            Module: self
-        """
-        if device is None:
-            device = torch.device("npu")
-        if torch.npu.is_available():
-            with torch.no_grad():
-                self.cast_weight(device)
-        return self._apply(lambda t: t.npu(device))
-
-
-    def to(self, *args, **kwargs):
-        super(Module, self).to(*args, **args)
-        device, _, _, _ = torch._C._nn._parse_to(*args, **kwargs)
-        if torch.npu.is_available():
-            with torch.no_grad():
-                self.cast_weight(device)
-
-    def cast_weight(self, device):
-        if device is None:
-            return
-
-        if "npu" not in str(device):
-            return
-
-        current_class = self.__class__
-        if issubclass(current_class, torch.nn.Linear):
+def npu(self, device=None):
+    r"""Moves all model parameters and buffers to the npu.
+
+    This also makes associated parameters and buffers different objects. So
+    it should be called before constructing optimizer if the module will
+    live on npu while being optimized.
+
+    Arguments:
+        device (int, optional): if specified, all parameters will be
+            copied to that device
+
+    Returns:
+        Module: self
+    """
+    if device is None:
+        device = torch.device("npu")
+    if torch_npu.npu.is_available():
+        with torch.no_grad():
+            self.cast_weight(device)
+    return self._apply(lambda t: t.npu(device))
+
+
+def to(self, *args, **kwargs):
+    device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
+
+    if dtype is not None:
+        if not (dtype.is_floating_point or dtype.is_complex):
+            raise TypeError('nn.Module.to only accepts floating point or complex '
+                            'dtypes, but got desired dtype={}'.format(dtype))
+        if dtype.is_complex:
+            warnings.warn(
+                "Complex modules are a new feature under active development whose design may change, "
+                "and some modules might not work as expected when using complex tensors as parameters or buffers. "
+                "Please file an issue at https://github.com/pytorch/pytorch/issues/new?template=bug-report.md "
+                "if a complex module does not work as expected.")
+    if torch_npu.npu.is_available():
+        with torch.no_grad():
+            self.cast_weight(device)
+
+    def convert(t):
+        if convert_to_format is not None and t.dim() == 4:
+            return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None,
+                        non_blocking, memory_format=convert_to_format)
+        return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None, non_blocking)
+
+    return self._apply(convert)
+
+
+def cast_weight(self, device):
+    if device is None:
+        return
+
+    if "npu" not in str(device):
+        return
+
+    current_class = self.__class__
+    if issubclass(current_class, torch.nn.Linear):
+        self.weight.data = self.weight.data.to(device)
+        self.weight.data = torch_npu.npu_format_cast(self.weight.data, 29) #ACL_FORMAT_FRACTAL_NZ
+    elif issubclass(current_class, (torch.nn.BatchNorm2d, torch.nn.BatchNorm1d)):
+        if self.affine == True:
             self.weight.data = self.weight.data.to(device)
-            self.weight.data = torch_npu.npu_format_cast(self.weight.data, 29) #ACL_FORMAT_FRACTAL_NZ
-        elif issubclass(current_class, (torch.nn.BatchNorm2d, torch.nn.BatchNorm1d)):
-            if self.affine == True:
-                self.weight.data = self.weight.data.to(device)
-                self.weight.data = torch_npu.npu_format_cast(self.weight.data, 3)  #ACL_FORMAT_NC1HWC0
-                self.bias.data = self.bias.data.to(device)
-                self.bias.data = torch_npu.npu_format_cast(self.bias.data, 3)
-            self.running_mean.data = self.running_mean.data.to(device)
-            self.running_mean.data = torch_npu.npu_format_cast(self.running_mean.data, 3)
-            self.running_var.data = self.running_var.data.to(device)
-            self.running_var.data = torch_npu.npu_format_cast(self.running_var.data, 3)
-        elif issubclass(current_class, torch.nn.Conv2d):
-            if (self.in_channels == self.groups and self.groups > 1 and self.weight.size(0) % self.in_channels == 0):
-                return
-            self.weight.data = self.weight.data.to(device)
-            self.weight.data = torch_npu.npu_format_cast(self.weight.data, 4)  #ACL_FORMAT_FRACTAL_Z
-        elif issubclass(current_class, torch.nn.Conv3d):
-            self.weight.data = self.weight.data.to(device)
-            self.weight.data = torch_npu.npu_format_cast(self.weight.data.half(), 33).float()  #ACL_FRACTAL_Z_3D
-        elif ("MultiheadAttention" in str(current_class)):
-            if hasattr(self,"q_proj_weight") and self.q_proj_weight is not None and \
-               hasattr(self,"k_proj_weight") and self.k_proj_weight is not None and \
-               hasattr(self,"v_proj_weight") and self.v_proj_weight is not None:
-                self.q_proj_weight.data = self.q_proj_weight.data.to(device)
-                self.q_proj_weight.data = torch_npu.npu_format_cast(self.q_proj_weight.data, 29)
-                self.k_proj_weight.data = self.k_proj_weight.data.to(device)
-                self.k_proj_weight.data = torch_npu.npu_format_cast(self.k_proj_weight.data, 29)
-                self.v_proj_weight.data = self.v_proj_weight.data.to(device)
-                self.v_proj_weight.data = torch_npu.npu_format_cast(self.v_proj_weight.data, 29)
-
-        if self.children() is not None:
-            for sub_module in self.children():
-                if isinstance(sub_module, Module):
-                    sub_module.cast_weight(device)
+            self.weight.data = torch_npu.npu_format_cast(self.weight.data, 3)  #ACL_FORMAT_NC1HWC0
+            self.bias.data = self.bias.data.to(device)
+            self.bias.data = torch_npu.npu_format_cast(self.bias.data, 3)
+        self.running_mean.data = self.running_mean.data.to(device)
+        self.running_mean.data = torch_npu.npu_format_cast(self.running_mean.data, 3)
+        self.running_var.data = self.running_var.data.to(device)
+        self.running_var.data = torch_npu.npu_format_cast(self.running_var.data, 3)
+    elif issubclass(current_class, torch.nn.Conv2d):
+        if (self.in_channels == self.groups and self.groups > 1 and self.weight.size(0) % self.in_channels == 0):
+            return
+        self.weight.data = self.weight.data.to(device)
+        self.weight.data = torch_npu.npu_format_cast(self.weight.data, 4)  #ACL_FORMAT_FRACTAL_Z
+    elif issubclass(current_class, torch.nn.Conv3d):
+        self.weight.data = self.weight.data.to(device)
+        self.weight.data = torch_npu.npu_format_cast(self.weight.data.half(), 33).float()  #ACL_FRACTAL_Z_3D
+    elif ("MultiheadAttention" in str(current_class)):
+        if hasattr(self,"q_proj_weight") and self.q_proj_weight is not None and \
+            hasattr(self,"k_proj_weight") and self.k_proj_weight is not None and \
+            hasattr(self,"v_proj_weight") and self.v_proj_weight is not None:
+            self.q_proj_weight.data = self.q_proj_weight.data.to(device)
+            self.q_proj_weight.data = torch_npu.npu_format_cast(self.q_proj_weight.data, 29)
+            self.k_proj_weight.data = self.k_proj_weight.data.to(device)
+            self.k_proj_weight.data = torch_npu.npu_format_cast(self.k_proj_weight.data, 29)
+            self.v_proj_weight.data = self.v_proj_weight.data.to(device)
+            self.v_proj_weight.data = torch_npu.npu_format_cast(self.v_proj_weight.data, 29)
+
+    if self.children() is not None:
+        for sub_module in self.children():
+            if isinstance(sub_module, torch.nn.Module):
+                sub_module.cast_weight(device)
+
+
+def apply_module_patch():
+    torch.nn.Module.npu = npu
+    torch.nn.Module.to = to
+    torch.nn.Module.cast_weight = cast_weight
 
 
 class LayerNorm(torch.nn.LayerNorm):
-- 
Gitee


From 61c740f02e1938c3ff153fb8cfb0a4579be1c982 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=B0=A4=E5=AE=89=E5=8D=87?= <youansheng@huawei.com>
Date: Mon, 14 Feb 2022 11:43:45 +0800
Subject: [PATCH 09/12] Update custom ops calling.

---
 .../test_batchnorm_gather_stats_with_counts.py                | 4 ++--
 test/test_network_ops/test_uniform_.py                        | 2 +-
 torch_npu/csrc/aten/ops/BitwiseAndKernelNpu.cpp               | 4 ++--
 torch_npu/csrc/aten/ops/BmmV2KernelNpu.cpp                    | 4 ++--
 torch_npu/csrc/aten/ops/IndexKernelNpu.cpp                    | 2 +-
 torch_npu/csrc/aten/ops/WhereKernelNpu.cpp                    | 2 +-
 6 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/test/test_network_ops/test_batchnorm_gather_stats_with_counts.py b/test/test_network_ops/test_batchnorm_gather_stats_with_counts.py
index 7f9e5e4d0a..52585e2311 100644
--- a/test/test_network_ops/test_batchnorm_gather_stats_with_counts.py
+++ b/test/test_network_ops/test_batchnorm_gather_stats_with_counts.py
@@ -47,7 +47,7 @@ class TestBatchNormGatherStatsWithCounts(TestCase):
         input1 = np.array(data).astype(dtype)
         npu_counts = torch.from_numpy(input1).to("npu:0")
         if npu_format != -1:
-            npu_counts = npu_counts.npu_format_cast(npu_format)
+            npu_counts = torch_npu.npu_format_cast(npu_counts, npu_format)
         return npu_counts
 
     def create_counts_tensor16(self, item):
@@ -58,7 +58,7 @@ class TestBatchNormGatherStatsWithCounts(TestCase):
         input1 = np.array(data).astype(dtype)
         npu_counts = torch.from_numpy(input1).to("npu:0")
         if npu_format != -1:
-            npu_counts = npu_counts.npu_format_cast(npu_format)
+            npu_counts = torch_npu.npu_format_cast(npu_counts, npu_format)
         return npu_counts
 
     def test_batch_norm_gather_stats_with_counts(self, device):
diff --git a/test/test_network_ops/test_uniform_.py b/test/test_network_ops/test_uniform_.py
index 893adf140e..de4a3a9669 100644
--- a/test/test_network_ops/test_uniform_.py
+++ b/test/test_network_ops/test_uniform_.py
@@ -39,7 +39,7 @@ class TestUniform(TestCase):
 
         for item in shape_format:
             input1 = torch.zeros(item[0], dtype=item[3]).npu()
-            input1.npu_format_cast(3)
+            input1 = torch_npu.npu_format_cast(input1, 3)
             input1.uniform_(item[1], item[2])
             self.assertTrue(item[1] <= input1.min())
             self.assertTrue(item[2] >= input1.max())
diff --git a/torch_npu/csrc/aten/ops/BitwiseAndKernelNpu.cpp b/torch_npu/csrc/aten/ops/BitwiseAndKernelNpu.cpp
index 2e5df37da0..fa8b070662 100644
--- a/torch_npu/csrc/aten/ops/BitwiseAndKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/BitwiseAndKernelNpu.cpp
@@ -112,7 +112,7 @@ at::Tensor NPUNativeFunctions::bitwise_and(const at::Tensor& self, const at::Ten
   auto outputSize = broadcast_ops_npu_output_size(self, other);
 
   // construct the output at::Tensor of the NPU
-  at::Tensor result = at::empty_with_format(
+  at::Tensor result = NPUNativeFunctions::empty_with_format(
       outputSize,
       ref_tensor.options(),
       CalcuOpUtil::get_tensor_npu_format(ref_tensor));
@@ -128,7 +128,7 @@ at::Tensor NPUNativeFunctions::bitwise_and(const at::Tensor& self, at::Scalar ot
   auto outputSize = input_same_output_size(self);
 
   // construct the output at::Tensor of the NPU
-  at::Tensor result = at::empty_with_format(
+  at::Tensor result = NPUNativeFunctions::empty_with_format(
       outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
 
   // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/BmmV2KernelNpu.cpp b/torch_npu/csrc/aten/ops/BmmV2KernelNpu.cpp
index 2f696eb0ad..ea5bf340cc 100644
--- a/torch_npu/csrc/aten/ops/BmmV2KernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/BmmV2KernelNpu.cpp
@@ -83,9 +83,9 @@ at::Tensor pure_bmm_v2_npu(const at::Tensor& self, const at::Tensor& mat2, const
   at::Tensor result;
 
   if ((tensor1.scalar_type() == at::ScalarType::Half)) {
-    result = at::empty_with_format(output_size, tensor1.options(), ACL_FORMAT_FRACTAL_NZ);
+    result = NPUNativeFunctions::empty_with_format(output_size, tensor1.options(), ACL_FORMAT_FRACTAL_NZ);
   } else {
-    result = at::empty_with_format(output_size, tensor1.options(), ACL_FORMAT_ND);
+    result = NPUNativeFunctions::empty_with_format(output_size, tensor1.options(), ACL_FORMAT_ND);
   }
 
   at::Tensor contiguous_self = tensor1;
diff --git a/torch_npu/csrc/aten/ops/IndexKernelNpu.cpp b/torch_npu/csrc/aten/ops/IndexKernelNpu.cpp
index 40bf84de52..1aa6ffb9dc 100644
--- a/torch_npu/csrc/aten/ops/IndexKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/IndexKernelNpu.cpp
@@ -42,7 +42,7 @@ at::Tensor NPUNativeFunctions::index(const at::Tensor& self, const torch::List<c
   at::native::checkIndexTensorTypes(orig);
   // first expand BoolTensor (masks) or ByteTensor (masks) into 1 or more LongTensors
   auto indices = at::native::expandTensors(self, orig);
-  at::Tensor formatCastOfSelf = self.npu_format_cast(ACL_FORMAT_ND);
+  at::Tensor formatCastOfSelf = NPUNativeFunctions::npu_format_cast(self, ACL_FORMAT_ND);
 
   // calculate the output size
   auto outputSize = index_npu_output_size(formatCastOfSelf, indices);
diff --git a/torch_npu/csrc/aten/ops/WhereKernelNpu.cpp b/torch_npu/csrc/aten/ops/WhereKernelNpu.cpp
index edb450c9cf..8976e4f1b1 100644
--- a/torch_npu/csrc/aten/ops/WhereKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/WhereKernelNpu.cpp
@@ -92,7 +92,7 @@ vector<at::Tensor> NPUNativeFunctions::where(const at::Tensor& condition) {
   at::Tensor formatCastOfCondition = condition;
   if (condition.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_ !=
     ACL_FORMAT_ND) {
-    formatCastOfCondition = formatCastOfCondition.npu_format_cast(ACL_FORMAT_ND);
+    formatCastOfCondition = NPUNativeFunctions::npu_format_cast(formatCastOfCondition, ACL_FORMAT_ND);
   }
   if (condition.scalar_type() == at::ScalarType::Half) {
     formatCastOfCondition = NPUNativeFunctions::npu_dtype_cast(formatCastOfCondition, at::ScalarType::Float);
-- 
Gitee


From e39ce7f8716f58939a86851d2a1af9efd84c0b67 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=B0=A4=E5=AE=89=E5=8D=87?= <youansheng@huawei.com>
Date: Mon, 14 Feb 2022 12:09:30 +0800
Subject: [PATCH 10/12] Replace with ApplyTensor.

---
 torch_npu/csrc/aten/ops/BitwiseAndKernelNpu.cpp | 4 ++--
 torch_npu/csrc/aten/ops/BmmV2KernelNpu.cpp      | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/torch_npu/csrc/aten/ops/BitwiseAndKernelNpu.cpp b/torch_npu/csrc/aten/ops/BitwiseAndKernelNpu.cpp
index fa8b070662..9e42f4f0ef 100644
--- a/torch_npu/csrc/aten/ops/BitwiseAndKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/BitwiseAndKernelNpu.cpp
@@ -112,7 +112,7 @@ at::Tensor NPUNativeFunctions::bitwise_and(const at::Tensor& self, const at::Ten
   auto outputSize = broadcast_ops_npu_output_size(self, other);
 
   // construct the output at::Tensor of the NPU
-  at::Tensor result = NPUNativeFunctions::empty_with_format(
+  at::Tensor result = OpPreparation::ApplyTensorWithFormat(
       outputSize,
       ref_tensor.options(),
       CalcuOpUtil::get_tensor_npu_format(ref_tensor));
@@ -128,7 +128,7 @@ at::Tensor NPUNativeFunctions::bitwise_and(const at::Tensor& self, at::Scalar ot
   auto outputSize = input_same_output_size(self);
 
   // construct the output at::Tensor of the NPU
-  at::Tensor result = NPUNativeFunctions::empty_with_format(
+  at::Tensor result = OpPreparation::ApplyTensorWithFormat(
       outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
 
   // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/BmmV2KernelNpu.cpp b/torch_npu/csrc/aten/ops/BmmV2KernelNpu.cpp
index ea5bf340cc..f928e97738 100644
--- a/torch_npu/csrc/aten/ops/BmmV2KernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/BmmV2KernelNpu.cpp
@@ -83,9 +83,9 @@ at::Tensor pure_bmm_v2_npu(const at::Tensor& self, const at::Tensor& mat2, const
   at::Tensor result;
 
   if ((tensor1.scalar_type() == at::ScalarType::Half)) {
-    result = NPUNativeFunctions::empty_with_format(output_size, tensor1.options(), ACL_FORMAT_FRACTAL_NZ);
+    result = OpPreparation::ApplyTensorWithFormat(output_size, tensor1.options(), ACL_FORMAT_FRACTAL_NZ);
   } else {
-    result = NPUNativeFunctions::empty_with_format(output_size, tensor1.options(), ACL_FORMAT_ND);
+    result = OpPreparation::ApplyTensorWithFormat(output_size, tensor1.options(), ACL_FORMAT_ND);
   }
 
   at::Tensor contiguous_self = tensor1;
-- 
Gitee


From 083ffb9385ed7795724f1370db7fffb000d9fb68 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=B0=A4=E5=AE=89=E5=8D=87?= <youansheng@huawei.com>
Date: Mon, 14 Feb 2022 12:47:36 +0800
Subject: [PATCH 11/12] Fix codecheck.

---
 torch_npu/__init__.py                         |  2 +-
 .../csrc/aten/common/TensorFactories.cpp      |  2 +-
 torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp |  4 ++--
 .../csrc/aten/ops/ZerosLikeKernelNpu.cpp      |  4 ++--
 torch_npu/csrc/distributed/reducer.cpp        |  4 ++--
 .../framework/contiguous/combined_opt.cpp     |  2 +-
 .../csrc/framework/utils/OpPreparation.cpp    |  8 +++----
 torch_npu/utils/module.py                     | 23 ++++++++-----------
 8 files changed, 23 insertions(+), 26 deletions(-)

diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py
index 57b6be74d6..de976de730 100644
--- a/torch_npu/__init__.py
+++ b/torch_npu/__init__.py
@@ -60,7 +60,7 @@ def _apply_patches(monkey_patches):
         else:
             empty_module_name = f'{root_module.__name__}.{module_list[0]}'
             sys.modules[empty_module_name] = types.ModuleType(empty_module_name)
-            setattr(root_module, module_list[0], sys.modules[empty_module_name])
+            setattr(root_module, module_list[0], sys.modules.get(empty_module_name))
             return _getattr(module_list[1:], getattr(root_module, module_list[0]))
 
     for patch_pair in monkey_patches:
diff --git a/torch_npu/csrc/aten/common/TensorFactories.cpp b/torch_npu/csrc/aten/common/TensorFactories.cpp
index ac32cb49bb..c2a9ae6eab 100644
--- a/torch_npu/csrc/aten/common/TensorFactories.cpp
+++ b/torch_npu/csrc/aten/common/TensorFactories.cpp
@@ -628,7 +628,7 @@ namespace at_npu
       AT_ASSERT(result.is_contiguous());
       AT_DISPATCH_ALL_TYPES_AND_COMPLEX(result.scalar_type(), "tensor_npu", [&]
                                         { std::copy(
-                                              values.begin(), values.end(), result.template data_ptr<scalar_t>()); });
+                                            values.begin(), values.end(), result.template data_ptr<scalar_t>()); });
       return result;
     }
 
diff --git a/torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp b/torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp
index 8343864e43..4e65798b22 100644
--- a/torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp
@@ -46,8 +46,8 @@ namespace at_npu
       auto outputSize = input_same_output_size(self);
       // construct the output tensor of the NPU
       at::Tensor result = NPUNativeFunctions::empty_with_format(
-        outputSize, dtype_opt, layout_opt, device_opt, pin_memory_opt,
-        CalcuOpUtil::get_tensor_npu_format(self));
+          outputSize, dtype_opt, layout_opt, device_opt, pin_memory_opt,
+          CalcuOpUtil::get_tensor_npu_format(self));
       // calculate the output result of the NPUc
       return NPUNativeFunctions::one_(result);
     }
diff --git a/torch_npu/csrc/aten/ops/ZerosLikeKernelNpu.cpp b/torch_npu/csrc/aten/ops/ZerosLikeKernelNpu.cpp
index 2f4775751d..8147622a01 100644
--- a/torch_npu/csrc/aten/ops/ZerosLikeKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/ZerosLikeKernelNpu.cpp
@@ -57,8 +57,8 @@ namespace at_npu
       auto outputSize = input_same_output_size(self);
       // construct the output tensor of the NPU
       at::Tensor result = NPUNativeFunctions::empty_with_format(
-        outputSize, dtype_opt, layout_opt, device_opt, pin_memory_opt,
-        CalcuOpUtil::get_tensor_npu_format(self));
+          outputSize, dtype_opt, layout_opt, device_opt, pin_memory_opt,
+          CalcuOpUtil::get_tensor_npu_format(self));
 
       // calculate the output result of the NPU
       return result.zero_();
diff --git a/torch_npu/csrc/distributed/reducer.cpp b/torch_npu/csrc/distributed/reducer.cpp
index 4f05619d18..81f7f04968 100644
--- a/torch_npu/csrc/distributed/reducer.cpp
+++ b/torch_npu/csrc/distributed/reducer.cpp
@@ -1075,8 +1075,8 @@ void Reducer::copy_bucket_to_grad(
         // Creates grad according to the "Gradient Layout Contract"
         // (see torch/csrc/grad/AccumulateGrad.h)
         grad = at_npu::native::OpPreparation::ApplyTensorWithFormat(
-          variable.sizes(), bucket_view.options(),
-          variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_);
+            variable.sizes(), bucket_view.options(),
+            variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_);
         at_npu::native::NPUNativeFunctions::copy_memory_(grad, bucket_view, true);
       } else {
         at_npu::native::NPUNativeFunctions::copy_memory_(grad, bucket_view, true);
diff --git a/torch_npu/csrc/framework/contiguous/combined_opt.cpp b/torch_npu/csrc/framework/contiguous/combined_opt.cpp
index 68c92e048b..1ce04a827f 100644
--- a/torch_npu/csrc/framework/contiguous/combined_opt.cpp
+++ b/torch_npu/csrc/framework/contiguous/combined_opt.cpp
@@ -389,7 +389,7 @@ namespace at_npu
         // baseInfo = inferred info(infer_size, infer_stride, infer_offset)
         // If the first inferred tensor can be optimized, store its info.
         if (can_infer_view_tensor(
-                src, temp_src, infer_size, infer_stride, infer_offset) &&
+            src, temp_src, infer_size, infer_stride, infer_offset) &&
             emplace_info(
                 temp_src, view_infos, view_offsets, infer_offset, max_len))
         {
diff --git a/torch_npu/csrc/framework/utils/OpPreparation.cpp b/torch_npu/csrc/framework/utils/OpPreparation.cpp
index e0ade98c71..3726a9765a 100644
--- a/torch_npu/csrc/framework/utils/OpPreparation.cpp
+++ b/torch_npu/csrc/framework/utils/OpPreparation.cpp
@@ -237,16 +237,16 @@ namespace at_npu
     {
       auto fixFormat = InferFormat::GuessStorageFormat(sizes, (aclFormat)format);
       return NPUNativeFunctions::empty_with_format(
-        sizes, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(),
-        options.device_opt(), options.pinned_memory_opt(), fixFormat);
+          sizes, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(),
+          options.device_opt(), options.pinned_memory_opt(), fixFormat);
     }
 
     at::Tensor OpPreparation::ApplyTensorWithSizes(c10::IntArrayRef sizes, const c10::TensorOptions &options)
     {
       auto format = InferFormat::GuessBaseFormat(sizes);
       return NPUNativeFunctions::empty_with_format(
-        sizes, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(),
-        options.device_opt(), options.pinned_memory_opt(), format);
+          sizes, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(),
+          options.device_opt(), options.pinned_memory_opt(), format);
     }
 
     void OpPreparation::CheckMemory(const std::initializer_list<at::Tensor> &inputs, const std::initializer_list<at::Tensor> &outputs)
diff --git a/torch_npu/utils/module.py b/torch_npu/utils/module.py
index 5620d851c2..5de085390e 100644
--- a/torch_npu/utils/module.py
+++ b/torch_npu/utils/module.py
@@ -67,35 +67,32 @@ def to(self, *args, **kwargs):
 
 
 def cast_weight(self, device):
-    if device is None:
-        return
-
-    if "npu" not in str(device):
+    if device is None or "npu" not in str(device):
         return
 
     current_class = self.__class__
     if issubclass(current_class, torch.nn.Linear):
         self.weight.data = self.weight.data.to(device)
-        self.weight.data = torch_npu.npu_format_cast(self.weight.data, 29) #ACL_FORMAT_FRACTAL_NZ
-    elif issubclass(current_class, (torch.nn.BatchNorm2d, torch.nn.BatchNorm1d)):
-        if self.affine == True:
+        self.weight.data = torch_npu.npu_format_cast(self.weight.data, 29) # ACL_FORMAT_FRACTAL_NZ
+    if issubclass(current_class, (torch.nn.BatchNorm2d, torch.nn.BatchNorm1d)):
+        if self.affine:
             self.weight.data = self.weight.data.to(device)
-            self.weight.data = torch_npu.npu_format_cast(self.weight.data, 3)  #ACL_FORMAT_NC1HWC0
+            self.weight.data = torch_npu.npu_format_cast(self.weight.data, 3)  # ACL_FORMAT_NC1HWC0
             self.bias.data = self.bias.data.to(device)
             self.bias.data = torch_npu.npu_format_cast(self.bias.data, 3)
         self.running_mean.data = self.running_mean.data.to(device)
         self.running_mean.data = torch_npu.npu_format_cast(self.running_mean.data, 3)
         self.running_var.data = self.running_var.data.to(device)
         self.running_var.data = torch_npu.npu_format_cast(self.running_var.data, 3)
-    elif issubclass(current_class, torch.nn.Conv2d):
+    if issubclass(current_class, torch.nn.Conv2d):
         if (self.in_channels == self.groups and self.groups > 1 and self.weight.size(0) % self.in_channels == 0):
             return
         self.weight.data = self.weight.data.to(device)
-        self.weight.data = torch_npu.npu_format_cast(self.weight.data, 4)  #ACL_FORMAT_FRACTAL_Z
-    elif issubclass(current_class, torch.nn.Conv3d):
+        self.weight.data = torch_npu.npu_format_cast(self.weight.data, 4)  # ACL_FORMAT_FRACTAL_Z
+    if issubclass(current_class, torch.nn.Conv3d):
         self.weight.data = self.weight.data.to(device)
-        self.weight.data = torch_npu.npu_format_cast(self.weight.data.half(), 33).float()  #ACL_FRACTAL_Z_3D
-    elif ("MultiheadAttention" in str(current_class)):
+        self.weight.data = torch_npu.npu_format_cast(self.weight.data.half(), 33).float()  # ACL_FRACTAL_Z_3D
+    if ("MultiheadAttention" in str(current_class)):
         if hasattr(self,"q_proj_weight") and self.q_proj_weight is not None and \
             hasattr(self,"k_proj_weight") and self.k_proj_weight is not None and \
             hasattr(self,"v_proj_weight") and self.v_proj_weight is not None:
-- 
Gitee


From cfbe896ea6c3686a902860489d6d26132ceb0eef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=B0=A4=E5=AE=89=E5=8D=87?= <youansheng@huawei.com>
Date: Mon, 14 Feb 2022 14:23:15 +0800
Subject: [PATCH 12/12] Fix Cyclomatic Complexity.

---
 torch_npu/utils/module.py | 79 +++++++++++++++++++++------------------
 1 file changed, 43 insertions(+), 36 deletions(-)

diff --git a/torch_npu/utils/module.py b/torch_npu/utils/module.py
index 5de085390e..0d275ef923 100644
--- a/torch_npu/utils/module.py
+++ b/torch_npu/utils/module.py
@@ -67,46 +67,53 @@ def to(self, *args, **kwargs):
 
 
 def cast_weight(self, device):
+
+    def _format_cast(module, class_name):
+        """Move weights of known layer types to `device` and format-cast them for NPU."""
+        if issubclass(class_name, torch.nn.Linear):
+            module.weight.data = module.weight.data.to(device)
+            module.weight.data = torch_npu.npu_format_cast(module.weight.data, 29) # ACL_FORMAT_FRACTAL_NZ
+        if issubclass(class_name, (torch.nn.BatchNorm2d, torch.nn.BatchNorm1d)):
+            if module.affine:
+                module.weight.data = module.weight.data.to(device)
+                module.weight.data = torch_npu.npu_format_cast(module.weight.data, 3)  # ACL_FORMAT_NC1HWC0
+                module.bias.data = module.bias.data.to(device)
+                module.bias.data = torch_npu.npu_format_cast(module.bias.data, 3)
+            module.running_mean.data = module.running_mean.data.to(device)
+            module.running_mean.data = torch_npu.npu_format_cast(module.running_mean.data, 3)
+            module.running_var.data = module.running_var.data.to(device)
+            module.running_var.data = torch_npu.npu_format_cast(module.running_var.data, 3)
+        if issubclass(class_name, torch.nn.Conv2d):
+            if (module.in_channels == module.groups and module.groups > 1
+                and module.weight.size(0) % module.in_channels == 0):
+                return  # weight is left as-is; the checks below are skipped too
+            module.weight.data = module.weight.data.to(device)
+            module.weight.data = torch_npu.npu_format_cast(module.weight.data, 4)  # ACL_FORMAT_FRACTAL_Z
+        if issubclass(class_name, torch.nn.Conv3d):
+            module.weight.data = module.weight.data.to(device)
+            module.weight.data = torch_npu.npu_format_cast(module.weight.data.half(), 33).float()  # ACL_FRACTAL_Z_3D
+        # Compare against None explicitly: bool() of a multi-element tensor
+        # raises RuntimeError, so plain truthiness cannot be used here.
+        proj_names = ("q_proj_weight", "k_proj_weight", "v_proj_weight")
+        if "MultiheadAttention" in str(class_name) and \
+            all(getattr(module, name, None) is not None for name in proj_names):
+            for name in proj_names:
+                proj = getattr(module, name)
+                proj.data = proj.data.to(device)
+                proj.data = torch_npu.npu_format_cast(proj.data, 29)  # ACL_FORMAT_FRACTAL_NZ
+
     if device is None or "npu" not in str(device):
         return
 
     current_class = self.__class__
-    if issubclass(current_class, torch.nn.Linear):
-        self.weight.data = self.weight.data.to(device)
-        self.weight.data = torch_npu.npu_format_cast(self.weight.data, 29) # ACL_FORMAT_FRACTAL_NZ
-    if issubclass(current_class, (torch.nn.BatchNorm2d, torch.nn.BatchNorm1d)):
-        if self.affine:
-            self.weight.data = self.weight.data.to(device)
-            self.weight.data = torch_npu.npu_format_cast(self.weight.data, 3)  # ACL_FORMAT_NC1HWC0
-            self.bias.data = self.bias.data.to(device)
-            self.bias.data = torch_npu.npu_format_cast(self.bias.data, 3)
-        self.running_mean.data = self.running_mean.data.to(device)
-        self.running_mean.data = torch_npu.npu_format_cast(self.running_mean.data, 3)
-        self.running_var.data = self.running_var.data.to(device)
-        self.running_var.data = torch_npu.npu_format_cast(self.running_var.data, 3)
-    if issubclass(current_class, torch.nn.Conv2d):
-        if (self.in_channels == self.groups and self.groups > 1 and self.weight.size(0) % self.in_channels == 0):
-            return
-        self.weight.data = self.weight.data.to(device)
-        self.weight.data = torch_npu.npu_format_cast(self.weight.data, 4)  # ACL_FORMAT_FRACTAL_Z
-    if issubclass(current_class, torch.nn.Conv3d):
-        self.weight.data = self.weight.data.to(device)
-        self.weight.data = torch_npu.npu_format_cast(self.weight.data.half(), 33).float()  # ACL_FRACTAL_Z_3D
-    if ("MultiheadAttention" in str(current_class)):
-        if hasattr(self,"q_proj_weight") and self.q_proj_weight is not None and \
-            hasattr(self,"k_proj_weight") and self.k_proj_weight is not None and \
-            hasattr(self,"v_proj_weight") and self.v_proj_weight is not None:
-            self.q_proj_weight.data = self.q_proj_weight.data.to(device)
-            self.q_proj_weight.data = torch_npu.npu_format_cast(self.q_proj_weight.data, 29)
-            self.k_proj_weight.data = self.k_proj_weight.data.to(device)
-            self.k_proj_weight.data = torch_npu.npu_format_cast(self.k_proj_weight.data, 29)
-            self.v_proj_weight.data = self.v_proj_weight.data.to(device)
-            self.v_proj_weight.data = torch_npu.npu_format_cast(self.v_proj_weight.data, 29)
-
-    if self.children() is not None:
-        for sub_module in self.children():
-            if isinstance(sub_module, torch.nn.Module):
-                sub_module.cast_weight(device)
+    _format_cast(self, current_class)
+
+    # Recurse into the direct children; each child repeats this for its own
+    # children. No emptiness guard is needed: iterating an empty children()
+    # generator is already a no-op.
+    for sub_module in self.children():
+        if isinstance(sub_module, torch.nn.Module):
+            sub_module.cast_weight(device)
 
 
 def apply_module_patch():
-- 
Gitee