diff --git a/docs/en/PyTorch Operator Development Guide/PyTorch Operator Development Guide.md b/docs/en/PyTorch Operator Development Guide/PyTorch Operator Development Guide.md
index 48ddd588986a17becfa7645d7a5c905a05cb490b..9ca63313afc7b4d46165f502210713f8b033e924 100644
--- a/docs/en/PyTorch Operator Development Guide/PyTorch Operator Development Guide.md	
+++ b/docs/en/PyTorch Operator Development Guide/PyTorch Operator Development Guide.md	
@@ -515,7 +515,7 @@ The following uses the torch.add\(\) operator as an example to describe how to a
             Scalar other_c1_offset(
                 other.storage_offset() / (other.size(2) * other.size(3) * c0_len));
             Scalar stride_len(self.size(1) / c0_len);
-            Tensor result = at::npu_stride_add(
+            Tensor result = NPUNativeFunctions::npu_stride_add(
                 self_use, other_use, self_c1_offset, other_c1_offset, stride_len);
             return result;
           }
@@ -524,7 +524,7 @@ The following uses the torch.add\(\) operator as an example to describe how to a
           auto outputSize = broadcast_ops_npu_output_size(self, other);
         
           // construct the output tensor of the NPU
-          Tensor result = at::empty_with_format(
+          Tensor result = OpPreparation::ApplyTensorWithFormat(
               outputSize,
               outputTensor.options(),
               CalcuOpUtil::get_tensor_npu_format(outputTensor));
@@ -541,7 +541,7 @@ The following uses the torch.add\(\) operator as an example to describe how to a
           // calculate the output size
           auto outputSize = input_same_output_size(self);
           // construct the output tensor of the NPU
-          Tensor result = at::empty_with_format(
+          Tensor result = OpPreparation::ApplyTensorWithFormat(
               outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
         
           // calculate the output result of the NPU
diff --git "a/docs/zh/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227.md" "b/docs/zh/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227.md"
index f48ac3586507cd1602f7a8259e1508bbe3d2cac9..e2f6a2c9fadac7965ee26db268036d50df760b6b 100644
--- "a/docs/zh/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227.md"
+++ "b/docs/zh/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227.md"
@@ -515,7 +515,7 @@ PyTorch算子开发包含TBE算子开发和PyTorch框架下的算子适配。
             Scalar other_c1_offset(
                 other.storage_offset() / (other.size(2) * other.size(3) * c0_len));
             Scalar stride_len(self.size(1) / c0_len);
-            Tensor result = at::npu_stride_add(
+            Tensor result = NPUNativeFunctions::npu_stride_add(
                 self_use, other_use, self_c1_offset, other_c1_offset, stride_len);
             return result;
           }
@@ -524,7 +524,7 @@ PyTorch算子开发包含TBE算子开发和PyTorch框架下的算子适配。
           auto outputSize = broadcast_ops_npu_output_size(self, other);
         
           // construct the output tensor of the NPU
-          Tensor result = at::empty_with_format(
+          Tensor result = OpPreparation::ApplyTensorWithFormat(
               outputSize,
               outputTensor.options(),
               CalcuOpUtil::get_tensor_npu_format(outputTensor));
@@ -541,7 +541,7 @@ PyTorch算子开发包含TBE算子开发和PyTorch框架下的算子适配。
           // calculate the output size
           auto outputSize = input_same_output_size(self);
           // construct the output tensor of the NPU
-          Tensor result = at::empty_with_format(
+          Tensor result = OpPreparation::ApplyTensorWithFormat(
               outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
         
           // calculate the output result of the NPU
diff --git a/test/test_network_ops/test_batchnorm_gather_stats_with_counts.py b/test/test_network_ops/test_batchnorm_gather_stats_with_counts.py
index 7f9e5e4d0a11d208f3547c9043b071551011f634..52585e231104ab9daa514f5ea9831110172df118 100644
--- a/test/test_network_ops/test_batchnorm_gather_stats_with_counts.py
+++ b/test/test_network_ops/test_batchnorm_gather_stats_with_counts.py
@@ -47,7 +47,7 @@ class TestBatchNormGatherStatsWithCounts(TestCase):
         input1 = np.array(data).astype(dtype)
         npu_counts = torch.from_numpy(input1).to("npu:0")
         if npu_format != -1:
-            npu_counts = npu_counts.npu_format_cast(npu_format)
+            npu_counts = torch_npu.npu_format_cast(npu_counts, npu_format)
         return npu_counts
 
     def create_counts_tensor16(self, item):
@@ -58,7 +58,7 @@ class TestBatchNormGatherStatsWithCounts(TestCase):
         input1 = np.array(data).astype(dtype)
         npu_counts = torch.from_numpy(input1).to("npu:0")
         if npu_format != -1:
-            npu_counts = npu_counts.npu_format_cast(npu_format)
+            npu_counts = torch_npu.npu_format_cast(npu_counts, npu_format)
         return npu_counts
 
     def test_batch_norm_gather_stats_with_counts(self, device):
diff --git a/test/test_network_ops/test_uniform_.py b/test/test_network_ops/test_uniform_.py
index 893adf140e34b82bb03b8732ecf7c9becf3224e4..de4a3a96697a825ce0f10b9cfddb1d427f283de6 100644
--- a/test/test_network_ops/test_uniform_.py
+++ b/test/test_network_ops/test_uniform_.py
@@ -39,7 +39,7 @@ class TestUniform(TestCase):
 
         for item in shape_format:
             input1 = torch.zeros(item[0], dtype=item[3]).npu()
-            input1.npu_format_cast(3)
+            input1 = torch_npu.npu_format_cast(input1, 3)
             input1.uniform_(item[1], item[2])
             self.assertTrue(item[1] <= input1.min())
             self.assertTrue(item[2] >= input1.max())
diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py
index cedd54404fc7b8ddf019ccc4259a5594e2b8dc4b..de976de73043c85eb346f6eeb33614fe4c9cd821 100644
--- a/torch_npu/__init__.py
+++ b/torch_npu/__init__.py
@@ -24,6 +24,8 @@ import torch_npu.npu.amp
 import torch_npu.distributed
 import torch_npu._C
 
+from torch_npu.utils import nn_monkey_patches, apply_module_patch
+
 from .version import __version__ as __version__
 
 __all__ = []
@@ -35,16 +37,20 @@ for name in dir(torch_npu._C._VariableFunctions):
     globals()[name] = getattr(torch_npu._C._VariableFunctions, name)
     __all__.append(name)
 
+all_monkey_patches = [
+    ["npu", torch_npu.npu],
+    ["npu.amp", torch_npu.npu.amp],
+    ["autograd.profiler", torch_npu.npu.profiler],
+    ["distributed", torch_npu.distributed],
+    ["distributed.distributed_c10d", torch_npu.distributed.distributed_c10d],
+    ["nn.parallel.distributed._get_default_group", torch_npu.distributed.distributed_c10d._get_default_group]
+]
+
+all_monkey_patches += nn_monkey_patches
+
 
-def _apply_patches():
-    monkey_patches = [
-        ["npu", torch_npu.npu],
-        ["npu.amp", torch_npu.npu.amp],
-        ["autograd.profiler", torch_npu.npu.profiler],
-        ["distributed", torch_npu.distributed],
-        ["distributed.distributed_c10d", torch_npu.distributed.distributed_c10d],
-        ["nn.parallel.distributed._get_default_group", torch_npu.distributed.distributed_c10d._get_default_group]
-    ]
+def _apply_patches(monkey_patches):
+    
     def _getattr(module_list, root_module=torch):
         if len(module_list) <= 1:
             return root_module
@@ -54,7 +60,7 @@ def _apply_patches():
         else:
             empty_module_name = f'{root_module.__name__}.{module_list[0]}'
             sys.modules[empty_module_name] = types.ModuleType(empty_module_name)
-            setattr(root_module, module_list[0], sys.modules[empty_module_name])
+            setattr(root_module, module_list[0], sys.modules.get(empty_module_name))
             return _getattr(module_list[1:], getattr(root_module, module_list[0]))
 
     for patch_pair in monkey_patches:
@@ -76,7 +82,8 @@ def _apply_patches():
             setattr(dest_module, attr, getattr(patch, attr))
 
 # Apply monkey-patches.
-_apply_patches()
+_apply_patches(all_monkey_patches)
+apply_module_patch()
 
 # NPU exit, need to synchronize devices
 def _npu_shutdown():
diff --git a/torch_npu/csrc/aten/common/CopyKernel.cpp b/torch_npu/csrc/aten/common/CopyKernel.cpp
index 6bc360ae5bc1360064c35195fda6f612c1116470..e511fb30dd288f0ed08752de8fce9ba72d3ddc97 100644
--- a/torch_npu/csrc/aten/common/CopyKernel.cpp
+++ b/torch_npu/csrc/aten/common/CopyKernel.cpp
@@ -188,7 +188,7 @@ void copy_d2d_dtype_format(at::Tensor& self, const at::Tensor& src, bool non_blo
     at::Tensor src_4D = FormatCastHelper::ApplyBaseFormatTensorBy(src);
     at::Tensor dst_4D = FormatCastHelper::ApplyBaseFormatTensorBy(self);
     copy_d2d_dtype_baseformat(dst_4D, src_4D, non_blocking);
-    self.npu_format_cast_(dst_4D);
+    NPUNativeFunctions::npu_format_cast_(self, dst_4D);
     return;
   }
   copy_d2d_dtype_baseformat(self, src, non_blocking);
@@ -312,7 +312,7 @@ void copy_h2d(at::Tensor& self, const at::Tensor& src, bool non_blocking) {
   if (!FormatHelper::IsBaseFormatType(self)) {
     at::Tensor dst = OpPreparation::ApplyTensor(self);
     copy_h2d_baseformat(dst, src, non_blocking, true);
-    self.npu_format_cast_(dst);
+    NPUNativeFunctions::npu_format_cast_(self, dst);
     return;
   }
   copy_h2d_baseformat(self, src, non_blocking);
@@ -363,7 +363,7 @@ void copy_d2d_dtype(at::Tensor& self, const at::Tensor& src, bool non_blocking)
     }
     at::Tensor dst_4D = FormatCastHelper::ApplyBaseFormatTensorBy(self);
     copy_d2d_dtype_baseformat(dst_4D, src_4D, non_blocking);
-    self.npu_format_cast_(dst_4D);
+    NPUNativeFunctions::npu_format_cast_(self, dst_4D);
     return;
   }
   copy_d2d_dtype_format(self, src, non_blocking);
diff --git a/torch_npu/csrc/aten/common/FormatCastHelper.cpp b/torch_npu/csrc/aten/common/FormatCastHelper.cpp
index aa98978e8a5cba9188bf2c7fd1b8877a99a209a4..13d82c3f5f83c0720db21367ce6e089248f36b87 100644
--- a/torch_npu/csrc/aten/common/FormatCastHelper.cpp
+++ b/torch_npu/csrc/aten/common/FormatCastHelper.cpp
@@ -26,9 +26,9 @@ bool FormatCastHelper::IsSameGroupType(const at::Tensor& src, const at::Tensor&
   return FormatHelper::GetBaseFormat(src_format) == FormatHelper::GetBaseFormat(dst_format);
 }
 
-void FormatCastHelper::base_format_cast_nocheck(const at::Tensor& dst, const at::Tensor& src) {
+void FormatCastHelper::base_format_cast_nocheck(at::Tensor& dst, const at::Tensor& src) {
   dst.set_(dst.storage(), src.storage_offset(), src.sizes(), src.strides());
-  dst.copy_memory_(src, true);
+  NPUNativeFunctions::copy_memory_(dst, src, true);
 }
 
 void FormatCastHelper::format_cast_as_base_format(const at::Tensor& src, aclFormat format) {
diff --git a/torch_npu/csrc/aten/common/FormatCastHelper.h b/torch_npu/csrc/aten/common/FormatCastHelper.h
index 91e9b78182ab2c6af387d62799601ad134ac8d4c..ea2b6ab507a036c88be8e08f2fc24ae097d64270 100644
--- a/torch_npu/csrc/aten/common/FormatCastHelper.h
+++ b/torch_npu/csrc/aten/common/FormatCastHelper.h
@@ -33,7 +33,7 @@ public:
   static at::Tensor& CovertSelfToBaseFormat(at::Tensor& src);
 private:
   // help function of format_cast_between_group
-  static void base_format_cast_nocheck(const at::Tensor& dst, const at::Tensor& src);
+  static void base_format_cast_nocheck(at::Tensor& dst, const at::Tensor& src);
 }; // class FormatCastHelper
 
 } // namespace native
diff --git a/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp b/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp
index 2139cfdb4a9082ec7f2965db6e667198b466c981..c518156b0a4c281420fab2e229102037e8292ad6 100644
--- a/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp
+++ b/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp
@@ -76,7 +76,7 @@ at::Tensor NPUNativeFunctions::npu_format_cast(
   TORCH_CHECK(src.scalar_type() == at::ScalarType::Float || src.scalar_type() == at::ScalarType::Half,
       "can not cast format when src is not float32 or float16");
 
-  at::Tensor dst = at::empty_with_format(
+  at::Tensor dst = OpPreparation::ApplyTensorWithFormat(
       src_desc.base_sizes_, src.options(), acl_format);
 
   // calculate the output result of the NPU
@@ -105,7 +105,7 @@ at::Tensor& NPUNativeFunctions::npu_format_cast_(
   TORCH_CHECK(src.scalar_type() == at::ScalarType::Float || src.scalar_type() == at::ScalarType::Half,
       "can not cast format when src is not float32 or float16");
 
-  at::Tensor dst = at::empty_with_format(
+  at::Tensor dst = OpPreparation::ApplyTensorWithFormat(
       src_desc.base_sizes_, src.options(), acl_format);
 
   // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/common/NpuFastReshape.cpp b/torch_npu/csrc/aten/common/NpuFastReshape.cpp
index e81d4f0c2984afb07f9eadd7fc0854a62682820a..e7df99dc6a30f7214f612cd28fc6d6470649d040 100644
--- a/torch_npu/csrc/aten/common/NpuFastReshape.cpp
+++ b/torch_npu/csrc/aten/common/NpuFastReshape.cpp
@@ -17,6 +17,7 @@
 #include "torch_npu/csrc/framework/FormatHelper.h"
 #include "torch_npu/csrc/framework/InferFormat.h"
 #include "torch_npu/csrc/framework/StorageDescHelper.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
 
 namespace at_npu {
 namespace native {
@@ -45,7 +46,7 @@ void npu_fast_reshape_(at::Tensor& tensor) {
   // refresh matadata to input tensor
   StorageDescHelper::ReflushDescBySelf(tensor);
   auto base_format = InferFormat::GuessBaseFormat(tensor.sizes());
-  tensor.npu_format_cast_(base_format);
+  NPUNativeFunctions::npu_format_cast_(tensor, base_format);
 }
 } // namespace native
 } // namespace at_npu
diff --git a/torch_npu/csrc/aten/common/ResizeNpu.cpp b/torch_npu/csrc/aten/common/ResizeNpu.cpp
index e05736bf2e480927258b30ab7d6e4bc0ccc9a66c..35faadb6f2c7ec353be756587b10fe756f5d0e87 100644
--- a/torch_npu/csrc/aten/common/ResizeNpu.cpp
+++ b/torch_npu/csrc/aten/common/ResizeNpu.cpp
@@ -31,7 +31,7 @@ at::Tensor& NPUNativeFunctions::resize_(
   // because of resize _impl_npu_ only support at base format, so
   // no need to reflush NpuStorageDesc here.
   if (!FormatHelper::IsBaseFormatType(self)) {
-    self.npu_format_cast_(FormatHelper::GetBaseFormat(self));
+    NPUNativeFunctions::npu_format_cast_(self, FormatHelper::GetBaseFormat(self));
   }
   auto* self_ = self.unsafeGetTensorImpl();
   resize_impl_npu_(self_, size, /*strides=*/c10::nullopt);
diff --git a/torch_npu/csrc/aten/common/TensorFactories.cpp b/torch_npu/csrc/aten/common/TensorFactories.cpp
index 3d95b195a17a1aabf1cb5ce721dac7bc66f9367d..c2a9ae6eabaad42410c47e81877c2ea3e7cd2891 100644
--- a/torch_npu/csrc/aten/common/TensorFactories.cpp
+++ b/torch_npu/csrc/aten/common/TensorFactories.cpp
@@ -30,11 +30,12 @@
 #include <ATen/NamedTensorUtils.h>
 #include <c10/util/Exception.h>
 #include <c10/npu/NPUCachingAllocator.h>
+#include <ATen/record_function.h>
+
 #include "torch_npu/csrc/aten/common/ResizeNpu.h"
 #include "torch_npu/csrc/framework/StorageDescHelper.h"
 #include "torch_npu/csrc/framework/InferFormat.h"
 #include "torch_npu/csrc/aten/common/InnerNpuNativeFunction.h"
-#include <ATen/record_function.h>
 #include "torch_npu/csrc/framework/utils/OpAdapter.h"
 #include "torch_npu/csrc/aten/NPUNativeFunctions.h"
 #include "torch_npu/csrc/core/tensor_impl.h"
@@ -230,7 +231,7 @@ namespace at_npu
         {
           auto npu_format =
               self.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_;
-          result = at::empty_with_format(self.sizes(), self.options(), npu_format);
+          result = OpPreparation::ApplyTensorWithFormat(self.sizes(), self.options(), npu_format);
         }
       }
 
@@ -347,7 +348,7 @@ namespace at_npu
       options.layout(layout_opt);
       options.pinned_memory(pin_memory_opt);
       at::Tensor result =
-          at::empty_with_format(size, options, dst_format);
+          OpPreparation::ApplyTensorWithFormat(size, options, dst_format);
       if (names.has_value())
       {
         internal_set_names_inplace(result, names);
@@ -361,7 +362,7 @@ namespace at_npu
                                      int64_t dst_format)
     {
       at::Tensor result =
-          at::empty_with_format(size, options, dst_format);
+          OpPreparation::ApplyTensorWithFormat(size, options, dst_format);
       if (names.has_value())
       {
         internal_set_names_inplace(result, names);
@@ -376,7 +377,7 @@ namespace at_npu
                                           int64_t dst_format)
     {
       at::Tensor result =
-          at::empty_with_format(size, options, dst_format);
+          OpPreparation::ApplyTensorWithFormat(size, options, dst_format);
       if (names.has_value())
       {
         internal_set_names_inplace(result, names);
diff --git a/torch_npu/csrc/aten/ops/AddKernelNpu.cpp b/torch_npu/csrc/aten/ops/AddKernelNpu.cpp
index 19db800366429c9bedaad43374429a142de6d743..0e4ed1eb1a79c82f1d94c1d52093ea48043f20f5 100644
--- a/torch_npu/csrc/aten/ops/AddKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/AddKernelNpu.cpp
@@ -170,7 +170,7 @@ namespace at_npu
       else
       {
         c10::NPUStorageDesc src_desc = src.storage().unsafeGetStorageImpl()->npu_desc_;
-        at::Tensor src_new = at::empty_with_format(
+        at::Tensor src_new = OpPreparation::ApplyTensorWithFormat(
             src_desc.base_sizes_, src.options(), ACL_FORMAT_NC1HWC0);
         src_new.set_(
             src.storage(),
@@ -206,7 +206,7 @@ namespace at_npu
       auto outputSize = broadcast_ops_npu_output_size(self, other);
 
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           outputSize,
           outputTensor.options(),
           CalcuOpUtil::get_tensor_npu_format(outputTensor));
@@ -223,7 +223,7 @@ namespace at_npu
       // calculate the output size
       auto outputSize = input_same_output_size(self);
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
 
       // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/AnyKernelNpu.cpp b/torch_npu/csrc/aten/ops/AnyKernelNpu.cpp
index 7cdac57d78e7325e08861421c939f59f21fdfb80..c7a287bdab0597e452c03d96aaa8626608ab1995 100644
--- a/torch_npu/csrc/aten/ops/AnyKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/AnyKernelNpu.cpp
@@ -70,7 +70,7 @@ at::Tensor NPUNativeFunctions::any(const at::Tensor& self, int64_t dim, bool kee
   auto outputSize = reduce_ops_npu_output_size(self, dims, keepdim);
 
   // construct the output tensor of the NPU
-  at::Tensor result = at::empty_with_format(
+  at::Tensor result = OpPreparation::ApplyTensorWithFormat(
       outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
 
   // calculate the output result of the NPU  
@@ -88,7 +88,7 @@ at::Tensor NPUNativeFunctions::any(const at::Tensor& self) {
   // when self's dim = 0, convert [1] tensor and reduce it
   if (self.dim() == 0) {
       at::Tensor self_tmp = self;
-      self_tmp = at::empty_with_format(
+      self_tmp = OpPreparation::ApplyTensorWithFormat(
           {1}, 
           self.options().dtype(at::ScalarType::Float), 
           CalcuOpUtil::get_tensor_npu_format(self))
diff --git a/torch_npu/csrc/aten/ops/BitwiseAndKernelNpu.cpp b/torch_npu/csrc/aten/ops/BitwiseAndKernelNpu.cpp
index 2e5df37da07969d61ad8dec892d3d94b8b868af1..9e42f4f0ef89c51f954a15dc4c76b6681ddec1d8 100644
--- a/torch_npu/csrc/aten/ops/BitwiseAndKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/BitwiseAndKernelNpu.cpp
@@ -112,7 +112,7 @@ at::Tensor NPUNativeFunctions::bitwise_and(const at::Tensor& self, const at::Ten
   auto outputSize = broadcast_ops_npu_output_size(self, other);
 
   // construct the output at::Tensor of the NPU
-  at::Tensor result = at::empty_with_format(
+  at::Tensor result = OpPreparation::ApplyTensorWithFormat(
       outputSize,
       ref_tensor.options(),
       CalcuOpUtil::get_tensor_npu_format(ref_tensor));
@@ -128,7 +128,7 @@ at::Tensor NPUNativeFunctions::bitwise_and(const at::Tensor& self, at::Scalar ot
   auto outputSize = input_same_output_size(self);
 
   // construct the output at::Tensor of the NPU
-  at::Tensor result = at::empty_with_format(
+  at::Tensor result = OpPreparation::ApplyTensorWithFormat(
       outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
 
   // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp b/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp
index 4042b6f678cfd6ed37bb6da08fb845a4e793015c..14f62aa86baf7ed9e735dc8c58908c3d48993057 100644
--- a/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp
@@ -73,9 +73,9 @@ at::Tensor NPUNativeFunctions::bmm(const at::Tensor& self, const at::Tensor& mat
   // 检查是否指定mm输出为NCHW。待NLP模型总体策略制定后删去
   if ((self.scalar_type() == at::ScalarType::Float || self.scalar_type() == at::ScalarType::Half) &&
       !torch_npu::option::OptionsManager::CheckSwitchMMOutputEnable()) {
-    result = at::empty_with_format(outputSize, self.options(), ACL_FORMAT_FRACTAL_NZ);
+    result = OpPreparation::ApplyTensorWithFormat(outputSize, self.options(), ACL_FORMAT_FRACTAL_NZ);
   } else {
-    result = at::empty_with_format(outputSize, self.options(), ACL_FORMAT_ND);
+    result = OpPreparation::ApplyTensorWithFormat(outputSize, self.options(), ACL_FORMAT_ND);
   }
 
   // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/BmmV2KernelNpu.cpp b/torch_npu/csrc/aten/ops/BmmV2KernelNpu.cpp
index 2f696eb0ad35d92d8de4bdcbd55ebc7f70658948..f928e9773889e7d145792e04cf6368d31655eec0 100644
--- a/torch_npu/csrc/aten/ops/BmmV2KernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/BmmV2KernelNpu.cpp
@@ -83,9 +83,9 @@ at::Tensor pure_bmm_v2_npu(const at::Tensor& self, const at::Tensor& mat2, const
   at::Tensor result;
 
   if ((tensor1.scalar_type() == at::ScalarType::Half)) {
-    result = at::empty_with_format(output_size, tensor1.options(), ACL_FORMAT_FRACTAL_NZ);
+    result = OpPreparation::ApplyTensorWithFormat(output_size, tensor1.options(), ACL_FORMAT_FRACTAL_NZ);
   } else {
-    result = at::empty_with_format(output_size, tensor1.options(), ACL_FORMAT_ND);
+    result = OpPreparation::ApplyTensorWithFormat(output_size, tensor1.options(), ACL_FORMAT_ND);
   }
 
   at::Tensor contiguous_self = tensor1;
diff --git a/torch_npu/csrc/aten/ops/BroadcastKernelNpu.cpp b/torch_npu/csrc/aten/ops/BroadcastKernelNpu.cpp
index 79c6fdae296474ee93c073587111e80dfe678ef9..f63a18da30580cb73ca066bbda63dbe531ed0743 100644
--- a/torch_npu/csrc/aten/ops/BroadcastKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/BroadcastKernelNpu.cpp
@@ -53,7 +53,7 @@ namespace at_npu
         input = input.to(at::kInt);
       }
 
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           size,
           input.options(),
           CalcuOpUtil::get_tensor_npu_format(self));
diff --git a/torch_npu/csrc/aten/ops/DivKernelNpu.cpp b/torch_npu/csrc/aten/ops/DivKernelNpu.cpp
index a033c8c511ec970d4f86479c82c8449f664c9794..b7ca932113f802962720ea14fd9ccd792d0fd90a 100644
--- a/torch_npu/csrc/aten/ops/DivKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/DivKernelNpu.cpp
@@ -85,7 +85,7 @@ namespace at_npu
       auto outputSize = broadcast_ops_npu_output_size(self, other);
 
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           outputSize,
           outputTensor.options(),
           CalcuOpUtil::get_tensor_npu_format(outputTensor));
@@ -102,7 +102,7 @@ namespace at_npu
       auto outputSize = input_same_output_size(self);
 
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           outputSize,
           self.options(),
           CalcuOpUtil::get_tensor_npu_format(self));
diff --git a/torch_npu/csrc/aten/ops/EqKernelNpu.cpp b/torch_npu/csrc/aten/ops/EqKernelNpu.cpp
index 3dff36422833c16153fb761e32cb03e4d3485979..fa27bc376a46b8306ca6d9b5974dea18bfb83658 100644
--- a/torch_npu/csrc/aten/ops/EqKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/EqKernelNpu.cpp
@@ -95,7 +95,7 @@ namespace at_npu
       auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther);
 
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           outputSize,
           formatCastOfSelf.options().dtype(at::kBool),
           ACL_FORMAT_ND);
@@ -113,7 +113,7 @@ namespace at_npu
       auto outputSize = input_same_output_size(formatCastOfSelf);
 
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           outputSize,
           formatCastOfSelf.options().dtype(at::kBool),
           ACL_FORMAT_ND);
@@ -131,7 +131,7 @@ namespace at_npu
       c10::SmallVector<at::Tensor, N> outputs = {self};
       CalcuOpUtil::check_memory_over_laps(inputs, outputs);
 
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           self.sizes(),
           self.options().dtype(c10::ScalarType::Byte),
           CalcuOpUtil::get_tensor_npu_format(self));
@@ -159,7 +159,7 @@ namespace at_npu
       c10::SmallVector<at::Tensor, N> outputs = {self};
       CalcuOpUtil::check_memory_over_laps(inputs, outputs);
 
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           self.sizes(),
           self.options().dtype(c10::ScalarType::Byte),
           CalcuOpUtil::get_tensor_npu_format(self));
diff --git a/torch_npu/csrc/aten/ops/GtKernelNpu.cpp b/torch_npu/csrc/aten/ops/GtKernelNpu.cpp
index 50d4c3fbcda623feaac7431e08e81fe1d65eecb4..4b453091b6f5c74ee035654cd7fad05f34f39b40 100644
--- a/torch_npu/csrc/aten/ops/GtKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/GtKernelNpu.cpp
@@ -104,7 +104,7 @@ namespace at_npu
       auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther);
 
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           outputSize,
           formatCastOfSelf.options().dtype(at::kBool),
           ACL_FORMAT_ND);
@@ -121,7 +121,7 @@ namespace at_npu
       auto outputSize = input_same_output_size(formatCastOfSelf);
 
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           outputSize,
           formatCastOfSelf.options().dtype(at::kBool),
           ACL_FORMAT_ND);
@@ -139,7 +139,7 @@ namespace at_npu
       c10::SmallVector<at::Tensor, N> outputs = {self};
       CalcuOpUtil::check_memory_over_laps(inputs, outputs);
 
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           self.sizes(),
           self.options().dtype(at::ScalarType::Byte),
           CalcuOpUtil::get_tensor_npu_format(self));
@@ -167,7 +167,7 @@ namespace at_npu
       c10::SmallVector<at::Tensor, N> outputs = {self};
       CalcuOpUtil::check_memory_over_laps(inputs, outputs);
 
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           self.sizes(),
           self.options().dtype(at::ScalarType::Byte),
           CalcuOpUtil::get_tensor_npu_format(self));
diff --git a/torch_npu/csrc/aten/ops/IndexKernelNpu.cpp b/torch_npu/csrc/aten/ops/IndexKernelNpu.cpp
index 40bf84de52da26c8a7da34c5c59c611d46816750..1aa6ffb9dcc28898a44d8f71cd36fcd449de746e 100644
--- a/torch_npu/csrc/aten/ops/IndexKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/IndexKernelNpu.cpp
@@ -42,7 +42,7 @@ at::Tensor NPUNativeFunctions::index(const at::Tensor& self, const torch::List<c
   at::native::checkIndexTensorTypes(orig);
   // first expand BoolTensor (masks) or ByteTensor (masks) into 1 or more LongTensors
   auto indices = at::native::expandTensors(self, orig);
-  at::Tensor formatCastOfSelf = self.npu_format_cast(ACL_FORMAT_ND);
+  at::Tensor formatCastOfSelf = NPUNativeFunctions::npu_format_cast(self, ACL_FORMAT_ND);
 
   // calculate the output size
   auto outputSize = index_npu_output_size(formatCastOfSelf, indices);
diff --git a/torch_npu/csrc/aten/ops/LtKernelNpu.cpp b/torch_npu/csrc/aten/ops/LtKernelNpu.cpp
index 29cf56e55ed6ad90cd240060a2c227ed5c5206d4..1503d75a52a3ba2f884e22953b50587b59b62cc4 100644
--- a/torch_npu/csrc/aten/ops/LtKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/LtKernelNpu.cpp
@@ -100,7 +100,7 @@ namespace at_npu
       auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther);
 
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithSizes(
           outputSize,
           formatCastOfSelf.options().dtype(at::kBool));
 
@@ -116,7 +116,7 @@ namespace at_npu
       auto outputSize = input_same_output_size(formatCastOfSelf);
 
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithSizes(
           outputSize,
           formatCastOfSelf.options().dtype(at::kBool));
 
@@ -133,7 +133,7 @@ namespace at_npu
       c10::SmallVector<at::Tensor, N> outputs = {self};
       CalcuOpUtil::check_memory_over_laps(inputs, outputs);
 
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           self.sizes(),
           self.options().dtype(at::ScalarType::Byte),
           CalcuOpUtil::get_tensor_npu_format(self));
@@ -161,7 +161,7 @@ namespace at_npu
       c10::SmallVector<at::Tensor, N> outputs = {self};
       CalcuOpUtil::check_memory_over_laps(inputs, outputs);
 
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           self.sizes(),
           self.options().dtype(at::ScalarType::Byte),
           CalcuOpUtil::get_tensor_npu_format(self));
diff --git a/torch_npu/csrc/aten/ops/MeanKernelNpu.cpp b/torch_npu/csrc/aten/ops/MeanKernelNpu.cpp
index 0f6e2b6875ee4064b3a992cc064e0dfda2769ca1..8ca851a36d906b60afa54388b61d5f3f23b12361 100644
--- a/torch_npu/csrc/aten/ops/MeanKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/MeanKernelNpu.cpp
@@ -142,7 +142,7 @@ namespace at_npu
       }
 
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           outputSize, self.options().dtype(dstType), npu_format);
 
       // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/MmKernelNpu.cpp b/torch_npu/csrc/aten/ops/MmKernelNpu.cpp
index ceba5c8a22cb635ddaf98f7495126365b1053941..05bd74b0e0a6dad3866d2e6722ef97a670a98c22 100644
--- a/torch_npu/csrc/aten/ops/MmKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/MmKernelNpu.cpp
@@ -183,12 +183,12 @@ Return:
 
       if ((self.scalar_type() == at::ScalarType::Half) && !torch_npu::option::OptionsManager::CheckSwitchMMOutputEnable())
       {
-        result = at::empty_with_format(
+        result = OpPreparation::ApplyTensorWithFormat(
             outputSize, self.options(), ACL_FORMAT_FRACTAL_NZ);
       }
       else
       {
-        result = at::empty_with_format(outputSize, self.options());
+        result = OpPreparation::ApplyTensorWithSizes(outputSize, self.options());
       }
 
       // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/MulKernelNpu.cpp b/torch_npu/csrc/aten/ops/MulKernelNpu.cpp
index 1fa1b7afc88814b1ab3962ab716b90c39286f11b..d9982ffc566a6f2268e9e0e66e227de818f418cc 100644
--- a/torch_npu/csrc/aten/ops/MulKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/MulKernelNpu.cpp
@@ -113,7 +113,7 @@ namespace at_npu
       auto outputSize = broadcast_ops_npu_output_size(selfCast, otherCast);
 
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           outputSize,
           outputTensor.options(),
           CalcuOpUtil::get_tensor_npu_format(outputTensor));
@@ -135,7 +135,7 @@ namespace at_npu
       auto outputSize = input_same_output_size(self);
 
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
 
       // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/NegKernelNpu.cpp b/torch_npu/csrc/aten/ops/NegKernelNpu.cpp
index a04ed1ff90608d7c0ae23bacfb604c2c75dc8832..a3ae15a640ec2c1c35718fb430a30856f0e73c97 100644
--- a/torch_npu/csrc/aten/ops/NegKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/NegKernelNpu.cpp
@@ -50,7 +50,7 @@ namespace at_npu
     at::Tensor NPUNativeFunctions::neg(const at::Tensor &self)
     {
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           self.sizes(), self.options(), CalcuOpUtil::get_tensor_npu_format(self));
 
       // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/NormalKernelNpu.cpp b/torch_npu/csrc/aten/ops/NormalKernelNpu.cpp
index 5135cd4a2763ce37a4afea750afd524b3a5d13e7..e8bfa33690ab8d2d577f13fe3d666744900431ea 100644
--- a/torch_npu/csrc/aten/ops/NormalKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/NormalKernelNpu.cpp
@@ -118,7 +118,7 @@ namespace at_npu
       TORCH_CHECK(std > 0.0, "normal_ expects std > 0.0, but found std=", std);
 
       // the op of PTNormalFloatFloat only support format of ND
-      at::Tensor formatCastOfResult = result.npu_format_cast(ACL_FORMAT_ND);
+      at::Tensor formatCastOfResult = NPUNativeFunctions::npu_format_cast(result, ACL_FORMAT_ND);
       if (formatCastOfResult.scalar_type() == at::ScalarType::Half)
       {
         formatCastOfResult = formatCastOfResult.to(at::ScalarType::Float);
@@ -182,7 +182,7 @@ namespace at_npu
         c10::optional<bool> pin_memory_opt)
     {
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = NPUNativeFunctions::empty_with_format(
           size, dtype_opt, layout_opt, device_opt, pin_memory_opt, ACL_FORMAT_ND);
 
       // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp b/torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp
index 12dd1e754252ada59d1571cca520ab0b9df90c0d..4e65798b226b6925be7fe900c2ee3a1e49684f97 100644
--- a/torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp
@@ -45,12 +45,9 @@ namespace at_npu
 
       auto outputSize = input_same_output_size(self);
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(outputSize,
-                                                dtype_opt,
-                                                layout_opt,
-                                                device_opt,
-                                                pin_memory_opt,
-                                                CalcuOpUtil::get_tensor_npu_format(self));
+      at::Tensor result = NPUNativeFunctions::empty_with_format(
+          outputSize, dtype_opt, layout_opt, device_opt, pin_memory_opt,
+          CalcuOpUtil::get_tensor_npu_format(self));
       // calculate the output result of the NPUc
       return NPUNativeFunctions::one_(result);
     }
diff --git a/torch_npu/csrc/aten/ops/ReluKernelNpu.cpp b/torch_npu/csrc/aten/ops/ReluKernelNpu.cpp
index d8ab5517335c7aac344a6d9ca4cd4a71b1de63c5..1f4331abc0efe4773755e5f861a85e635e37bed0 100644
--- a/torch_npu/csrc/aten/ops/ReluKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/ReluKernelNpu.cpp
@@ -17,7 +17,7 @@
 #include "torch_npu/csrc/framework/utils/CalcuOpUtil.h"
 #include "torch_npu/csrc/framework/utils/KernelNpuOutputSize.h"
 #include "torch_npu/csrc/framework/utils/NpuUtils.h"
-
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
 #include "torch_npu/csrc/aten/NPUNativeFunctions.h"
 
 namespace at_npu
@@ -65,7 +65,7 @@ namespace at_npu
       auto outputSize = input_same_output_size(self);
 
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
 
       // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/SoftmaxBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/SoftmaxBackwardKernelNpu.cpp
index cdbd32be1de5ee336f678a64a1b6daa3f6e2bac5..bb0050fe1889f881f826a3db80e3c867b22d019e 100644
--- a/torch_npu/csrc/aten/ops/SoftmaxBackwardKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/SoftmaxBackwardKernelNpu.cpp
@@ -52,17 +52,18 @@ namespace at_npu
       // calculate the output size
       auto outputSize = input_same_output_size(grad_output);
 
+      at::Tensor tmp_output = output;
       // output'format must be same with grad_output
-      if (CalcuOpUtil::get_tensor_npu_format(output) != CalcuOpUtil::get_tensor_npu_format(grad_output))
+      if (CalcuOpUtil::get_tensor_npu_format(tmp_output) != CalcuOpUtil::get_tensor_npu_format(grad_output))
       {
-        output.npu_format_cast_(CalcuOpUtil::get_tensor_npu_format(grad_output));
+        NPUNativeFunctions::npu_format_cast_(tmp_output, CalcuOpUtil::get_tensor_npu_format(grad_output));
       }
 
       // construct the output tensor of the NPU
       at::Tensor grad_input = OpPreparation::ApplyTensor(grad_output, outputSize);
 
       // calculate the output result of the NPU
-      softmax_backward_out_npu(grad_input, grad_output, output, dim, self);
+      softmax_backward_out_npu(grad_input, grad_output, tmp_output, dim, self);
 
       return grad_input;
     }
diff --git a/torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp b/torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp
index 79783b2f4c40f7ab1738b3072dbdf08ad7c80c2d..eb85dfefab763a445d32b5e4012eb1132e09a056 100644
--- a/torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp
@@ -17,6 +17,7 @@
 #include "torch_npu/csrc/framework/utils/CalcuOpUtil.h"
 #include "torch_npu/csrc/framework/utils/KernelNpuOutputSize.h"
 #include "torch_npu/csrc/framework/utils/NpuUtils.h"
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
 #include "torch_npu/csrc/aten/NPUNativeFunctions.h"
 
 namespace at_npu
@@ -119,7 +120,7 @@ namespace at_npu
       outputSize[1] = c1_len.toInt() * 16;
 
       // construct the output at::Tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
 
       // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/SubKernelNpu.cpp b/torch_npu/csrc/aten/ops/SubKernelNpu.cpp
index fe90315466cfe4bf1a0fe6050284792c398428b1..33ac4018eee507020e5312ed0c371dc560abcd54 100644
--- a/torch_npu/csrc/aten/ops/SubKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/SubKernelNpu.cpp
@@ -103,7 +103,7 @@ namespace at_npu
       auto outputSize = broadcast_ops_npu_output_size(self, other);
 
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           outputSize,
           outputTensor.options(),
           CalcuOpUtil::get_tensor_npu_format(outputTensor));
@@ -120,7 +120,7 @@ namespace at_npu
       auto outputSize = input_same_output_size(self);
 
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
 
       // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/SumKernelNpu.cpp b/torch_npu/csrc/aten/ops/SumKernelNpu.cpp
index 48129f2a6bfac787148be744e6f2ce122996c487..23819bc687ef79e30b65d23f2a2768fc16c79ef2 100644
--- a/torch_npu/csrc/aten/ops/SumKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/SumKernelNpu.cpp
@@ -197,7 +197,7 @@ namespace at_npu
       }
 
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           outputSize, self.options().dtype(dstType), npu_format);
 
       // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp
index 537bcc2444c8b21eefcecaa7d24ab37585913db4..0bc0118a9cddbc4e1d656c361a11f6b39f61d26a 100644
--- a/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp
@@ -62,7 +62,7 @@ namespace at_npu
       auto outputSize = input_same_output_size(self);
 
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(
           outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
 
       // use 5HD in Relu
diff --git a/torch_npu/csrc/aten/ops/TopKKernelNpu.cpp b/torch_npu/csrc/aten/ops/TopKKernelNpu.cpp
index 0bddf27eae72b027412ec0c6ddc55ca0ba331fc5..9a4bd8fb5a805ebef6c7fb4ab5a6f8ac7c38c73a 100644
--- a/torch_npu/csrc/aten/ops/TopKKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/TopKKernelNpu.cpp
@@ -219,11 +219,11 @@ namespace at_npu
         // construct the output tensor of the NPU
         at::Tensor transposeSelf = NPUNativeFunctions::npu_transpose(self, perm);
         auto outputSize = transpose_npu_output_size(values, perm);
-        at::Tensor transposeValue = at::empty_with_format(
+        at::Tensor transposeValue = OpPreparation::ApplyTensorWithFormat(
             outputSize,
             values.options(),
             CalcuOpUtil::get_tensor_npu_format(values));
-        at::Tensor transposeIndices = at::empty_with_format(
+        at::Tensor transposeIndices = OpPreparation::ApplyTensorWithFormat(
             outputSize,
             indices.options(),
             CalcuOpUtil::get_tensor_npu_format(indices));
@@ -290,9 +290,9 @@ namespace at_npu
       // calculate the output size
       auto outputSize = topk_npu_output_size(selfCp, k, dim, largest, sorted);
       // construct the output tensor of the NPU
-      at::Tensor values = at::empty_with_format(
+      at::Tensor values = OpPreparation::ApplyTensorWithFormat(
           outputSize, selfCp.options(), CalcuOpUtil::get_tensor_npu_format(selfCp));
-      at::Tensor indices = at::empty_with_format(
+      at::Tensor indices = OpPreparation::ApplyTensorWithFormat(
           outputSize, selfCp.options().dtype(at::kInt), ACL_FORMAT_ND);
 
       // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/TransposeKernelNpu.cpp b/torch_npu/csrc/aten/ops/TransposeKernelNpu.cpp
index fcda11d5bcadb84f9162c30806080bb9ccdcf6e0..40631abc0b1c999cecc9d9966e68e4f7e2d03e7f 100644
--- a/torch_npu/csrc/aten/ops/TransposeKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/TransposeKernelNpu.cpp
@@ -76,7 +76,7 @@ namespace at_npu
     {
       RECORD_FUNCTION("transpose_to_contiguous", vector<c10::IValue>({self}));
       int64_t self_format = CalcuOpUtil::get_tensor_npu_format(self);
-      at::Tensor result = at::empty_with_format(self.sizes(), self.options(), self_format);
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(self.sizes(), self.options(), self_format);
 
       // obtain the transpose axises
       at::IntArrayRef dim;
diff --git a/torch_npu/csrc/aten/ops/WhereKernelNpu.cpp b/torch_npu/csrc/aten/ops/WhereKernelNpu.cpp
index edb450c9cf678115cb48c900dd012aca93eab0c8..8976e4f1b1fd2ded5347957255f95473a90b0f85 100644
--- a/torch_npu/csrc/aten/ops/WhereKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/WhereKernelNpu.cpp
@@ -92,7 +92,7 @@ vector<at::Tensor> NPUNativeFunctions::where(const at::Tensor& condition) {
   at::Tensor formatCastOfCondition = condition;
   if (condition.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_ !=
     ACL_FORMAT_ND) {
-    formatCastOfCondition = formatCastOfCondition.npu_format_cast(ACL_FORMAT_ND);
+    formatCastOfCondition = NPUNativeFunctions::npu_format_cast(formatCastOfCondition, ACL_FORMAT_ND);
   }
   if (condition.scalar_type() == at::ScalarType::Half) {
     formatCastOfCondition = NPUNativeFunctions::npu_dtype_cast(formatCastOfCondition, at::ScalarType::Float);
diff --git a/torch_npu/csrc/aten/ops/ZerosLikeKernelNpu.cpp b/torch_npu/csrc/aten/ops/ZerosLikeKernelNpu.cpp
index 18ec3038d102a4f379fb87876c0fe3efb276eb34..8147622a0130525911f788b0f8b54693eff8bd4b 100644
--- a/torch_npu/csrc/aten/ops/ZerosLikeKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/ZerosLikeKernelNpu.cpp
@@ -56,12 +56,9 @@ namespace at_npu
 
       auto outputSize = input_same_output_size(self);
       // construct the output tensor of the NPU
-      at::Tensor result = at::empty_with_format(outputSize,
-                                                dtype_opt,
-                                                layout_opt,
-                                                device_opt,
-                                                pin_memory_opt,
-                                                CalcuOpUtil::get_tensor_npu_format(self));
+      at::Tensor result = NPUNativeFunctions::empty_with_format(
+          outputSize, dtype_opt, layout_opt, device_opt, pin_memory_opt,
+          CalcuOpUtil::get_tensor_npu_format(self));
 
       // calculate the output result of the NPU
       return result.zero_();
diff --git a/torch_npu/csrc/aten/ops/convolution/Conv2dBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/convolution/Conv2dBackwardKernelNpu.cpp
index dcddfa5d6563c703d906e7f2ab266e8d149514a1..b93bf1710d966dc1d75b85212a1d014c16453d09 100644
--- a/torch_npu/csrc/aten/ops/convolution/Conv2dBackwardKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/convolution/Conv2dBackwardKernelNpu.cpp
@@ -213,7 +213,7 @@ tuple<at::Tensor, at::Tensor, at::Tensor> NPUNativeFunctions::npu_conv2d_backwar
   at::Tensor gradBias;
   // construct the output tensor of the NPU
   if (grad_input_mask[0]) {
-    gradInput = at::empty_with_format(
+    gradInput = OpPreparation::ApplyTensorWithFormat(
         std::get<0>(outputSizes), input.options(), ACL_FORMAT_NC1HWC0);
   }
 
@@ -221,12 +221,12 @@ tuple<at::Tensor, at::Tensor, at::Tensor> NPUNativeFunctions::npu_conv2d_backwar
     // For group conv2d: keep consistent with weight to avoid allreduce accuracy problem.
     // For more info: https://gitee.com/ascend/pytorch-develop/pulls/2255
     if (groups > 1) {
-      gradWeight = at::empty_with_format(
+      gradWeight = OpPreparation::ApplyTensorWithFormat(
           std::get<1>(outputSizes),
           weight.options().dtype(at::kFloat),
           ACL_FORMAT_NCHW);      
     } else {
-      gradWeight = at::empty_with_format(
+      gradWeight = OpPreparation::ApplyTensorWithFormat(
           std::get<1>(outputSizes),
           weight.options().dtype(at::kFloat),
           ACL_FORMAT_FRACTAL_Z);      
@@ -234,7 +234,7 @@ tuple<at::Tensor, at::Tensor, at::Tensor> NPUNativeFunctions::npu_conv2d_backwar
   }
 
   if (grad_input_mask[2]) {
-    gradBias = at::empty_with_format(
+    gradBias = OpPreparation::ApplyTensorWithFormat(
         std::get<2>(outputSizes), grad.options(), ACL_FORMAT_NCHW);
   }
 
diff --git a/torch_npu/csrc/aten/ops/convolution/Conv3dBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/convolution/Conv3dBackwardKernelNpu.cpp
index 48be4e0d619f87479fc2cd5db928eecc28d318e8..b38ef864a05aa48868d63da4c0b493f60a390bcb 100644
--- a/torch_npu/csrc/aten/ops/convolution/Conv3dBackwardKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/convolution/Conv3dBackwardKernelNpu.cpp
@@ -107,7 +107,7 @@ tuple<at::Tensor, at::Tensor, at::Tensor> NPUNativeFunctions::npu_conv3d_backwar
 
   if (grad_input_mask[0]) {
     // format should be NDC1HWC0
-    gradInput = at::empty_with_format(
+    gradInput = OpPreparation::ApplyTensorWithFormat(
         input.sizes(), input.options(), ACL_FORMAT_NDC1HWC0);
 
     conv3d_backward_inputmask(
@@ -116,7 +116,7 @@ tuple<at::Tensor, at::Tensor, at::Tensor> NPUNativeFunctions::npu_conv3d_backwar
 
   if (grad_input_mask[1]) {
     // format should be FRACTAL_Z_3D
-    gradWeight = at::empty_with_format(
+    gradWeight = OpPreparation::ApplyTensorWithFormat(
         weight.sizes(), weight.options().dtype(at::kFloat), ACL_FRACTAL_Z_3D);
 
     conv3d_backward_weightmask(
@@ -125,7 +125,7 @@ tuple<at::Tensor, at::Tensor, at::Tensor> NPUNativeFunctions::npu_conv3d_backwar
 
   if (grad_input_mask[2]) {
     // format should be NCHW, gradias.size = grad.size(1)
-    gradBias = at::empty_with_format(
+    gradBias = OpPreparation::ApplyTensorWithFormat(
         {grad.size(1)}, grad.options(), ACL_FORMAT_NCHW);
 
     conv3d_backward_biasmask(
diff --git a/torch_npu/csrc/aten/ops/convolution/ConvTranspose2dKernelNpu.cpp b/torch_npu/csrc/aten/ops/convolution/ConvTranspose2dKernelNpu.cpp
index c6e2f7837824c6f25e3ff5c9372afdc45a341c45..a0a32368fbba8a9a2229796107397f571337738c 100644
--- a/torch_npu/csrc/aten/ops/convolution/ConvTranspose2dKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/convolution/ConvTranspose2dKernelNpu.cpp
@@ -81,7 +81,7 @@ at::Tensor NPUNativeFunctions::npu_conv_transpose2d(
 
   // construct the output tensor of the NPU
   at::Tensor result =
-      at::empty_with_format(outputSize, input.options(), ACL_FORMAT_NC1HWC0);
+      OpPreparation::ApplyTensorWithFormat(outputSize, input.options(), ACL_FORMAT_NC1HWC0);
 
   // calculate the output result of the NPU
   conv_transpose2d_out_npu(
diff --git a/torch_npu/csrc/aten/ops/loss/NLLLossBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/loss/NLLLossBackwardKernelNpu.cpp
index 5e31d05824c3434d08d08f941c9f0abca4580b0f..bf6da61f93fab158c5d09f567bc3522924ee740d 100644
--- a/torch_npu/csrc/aten/ops/loss/NLLLossBackwardKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/loss/NLLLossBackwardKernelNpu.cpp
@@ -91,7 +91,7 @@ at::Tensor NPUNativeFunctions::nll_loss_backward(
   auto outputSize = input_same_output_size(self);
 
   // construct the output tensor of the NPU
-  at::Tensor grad_input = at::empty_with_format(
+  at::Tensor grad_input = OpPreparation::ApplyTensorWithFormat(
       outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
 
   // calculate the output result of the NPU
diff --git a/torch_npu/csrc/aten/ops/loss/NLLLossKernelNpu.cpp b/torch_npu/csrc/aten/ops/loss/NLLLossKernelNpu.cpp
index cfda03265f6e418a944ee01a093096009ec1c637..f274745813203aa15166bcfcaca0519c9170ea57 100644
--- a/torch_npu/csrc/aten/ops/loss/NLLLossKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/loss/NLLLossKernelNpu.cpp
@@ -96,11 +96,11 @@ tuple<at::Tensor, at::Tensor> NPUNativeFunctions::nll_loss_forward(
       outputSize, totalWeightSize);
 
   // construct the output tensor of the NPU
-  at::Tensor result = at::empty_with_format(
+  at::Tensor result = OpPreparation::ApplyTensorWithFormat(
       std::get<0>(outputSizes),
       self.options(),
       CalcuOpUtil::get_tensor_npu_format(self));
-  at::Tensor total_weight = at::empty_with_format(
+  at::Tensor total_weight = OpPreparation::ApplyTensorWithFormat(
       std::get<1>(outputSizes),
       self.options(),
       CalcuOpUtil::get_tensor_npu_format(self));
diff --git a/torch_npu/csrc/distributed/Init.cpp b/torch_npu/csrc/distributed/Init.cpp
index 951ba5960741a71fe6c276abb7ac28d62e3273a3..e8717648c5ce728b8125839a61240e31588e3ac2 100644
--- a/torch_npu/csrc/distributed/Init.cpp
+++ b/torch_npu/csrc/distributed/Init.cpp
@@ -33,6 +33,7 @@
 #include "torch_npu/csrc/distributed/ProcessGroupHCCL.hpp"
 #include "torch_npu/csrc/distributed/Init.h"
 #include "torch_npu/csrc/distributed/reducer.hpp"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
 
 
 namespace torch_npu {
@@ -48,7 +49,7 @@ class BroadcastWork {
 public:
   inline std::vector<at::Tensor> cast_tensors(at::TensorList tensors) {
     static auto cast_back_to_ori_format = [](const at::Tensor &t) { 
-      return t.npu_format_cast(t.storage().unsafeGetStorageImpl()->npu_desc_.origin_format_); 
+      return at_npu::native::NPUNativeFunctions::npu_format_cast(t, t.storage().unsafeGetStorageImpl()->npu_desc_.origin_format_); 
       };
     return c10::fmap(tensors, cast_back_to_ori_format);
   }
diff --git a/torch_npu/csrc/distributed/reducer.cpp b/torch_npu/csrc/distributed/reducer.cpp
index b31bd008d0deb0b45af5c866b1d7ccc279a85e16..81f7f049682ad718d3c2902348c4246e573925f8 100644
--- a/torch_npu/csrc/distributed/reducer.cpp
+++ b/torch_npu/csrc/distributed/reducer.cpp
@@ -31,6 +31,8 @@
 #include <torch/csrc/utils/memory.h>
 
 #include "torch_npu/csrc/distributed/reducer.hpp"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+#include "torch_npu/csrc/framework/utils/OpPreparation.h"
 
 namespace c10d_npu {
 namespace {
@@ -406,9 +408,9 @@ void Reducer::copy_grad_to_bucket(
   if (comm_hook_ == nullptr) {
     // imitates wrapped_scalar_tensor in ATen/native/BinaryOps.cpp
     // Divides while copying into the bucket view.
-    bucket_view.copy_memory_(grad.mul(float(1.) / divFactor_), true);
+    at_npu::native::NPUNativeFunctions::copy_memory_(bucket_view, grad.mul(float(1.) / divFactor_), true);
   } else {
-    bucket_view.copy_memory_(grad, true);
+    at_npu::native::NPUNativeFunctions::copy_memory_(bucket_view, grad, true);
   }
 }
 
@@ -441,7 +443,7 @@ void Reducer::mark_variable_ready_dense(VariableIndex index) {
         // make sure grad has the same format as variable
         if (grad.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_ !=
               variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_) {
-          grad = grad.npu_format_cast(
+          grad = at_npu::native::NPUNativeFunctions::npu_format_cast(grad,
               variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_);
         }
         this->copy_grad_to_bucket(grad, bucket_view);
@@ -1072,12 +1074,12 @@ void Reducer::copy_bucket_to_grad(
       if (!grad.defined()) {
         // Creates grad according to the "Gradient Layout Contract"
         // (see torch/csrc/grad/AccumulateGrad.h)
-        grad = at::empty_with_format(variable.sizes(),
-                                     bucket_view.options(),
-                                     variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_);
-        grad.copy_memory_(bucket_view, true);
+        grad = at_npu::native::OpPreparation::ApplyTensorWithFormat(
+            variable.sizes(), bucket_view.options(),
+            variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_);
+        at_npu::native::NPUNativeFunctions::copy_memory_(grad, bucket_view, true);
       } else {
-        grad.copy_memory_(bucket_view, true);
+        at_npu::native::NPUNativeFunctions::copy_memory_(grad, bucket_view, true);
       }
       // The grad is modified and needs to be written back.
       return true;
diff --git a/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp b/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp
index 246adc7ec10c2bd5c7dd00fe8b8f307483988780..08a0e1a6cad6205ac307eebfbb9f073c67370d61 100644
--- a/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp
+++ b/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
 
 #include "torch_npu/csrc/framework/contiguous/ContiguousOpt.h"
+#include "torch_npu/csrc/framework/utils/OpPreparation.h"
 
 namespace at_npu
 {
@@ -109,7 +110,7 @@ namespace at_npu
         const at::Tensor &src,
         const std::vector<string> &optimizations)
     {
-      auto self = at::empty_with_format(
+      auto self = OpPreparation::ApplyTensorWithFormat(
           src.sizes(),
           src.options(),
           src.storage().get_npu_desc().npu_format_);
diff --git a/torch_npu/csrc/framework/contiguous/combined_opt.cpp b/torch_npu/csrc/framework/contiguous/combined_opt.cpp
index a0fa15f3cb95fbc3efc8e710dd42a8208371c2da..1ce04a827f2e1b4d2830b779b89bfe2ea251490e 100644
--- a/torch_npu/csrc/framework/contiguous/combined_opt.cpp
+++ b/torch_npu/csrc/framework/contiguous/combined_opt.cpp
@@ -16,8 +16,10 @@
 #include <map>
 #include <ATen/quantized/QTensorImpl.h>
 #include <ATen/NamedTensorUtils.h>
+
 #include "torch_npu/csrc/framework/contiguous/ContiguousOpt.h"
 #include "torch_npu/csrc/framework/utils/KernelNpuOutputSize.h"
+#include "torch_npu/csrc/framework/utils/OpPreparation.h"
 
 namespace at_npu
 {
@@ -498,7 +500,7 @@ namespace at_npu
           {
             // case 2: The first tensor is discontiguous-type,
             // conduct the standard optimization procedure.
-            auto contiguous_src = at::empty_with_format(
+            auto contiguous_src = OpPreparation::ApplyTensorWithFormat(
                 src.sizes(),
                 src.options(),
                 src.storage().get_npu_desc().npu_format_);
diff --git a/torch_npu/csrc/framework/utils/NpuUtils.cpp b/torch_npu/csrc/framework/utils/NpuUtils.cpp
index a0efe852f78d13a31e6efcdce65a6014b6015c1a..07bda7c6a64e1f05af970139288a49047f68f9da 100644
--- a/torch_npu/csrc/framework/utils/NpuUtils.cpp
+++ b/torch_npu/csrc/framework/utils/NpuUtils.cpp
@@ -16,7 +16,6 @@
 
 #include <mutex>
 #include <set>
-#include <c10/npu/register/OptionRegister.h>
 
 #include "torch_npu/csrc/framework/utils/NpuUtils.h"
 #include "torch_npu/csrc/framework/utils/CalcuOpUtil.h"
@@ -25,6 +24,8 @@
 #include "torch_npu/csrc/framework/utils/KernelNpuOutputSize.h"
 #include "torch_npu/csrc/framework/contiguous/ContiguousOpt.h"
 #include "torch_npu/csrc/framework/interface/EnvVariables.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+#include "torch_npu/csrc/framework/utils/OpPreparation.h"
 
 namespace at_npu
 {
@@ -165,7 +166,7 @@ namespace at_npu
       // 3. get output size
       auto outputSize = index_select_npu_output_size(src_tmp, dim, index);
       int64_t npu_format = CalcuOpUtil::get_tensor_npu_format(src_tmp);
-      at::Tensor result = at::empty_with_format(outputSize, src_tmp.options(), npu_format);
+      at::Tensor result = OpPreparation::ApplyTensorWithFormat(outputSize, src_tmp.options(), npu_format);
       // std::cout << "npu_format: " << npu_format << std::endl;
 
       // 4. get input and output
@@ -208,7 +209,7 @@ namespace at_npu
     at::Tensor deal_with_5d_5d_match(const at::Tensor &src)
     {
       auto src_desc = src.storage().unsafeGetStorageImpl()->npu_desc_;
-      at::Tensor src_new = at::empty_with_format(src_desc.base_sizes_, src.options(), ACL_FORMAT_NC1HWC0);
+      at::Tensor src_new = OpPreparation::ApplyTensorWithFormat(src_desc.base_sizes_, src.options(), ACL_FORMAT_NC1HWC0);
       c10::npu::NPUStream copy_stream = c10::npu::getCurrentNPUStream();
       int64_t numel = src_new.numel();
       aclError error = aclrtMemcpyAsync(
@@ -252,7 +253,7 @@ namespace at_npu
       // a temporary tensor, which always monopolizes its own storage.
       if (numelEq && (!FormatHelper::IsBaseFormatType(src)))
       {
-        at::Tensor tempTensor = at::npu_format_cast(src, FormatHelper::GetBaseFormat(src));
+        at::Tensor tempTensor = NPUNativeFunctions::npu_format_cast(src, FormatHelper::GetBaseFormat(src));
         auto &temp_desc =
             tempTensor.storage().unsafeGetStorageImpl()->npu_desc_;
         temp_desc.base_sizes_ = tempTensor.sizes();
diff --git a/torch_npu/csrc/framework/utils/OpPreparation.cpp b/torch_npu/csrc/framework/utils/OpPreparation.cpp
index bf70fb7277814d6e5f2f29a89e4f2383aeb9033b..3726a9765a675c7e0b5765ba1ae043302a7e4d6b 100644
--- a/torch_npu/csrc/framework/utils/OpPreparation.cpp
+++ b/torch_npu/csrc/framework/utils/OpPreparation.cpp
@@ -17,6 +17,8 @@
 #include "torch_npu/csrc/framework/FormatHelper.h"
 #include "torch_npu/csrc/framework/InferFormat.h"
 #include "torch_npu/csrc/framework/utils/CalcuOpUtil.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+
 
 namespace at_npu
 {
@@ -176,13 +178,13 @@ namespace at_npu
         if (output.scalar_type() == at::ScalarType::Float || output.scalar_type() == at::ScalarType::Half)
         {
           TORCH_CHECK(!is_read_write, "can not cast format when output is input");
-          output.npu_format_cast_(format);
+          NPUNativeFunctions::npu_format_cast_(output, format);
         }
         else
         {
           TORCH_CHECK(FormatHelper::IsBaseFormatType(output) && FormatHelper::IsBaseFormatType(static_cast<aclFormat>(format)),
                       "can not cast format to un-base format when output has bool dtype");
-          output.npu_format_cast_(format);
+          NPUNativeFunctions::npu_format_cast_(output, format);
         }
       }
     }
@@ -197,7 +199,7 @@ namespace at_npu
     at::Tensor &OpPreparation::CastBackToOriFormat(at::Tensor &tensor)
     {
       auto &tensor_desc = tensor.storage().unsafeGetStorageImpl()->npu_desc_;
-      tensor.npu_format_cast_(tensor_desc.origin_format_);
+      NPUNativeFunctions::npu_format_cast_(tensor, tensor_desc.origin_format_);
       return tensor;
     }
 
@@ -234,13 +236,17 @@ namespace at_npu
     at::Tensor OpPreparation::ApplyTensorWithFormat(c10::IntArrayRef sizes, const c10::TensorOptions &options, int64_t format)
     {
       auto fixFormat = InferFormat::GuessStorageFormat(sizes, (aclFormat)format);
-      return at::empty_with_format(sizes, options, fixFormat);
+      return NPUNativeFunctions::empty_with_format(
+          sizes, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(),
+          options.device_opt(), options.pinned_memory_opt(), fixFormat);
     }
 
     at::Tensor OpPreparation::ApplyTensorWithSizes(c10::IntArrayRef sizes, const c10::TensorOptions &options)
     {
       auto format = InferFormat::GuessBaseFormat(sizes);
-      return at::empty_with_format(sizes, options, format);
+      return NPUNativeFunctions::empty_with_format(
+          sizes, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(),
+          options.device_opt(), options.pinned_memory_opt(), format);
     }
 
     void OpPreparation::CheckMemory(const std::initializer_list<at::Tensor> &inputs, const std::initializer_list<at::Tensor> &outputs)
diff --git a/torch_npu/testing/util_test.py b/torch_npu/testing/util_test.py
index a460af4ae332f2fe2a229946dfc58da3c3e0483e..835814c30d45ef5933293b04c3ac18da6f353d59 100644
--- a/torch_npu/testing/util_test.py
+++ b/torch_npu/testing/util_test.py
@@ -15,6 +15,7 @@
 # limitations under the License.
 
 import torch
+import torch_npu
 import numpy as np
 import os
 
@@ -42,7 +43,7 @@ def create_common_tensor(item, minValue, maxValue):
     cpu_input = torch.from_numpy(input1)
     npu_input = torch.from_numpy(input1).to(npu_device)
     if npu_format != -1:
-        npu_input = npu_input.npu_format_cast(npu_format)
+        npu_input = torch_npu.npu_format_cast(npu_input, npu_format)
     return cpu_input, npu_input
 
 
@@ -125,5 +126,5 @@ def create_dtype_tensor(shape, dtype, npu_format=-1, min_value=-5, max_value=5,
     cpu_input = torch.from_numpy(x)
     npu_input = torch.from_numpy(x).to(npu_device)
     if npu_format != -1 and (dtype in [torch.float, torch.half]):
-        npu_input = npu_input.npu_format_cast(npu_format)
+        npu_input = torch_npu.npu_format_cast(npu_input, npu_format)
     return cpu_input, npu_input
\ No newline at end of file
diff --git a/torch_npu/utils/__init__.py b/torch_npu/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..092f9cebfba63364dd18297f48981e47e747999a
--- /dev/null
+++ b/torch_npu/utils/__init__.py
@@ -0,0 +1,30 @@
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .module import LayerNorm, apply_module_patch
+
+
+def _get_monkey_patches():
+    """Return ``[name, replacement]`` pairs, each pairing a dotted name with ``LayerNorm``."""
+    layer_norm_names = (
+        "nn.modules.normalization.LayerNorm",
+        "nn.modules.LayerNorm",
+        "nn.LayerNorm",
+    )
+    return [[name, LayerNorm] for name in layer_norm_names]
+
+
+# Built once, when this package is imported.
+nn_monkey_patches = _get_monkey_patches()
diff --git a/torch_npu/utils/module.py b/torch_npu/utils/module.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d275ef923ecd8f8a108a888964304c72abf1aa3
--- /dev/null
+++ b/torch_npu/utils/module.py
@@ -0,0 +1,158 @@
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+import torch
+import torch_npu
+
+
+# Format codes passed to torch_npu.npu_format_cast by cast_weight.
+_ACL_FORMAT_NC1HWC0 = 3
+_ACL_FORMAT_FRACTAL_Z = 4
+_ACL_FORMAT_FRACTAL_NZ = 29
+_ACL_FRACTAL_Z_3D = 33
+
+
+def npu(self, device=None):
+    r"""Moves all model parameters and buffers to the npu.
+
+    This also makes associated parameters and buffers different objects. So
+    it should be called before constructing optimizer if the module will
+    live on npu while being optimized.
+
+    Arguments:
+        device (int, optional): if specified, all parameters will be
+            copied to that device
+
+    Returns:
+        Module: self
+    """
+    if device is None:
+        device = torch.device("npu")
+    if torch_npu.npu.is_available():
+        # cast_weight runs first, under no_grad, before the generic _apply move.
+        with torch.no_grad():
+            self.cast_weight(device)
+    return self._apply(lambda t: t.npu(device))
+
+
+def to(self, *args, **kwargs):
+    r"""Variant of ``Module.to`` that runs ``cast_weight`` before converting.
+
+    The arguments are parsed by ``torch._C._nn._parse_to``. When an NPU is
+    available, ``cast_weight`` is called with the parsed device, and then
+    every parameter and buffer is converted through ``self._apply``.
+
+    Returns:
+        The result of ``self._apply``.
+
+    Raises:
+        TypeError: if a dtype is given that is neither floating point nor complex.
+    """
+    device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
+
+    if dtype is not None:
+        if not (dtype.is_floating_point or dtype.is_complex):
+            raise TypeError('nn.Module.to only accepts floating point or complex '
+                            'dtypes, but got desired dtype={}'.format(dtype))
+        if dtype.is_complex:
+            warnings.warn(
+                "Complex modules are a new feature under active development whose design may change, "
+                "and some modules might not work as expected when using complex tensors as parameters or buffers. "
+                "Please file an issue at https://github.com/pytorch/pytorch/issues/new?template=bug-report.md "
+                "if a complex module does not work as expected.")
+    if torch_npu.npu.is_available():
+        with torch.no_grad():
+            self.cast_weight(device)
+
+    def convert(t):
+        if convert_to_format is not None and t.dim() == 4:
+            return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None,
+                        non_blocking, memory_format=convert_to_format)
+        return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None, non_blocking)
+
+    return self._apply(convert)
+
+
+def _cast_data(tensor, device, npu_format):
+    """Move ``tensor.data`` to ``device`` and cast it to ``npu_format`` in place."""
+    tensor.data = torch_npu.npu_format_cast(tensor.data.to(device), npu_format)
+
+
+def cast_weight(self, device):
+    """Recursively move selected weights to ``device`` in NPU storage formats.
+
+    Linear, BatchNorm1d/2d, Conv2d, Conv3d and MultiheadAttention modules get
+    their weights cast; every child module is visited. Nothing happens when
+    ``device`` is None or its string form does not contain "npu".
+    """
+
+    def _format_cast(module, class_name):
+        if issubclass(class_name, torch.nn.Linear):
+            _cast_data(module.weight, device, _ACL_FORMAT_FRACTAL_NZ)
+        if issubclass(class_name, (torch.nn.BatchNorm2d, torch.nn.BatchNorm1d)):
+            if module.affine:
+                _cast_data(module.weight, device, _ACL_FORMAT_NC1HWC0)
+                _cast_data(module.bias, device, _ACL_FORMAT_NC1HWC0)
+            # The running statistics are None when track_running_stats=False.
+            if module.running_mean is not None:
+                _cast_data(module.running_mean, device, _ACL_FORMAT_NC1HWC0)
+            if module.running_var is not None:
+                _cast_data(module.running_var, device, _ACL_FORMAT_NC1HWC0)
+        if issubclass(class_name, torch.nn.Conv2d):
+            if (module.in_channels == module.groups and module.groups > 1
+                    and module.weight.size(0) % module.in_channels == 0):
+                # in_channels == groups (> 1): the weight is left as it is.
+                return
+            _cast_data(module.weight, device, _ACL_FORMAT_FRACTAL_Z)
+        if issubclass(class_name, torch.nn.Conv3d):
+            # The cast runs on a half copy; the result is converted back to float.
+            weight = module.weight.data.to(device)
+            module.weight.data = torch_npu.npu_format_cast(weight.half(), _ACL_FRACTAL_Z_3D).float()
+        if "MultiheadAttention" in str(class_name):
+            proj_weights = [getattr(module, name, None)
+                            for name in ("q_proj_weight", "k_proj_weight", "v_proj_weight")]
+            # Test against None: the truth value of a multi-element tensor raises RuntimeError.
+            if all(weight is not None for weight in proj_weights):
+                for weight in proj_weights:
+                    _cast_data(weight, device, _ACL_FORMAT_FRACTAL_NZ)
+
+    if device is None or "npu" not in str(device):
+        return
+
+    _format_cast(self, self.__class__)
+
+    # children() yields nothing for leaf modules, so no emptiness check is needed.
+    for sub_module in self.children():
+        if isinstance(sub_module, torch.nn.Module):
+            sub_module.cast_weight(device)
+
+
+def apply_module_patch():
+    """Install ``npu``, ``to`` and ``cast_weight`` as methods of ``torch.nn.Module``."""
+    torch.nn.Module.npu = npu
+    torch.nn.Module.to = to
+    torch.nn.Module.cast_weight = cast_weight
+
+
+class LayerNorm(torch.nn.LayerNorm):
+    """LayerNorm that calls ``torch_npu.npu_layer_norm_eval`` outside training mode."""
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        if self.training:
+            return torch.nn.functional.layer_norm(
+                input, self.normalized_shape, self.weight, self.bias, self.eps)
+        else:
+            return torch_npu.npu_layer_norm_eval(input, self.normalized_shape, self.weight, self.bias, self.eps)
diff --git a/torch_npu/version.py b/torch_npu/version.py
index 32fb6cf7bf30cc67a5bf6bef7b2f2d33d6901856..8790208db40d2c1d0007a4131bba6e688595fd6e 100644
--- a/torch_npu/version.py
+++ b/torch_npu/version.py
@@ -1 +1,16 @@
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 __version__ = "1.8.1rc1"