diff --git a/torch_npu/csrc/InitNpuBindings.cpp b/torch_npu/csrc/InitNpuBindings.cpp index c34eac8c074ca08ae4c8ec710abad70923e9e6ea..b57fa93b780cad2a8642da4eff93b93dcd206eab 100644 --- a/torch_npu/csrc/InitNpuBindings.cpp +++ b/torch_npu/csrc/InitNpuBindings.cpp @@ -18,6 +18,7 @@ #include #include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" +#include "torch_npu/csrc/framework/graph/execute/GraphExecutor.h" #include #include @@ -52,6 +53,7 @@ PyObject * THPModule_npu_shutdown(PyObject * /* unused */) // all of op tasks completed before device memory free. if (c10::npu::NpuSysCtrl::GetInstance().GetInitFlag()) { c10::npu::npuSynchronizeDevice(); + at_npu::native::GraphExecutor::GetInstance().Finalize(); THNPUCachingHostAllocator_emptyCache(); c10_npu::NPUCachingAllocator::emptyCache(); c10::npu::NpuSysCtrl::SysStatus status = c10::npu::NpuSysCtrl::GetInstance().Finalize(); diff --git a/torch_npu/csrc/aten/common/CopyKernel.cpp b/torch_npu/csrc/aten/common/CopyKernel.cpp index 4d6f9fe35ddbdb0ec30d8417055fae30de6432ee..576dfd5358ac9427e7be127a9a6f117e7115370f 100644 --- a/torch_npu/csrc/aten/common/CopyKernel.cpp +++ b/torch_npu/csrc/aten/common/CopyKernel.cpp @@ -23,6 +23,7 @@ #include "torch_npu/csrc/framework/FormatHelper.h" #include "torch_npu/csrc/framework/StorageDescHelper.h" #include "torch_npu/csrc/framework/utils/OpTemplate.h" +#include "torch_npu/csrc/framework/graph/util/GraphModeGuard.h" #include "torch_npu/csrc/aten/common/FormatCastHelper.h" #include "torch_npu/csrc/aten/common/InnerNpuNativeFunction.h" #include "torch_npu/csrc/framework/allocator/THNPUCachingHostAllocator.h" @@ -326,6 +327,7 @@ at::Tensor& NPUNativeFunctions::copy_(at::Tensor& self, const at::Tensor& src, b } } else { if (src.is_npu()) { + GraphModeGuard mode_guard(c10_npu::ModeKind::SINGLE_OP_MODE); copy_d2h(self, src, non_blocking); } } diff --git a/torch_npu/csrc/aten/common/FormatCastHelper.cpp b/torch_npu/csrc/aten/common/FormatCastHelper.cpp index 13d82c3f5f83c0720db21367ce6e089248f36b87..3934fc6cd597437ccd2d7944fc3085a5cbad50cd 100644 --- a/torch_npu/csrc/aten/common/FormatCastHelper.cpp +++ b/torch_npu/csrc/aten/common/FormatCastHelper.cpp @@ -16,6 +16,7 @@ #include "torch_npu/csrc/framework/FormatHelper.h" #include "torch_npu/csrc/aten/common/FormatCastHelper.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" +#include "torch_npu/csrc/core/npu/NPURunMode.h" namespace at_npu { namespace native { @@ -56,7 +57,26 @@ bool FormatCastHelper::format_cast_between_group(at::Tensor& dst, const at::Tens auto src_base_format = FormatHelper::GetBaseFormat(src); format_cast_as_base_format(src, FormatHelper::GetBaseFormat(dst)); // prepare: convert src to dst base format format_cast_inside_group(dst, src); // src base format (src format) -> dst base format - format_cast_as_base_format(src, src_base_format); // recover: dst base format -> dst format + + // NB + // In graph mode: + // a = torch.empty([2,3]).npu() + // a.npu_format_cast(nc1hwc0); + // a.npu_format_cast(nz); + // torch.npu.launch_graph() + // the base format of a changes ND -> NCHW -> ND. + // When the graph is run, FE gets the task ND/ND -> NCHW/NC1HWC0, which will fail, + // so the check below makes the base format change ND -> NCHW -> NCHW instead; + // FE then gets the tasks NCHW/NCHW -> NCHW/NC1HWC0 and NCHW/NCHW -> NCHW/NZ. + if (c10_npu::NpuRunMode::IsGraphMode() && src_base_format == ACL_FORMAT_ND) { + return true; + } + // recover: dst base format -> dst format + format_cast_as_base_format(src, src_base_format); return true; } } else { diff 
--git a/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp b/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp index 8d006e1055fe526c3dca2e807e8458fca4b68c68..a547c4a3033c4c50877c1f16181a4d1f61f5a5f1 100644 --- a/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp +++ b/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp @@ -17,7 +17,7 @@ #include #include #include - +#include "torch_npu/csrc/framework/graph/util/GraphModeGuard.h" #include "third_party/acl/inc/acl/acl_base.h" #include "third_party/acl/inc/acl/acl_rt.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" @@ -33,6 +33,7 @@ c10::Scalar NPUNativeFunctions::_local_scalar_dense(const at::Tensor& self) { self.scalar_type(), "_local_scalar_dense_npu", [&] { + GraphModeGuard mode_guard(c10_npu::ModeKind::SINGLE_OP_MODE); scalar_t value = 0; c10::npu::NPUStream copy_stream = c10::npu::getCurrentNPUStream(); aclError error = aclrtMemcpyAsync( diff --git a/torch_npu/csrc/aten/common/TensorFactories.cpp b/torch_npu/csrc/aten/common/TensorFactories.cpp index 26821c4f374d02a475e74338eba6009876f678bf..98d9adbfc13135c284737c65374773158981a610 100644 --- a/torch_npu/csrc/aten/common/TensorFactories.cpp +++ b/torch_npu/csrc/aten/common/TensorFactories.cpp @@ -29,6 +29,7 @@ #include #include #include +#include #include #include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" @@ -97,6 +98,15 @@ namespace at_npu true); auto tensor = at::detail::make_tensor(storage_impl, dtype); + + // NB + // Store a weak intrusive ptr of the storage impl in both graph mode and single op mode, + // because we need to reach all live tensors in the context when the run mode changes. + // We want to track every storage without affecting its life cycle, + // so that in graph mode we can get the storage of all live tensors. + c10::npu::graph::NpuGraphContextManager::GetInstance().AddOutputStorage( + storage_impl); + // Default at::TensorImpl has size [0] if (size.size() != 1 || size[0] != 0) { @@ -278,7 +288,12 @@ namespace at_npu aclFormat format = InferFormat::GuessStorageFormat(size, (aclFormat)dst_format); int64_t nelements = StorageDescHelper::GetMemorySize(size, format); auto dtype = c10::scalarTypeToTypeMeta(dtype_or_default(dtype_opt)); - int64_t size_bytes = nelements * dtype.itemsize(); + + // In graph mode, empty with format is used to create inner tensors, + // and ASCEND-GE takes charge of their memory + int64_t size_bytes = + c10_npu::NpuRunMode::IsGraphMode() ? 0 : nelements * dtype.itemsize(); + + auto storage_impl = c10::make_intrusive( c10::StorageImpl::use_byte_size_t(), size_bytes, @@ -287,6 +302,12 @@ namespace at_npu true); auto tensor = at::detail::make_tensor(storage_impl, dtype); + + // NB Store a weak intrusive ptr of the storage impl in graph mode + // see note above + c10::npu::graph::NpuGraphContextManager::GetInstance().AddOutputStorage( + storage_impl); + // Default NPUTensorImpl has size [0] if (size.size() != 1 || size[0] != 0) { @@ -312,7 +333,11 @@ namespace at_npu aclFormat format = InferFormat::GuessStorageFormat(size, (aclFormat)dst_format); int64_t nelements = StorageDescHelper::GetMemorySize(size, format); auto dtype = options.dtype(); - int64_t size_bytes = nelements * dtype.itemsize(); + // In graph mode, empty with format is used to create inner tensors, + // and ASCEND-GE takes charge of their memory + auto size_bytes = + c10_npu::NpuRunMode::IsGraphMode() ? 
0 : nelements * dtype.itemsize(); + auto storage_impl = c10::make_intrusive( c10::StorageImpl::use_byte_size_t(), size_bytes, @@ -321,6 +346,12 @@ namespace at_npu true); auto tensor = at::detail::make_tensor(storage_impl, dtype); + + // NB Store weak intrusive ptr of storage impl in graph mode + // see note above + c10::npu::graph::NpuGraphContextManager::GetInstance().AddOutputStorage( + storage_impl); + // Default at::TensorImpl has size [0] if (size.size() != 1 || size[0] != 0) { diff --git a/torch_npu/csrc/aten/mirror/NPUMemoryOverlap.cpp b/torch_npu/csrc/aten/mirror/NPUMemoryOverlap.cpp index ac699de78356a8674da5a4595070295cdc233827..152e69c1386afc9f15e0eab47e6d7624b42584a3 100644 --- a/torch_npu/csrc/aten/mirror/NPUMemoryOverlap.cpp +++ b/torch_npu/csrc/aten/mirror/NPUMemoryOverlap.cpp @@ -31,6 +31,10 @@ MemOverlap has_internal_overlap(at::TensorImpl* t) { return MemOverlap::NO; } + if (t->storage().data() == nullptr) { + return MemOverlap::IS_NULL; + } + auto strides = t->strides(); auto sizes = t->sizes(); for (size_t i = 0; i < strides.size(); ++i) { @@ -59,6 +63,9 @@ MemOverlapStatus get_overlap_status(const at::Tensor& a, const at::Tensor& b) { MemOverlapStatus get_overlap_status(at::TensorImpl* a, at::TensorImpl* b) { if (a == b) return MemOverlapStatus::FULL; + if (a->storage().data() == nullptr || b->storage().data() == nullptr) { + return MemOverlapStatus::IS_NULL; + } if (a->numel() == 0 || b->numel() == 0) { return MemOverlapStatus::NO; } diff --git a/torch_npu/csrc/aten/mirror/NPUMemoryOverlap.h b/torch_npu/csrc/aten/mirror/NPUMemoryOverlap.h index 93c57359f2b86e1c82fa3358e97e0138df321749..dd655d49f9c97fb2b744640288b662683a527f95 100644 --- a/torch_npu/csrc/aten/mirror/NPUMemoryOverlap.h +++ b/torch_npu/csrc/aten/mirror/NPUMemoryOverlap.h @@ -25,11 +25,12 @@ namespace at_npu { namespace native { // // NO: Absolutely no memory overlap // YES: Absolutely yes memory overlap -// TOO_HARD: There might be memory overlap, but it was too expensive to compute. +// TOO_HARD: There might be memory overlap, but it was too expensive to compute +// IS_NULL: In npu graph mode, some tensors have no device ptr. // // NB: Please update the python test for these if you renumber them. 
-enum class MemOverlap { NO, YES, TOO_HARD }; -enum class MemOverlapStatus { FULL, PARTIAL, NO, TOO_HARD }; +enum class MemOverlap { NO, YES, TOO_HARD, IS_NULL }; +enum class MemOverlapStatus { FULL, PARTIAL, NO, TOO_HARD, IS_NULL }; MemOverlap has_internal_overlap(const at::Tensor& t); MemOverlap has_internal_overlap(at::TensorImpl* t); diff --git a/torch_npu/csrc/aten/ops/pooling/AdaptiveMaxPool2dBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/pooling/AdaptiveMaxPool2dBackwardKernelNpu.cpp index 75775ef0b4a04d41327fa5cba5441efb3c5903de..5ba51d951e6de1fc47a184752b8ab28bc5adf1e5 100644 --- a/torch_npu/csrc/aten/ops/pooling/AdaptiveMaxPool2dBackwardKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/pooling/AdaptiveMaxPool2dBackwardKernelNpu.cpp @@ -59,7 +59,7 @@ at::Tensor& NPUNativeFunctions::adaptive_max_pool2d_backward_out( cmd.Name("MaxPoolGradWithArgmaxV1") .Input(self) .Input(grad_output) - .Input(indices, "", "uint16") + .Input(indices, "", c10::nullopt, "uint16") .Output(grad_input) .Attr("ksize", kernelSize) .Attr("strides", stridesSize) diff --git a/torch_npu/csrc/aten/ops/pooling/AdaptiveMaxPool2dKernelNpu.cpp b/torch_npu/csrc/aten/ops/pooling/AdaptiveMaxPool2dKernelNpu.cpp index 3c5b9ba028811b36e82ef7fa9dd43ff9e6102be7..64bd6cd9e3e6cdf6bed6091605c461d2ad4d6300 100644 --- a/torch_npu/csrc/aten/ops/pooling/AdaptiveMaxPool2dKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/pooling/AdaptiveMaxPool2dKernelNpu.cpp @@ -57,7 +57,7 @@ tuple NPUNativeFunctions::adaptive_max_pool2d_out( cmd.Name("MaxPoolWithArgmaxV1") .Input(self) .Output(output) - .Output(indices,"uint16") + .Output(indices, "", c10::nullopt, "uint16") .Attr("ksize", kernelSize) .Attr("strides", stridesSize) .Attr("pads", paddings) diff --git a/torch_npu/csrc/aten/ops/pooling/MaxPool2dWithIndicesBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/pooling/MaxPool2dWithIndicesBackwardKernelNpu.cpp index 6916db1ba8a8c363943e82581f13ca8b406c83d1..1e3b7dbc1ed7d34a3feef085f50cc628890fd29a 100644 --- a/torch_npu/csrc/aten/ops/pooling/MaxPool2dWithIndicesBackwardKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/pooling/MaxPool2dWithIndicesBackwardKernelNpu.cpp @@ -52,7 +52,7 @@ at::Tensor& NPUNativeFunctions::max_pool2d_with_indices_backward_out( cmd.Name("MaxPoolGradWithArgmaxV1") .Input(self) .Input(grad_output) - .Input(indices, "", "uint16") + .Input(indices, "", c10::nullopt, "uint16") .Output(grad_input) .Attr("ksize", kernelSize) .Attr("strides", stridesSize) diff --git a/torch_npu/csrc/aten/ops/pooling/MaxPool2dWithIndicesKernelNpu.cpp b/torch_npu/csrc/aten/ops/pooling/MaxPool2dWithIndicesKernelNpu.cpp index 8aa842bc37e2050ff986320a85c920db8bc2196e..6be2e061bd7107bf954eeb65b9f1beec38552ee9 100644 --- a/torch_npu/csrc/aten/ops/pooling/MaxPool2dWithIndicesKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/pooling/MaxPool2dWithIndicesKernelNpu.cpp @@ -51,7 +51,7 @@ tuple NPUNativeFunctions::max_pool2d_with_indices_out( cmd.Name("MaxPoolWithArgmaxV1") .Input(self) .Output(output) - .Output(indices, "uint16") + .Output(indices, "", c10::nullopt, "uint16") .Attr("ksize", kernelSize) .Attr("strides", stridesSize) .Attr("pads", paddings) diff --git a/torch_npu/csrc/core/npu/NPURunMode.cpp b/torch_npu/csrc/core/npu/NPURunMode.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6b1d6154981a030c63f6021eaf9ad1410259a5cd --- /dev/null +++ b/torch_npu/csrc/core/npu/NPURunMode.cpp @@ -0,0 +1,33 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "NPURunMode.h" + +namespace c10_npu { +ModeKind NpuRunMode::cur_mode_ = ModeKind::DEFAULT_MODE; + +void NpuRunMode::SetNpuRunMode(const ModeKind &mode) { + cur_mode_ = mode; + return; +} + +ModeKind NpuRunMode::CurRunMode() { + return cur_mode_; +} + +bool NpuRunMode::IsGraphMode() { + return cur_mode_ == ModeKind::GRAPH_MODE; +} +} // namespace c10_npu \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPURunMode.h b/torch_npu/csrc/core/npu/NPURunMode.h new file mode 100644 index 0000000000000000000000000000000000000000..dc21a1dcb8276600f0a1e2ced237e8a0bf7530da --- /dev/null +++ b/torch_npu/csrc/core/npu/NPURunMode.h @@ -0,0 +1,38 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include + +namespace c10_npu { +enum class ModeKind : uint8_t { + DEFAULT_MODE = 0, + SINGLE_OP_MODE = DEFAULT_MODE, + GRAPH_MODE, +}; + +class TORCH_API NpuRunMode { +public: + static void SetNpuRunMode(const ModeKind& mode); + static ModeKind CurRunMode(); + static bool IsGraphMode(); + +private: + static ModeKind cur_mode_; +}; +} // namespace c10_npu \ No newline at end of file diff --git a/torch_npu/csrc/framework/OpCommandBase.h b/torch_npu/csrc/framework/OpCommandBase.h index bc7f4368f4c49be793183d209494aa9bb4752b2a..f0a873f9c20142d9e569c52f2f3dd9a6644d032c 100644 --- a/torch_npu/csrc/framework/OpCommandBase.h +++ b/torch_npu/csrc/framework/OpCommandBase.h @@ -22,289 +22,337 @@ #include "torch_npu/csrc/framework/OpCmdHelper.h" #include "torch_npu/csrc/framework/OpParamMaker.h" #include "torch_npu/csrc/framework/utils/NpuUtils.h" +#include "torch_npu/csrc/framework/FormatHelper.h" +#include "torch_npu/csrc/core/npu/NPURunMode.h" #include "torch_npu/csrc/framework/allocator/THNPUCachingHostAllocator.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" +#include "torch_npu/csrc/framework/graph/construct/GraphConstructor.h" -namespace at_npu -{ - namespace native - { +#define IF_GRAPH_MODE_THEN_RUN(...) \ + do { \ + if (c10_npu::NpuRunMode::IsGraphMode()) { \ + __VA_ARGS__; \ + } \ + } while (false); - // get common dtype and shape from op adapter layer - struct UnifiedResult - { - c10::optional common_type = c10::nullopt; - c10::optional common_shape = c10::nullopt; - // judge result tensor's dtype is defined or not. 
- // if result's dtype is defined, result_type_defined is true and result's dtype remains unchanged. - bool result_type_defined = false; - }; +#define IF_GRAPH_MODE_THEN_RUN_WITH_RET_THIS(...) \ + do { \ + if (c10_npu::NpuRunMode::IsGraphMode()) { \ + __VA_ARGS__; \ + return static_cast(*this); \ + } \ + } while (false); - template - class OpCommandBase - { - public: - explicit OpCommandBase() - { - aclCmds = OpCommandImpls::GetInstance(); - aclCmds->Push(aclCmd); - } - virtual ~OpCommandBase() {} +namespace at_npu { +namespace native { - Derived &Name(string name) - { - aclCmd->SetName(name); - return static_cast(*this); - } +// get common dtype and shape from op adapter layer +struct UnifiedResult { + c10::optional common_type = c10::nullopt; + c10::optional common_shape = c10::nullopt; + // judge result tensor's dtype is defined or not. + // if result's dtype is defined, result_type_defined is true and result's dtype remains unchanged. + bool result_type_defined = false; +}; - Derived &Expect(UnifiedResult unified_result) - { - commonType = unified_result.common_type; - resultTypeDefined = unified_result.result_type_defined; - commonShape = unified_result.common_shape; - return static_cast(*this); - } +template +class OpCommandBase { +public: + OpCommandBase() { + IF_GRAPH_MODE_THEN_RUN(return;) + aclCmds = OpCommandImpls::GetInstance(); + aclCmds->Push(aclCmd); + } + virtual ~OpCommandBase() {} - template - Derived &Attr(string name, dataType value) - { - aclCmd->AddAttr(name, value); - return static_cast(*this); - } + OpCommandBase(const OpCommandBase &other) = delete; + OpCommandBase(OpCommandBase &&other) = delete; + OpCommandBase &operator=(const OpCommandBase &) = delete; + OpCommandBase &operator=(OpCommandBase &&) = delete; - Derived &InputWithoutContiguous( - const at::Tensor &input, - const string &descName = "", - const string &realData = "") - { - return AddTensorInput(const_cast(input), at::ScalarType::Undefined, descName, realData); - } + Derived &Name(const string &name) { + IF_GRAPH_MODE_THEN_RUN_WITH_RET_THIS(graphCmd.SetName(name);) + aclCmd->SetName(name); + return static_cast(*this); + } - Derived &Input() - { - return AddNoneTensor(); - } + Derived &DynamicInputReg( + c10::npu::graph::DynamicInputRegFunc func, + c10::npu::graph::DyNumAndIndex num_and_index) { + IF_GRAPH_MODE_THEN_RUN( + graphCmd.AddDynamicInputRegFunc(func, num_and_index);) + return static_cast(*this); + } - Derived &Input( - const at::Tensor &input, - const string &descName = "", - const string &realData = "") - { - return AddTensorInput(Contiguous(input), at::ScalarType::Undefined, descName, realData); - } + Derived &Expect(UnifiedResult unified_result) { + commonType = unified_result.common_type; + resultTypeDefined = unified_result.result_type_defined; + commonShape = unified_result.common_shape; + return static_cast(*this); + } - Derived &Input( - const at::Tensor &cpuTensor, - c10::SmallVector dimList, - const string &descName = "") - { - at::Tensor npuTensor = CopyHostToDevice(cpuTensor); - aclCmd->AddConst(dimList); - return AddTensorInput(npuTensor, at::ScalarType::Undefined, descName, "", cpuTensor); - } + template + Derived &Attr(const string &name, dataType value) { + IF_GRAPH_MODE_THEN_RUN_WITH_RET_THIS( + graphCmd.AddAttr(name, value); + ) + aclCmd->AddAttr(name, value); + return static_cast(*this); + } - Derived &Input(c10::SmallVector &dimList, - at::ScalarType toType = at::kLong) - { + Derived &Input() { + IF_GRAPH_MODE_THEN_RUN_WITH_RET_THIS( + graphCmd.AddInput(); + ) + return 
AddNoneTensor(); + } - at::Tensor &cpuTensor = CreateHostTensor((void *)dimList.data(), - dimList.size(), - c10::TensorOptions(at::kCPU).dtype(at::kLong), - toType); - return AddHostTensorInput(cpuTensor); - } + Derived &Input( + const at::Tensor &input, + const string &descName = "", + const c10::optional &sensitive_format = c10::nullopt, + const string &realData = "") { + IF_GRAPH_MODE_THEN_RUN_WITH_RET_THIS( + auto contiguous_input = Contiguous(input); + if (commonType.has_value() && + commonType.value() != contiguous_input.scalar_type()) { + contiguous_input = NPUNativeFunctions::npu_dtype_cast(contiguous_input, commonType.value()); + } + graphCmd.AddInput(contiguous_input, descName, realData, sensitive_format); + ) + return AddTensorInput( + Contiguous(input), c10::ScalarType::Undefined, descName, realData); + } - Derived &Input(c10::IntArrayRef &dimListRef, - at::ScalarType toType = at::kLong) - { + Derived &InputWithoutContiguousGeneral( + const at::Tensor &input, + const string &descName = "", + const c10::optional &sensitive_format = c10::nullopt, + const string &realData = "") { + return AddTensorInput(const_cast(input), c10::ScalarType::Undefined, descName, realData); + } - at::Tensor &cpuTensor = CreateHostTensor((void *)dimListRef.data(), - dimListRef.size(), - c10::TensorOptions(at::kCPU).dtype(at::kLong), - toType); - return AddHostTensorInput(cpuTensor); - } + Derived &InputWithoutContiguous(const at::Tensor &input, + const string &descName = "", + const string &realData = "") { + IF_GRAPH_MODE_THEN_RUN_WITH_RET_THIS( + graphCmd.AddInput(input, descName, realData); + ) + if (input.storage_offset() != 0) { + NPU_LOGW( + "[Check][offset] Check input storage_offset[%ld] = 0 failed, result is untrustworthy", + input.storage_offset()); + } + return AddTensorInput(const_cast(input)); + } - Derived &Input(const c10::Scalar &input, const at::ScalarType type, - CompileType compileType = CompileType::MEMORY_DEVICE_COMPILE) - { - if ((compileType == MEMORY_DEVICE_COMPILE) && - (torch_npu::option::OptionsManager::CheckScalarToHostMemEnable())) { - compileType = MEMORY_HOST_COMPILE_INDEPENDENT; - } - if (compileType == CompileType::MEMORY_DEVICE_COMPILE) - { - return AddScalarInput(input, type); - } - else - { - auto scalarTensor = CreateScalarTensor(input, type); - return AddHostTensorInput(scalarTensor, compileType); - } - } + Derived &Input( + const at::Tensor &cpuTensor, + c10::SmallVector dimList, + const string &descName = "") { + IF_GRAPH_MODE_THEN_RUN_WITH_RET_THIS( + graphCmd.AddInput(dimList, cpuTensor.scalar_type()); + ) + at::Tensor npuTensor = CopyHostToDevice(cpuTensor); + aclCmd->AddConst(dimList); + return AddTensorInput(npuTensor, at::ScalarType::Undefined, descName, "", cpuTensor); + } - Derived &Output(at::Tensor &output, const string &realType = "") - { - return AddOutput(output, realType); - } + Derived &Input(c10::SmallVector &dimList, + at::ScalarType toType = at::kLong) { + IF_GRAPH_MODE_THEN_RUN_WITH_RET_THIS( + graphCmd.AddInput(dimList, toType); + ) + at::Tensor &cpuTensor = CreateHostTensor((void *) dimList.data(), + dimList.size(), + c10::TensorOptions(at::kCPU).dtype(at::kLong), + toType); + return AddHostTensorInput(cpuTensor); + } - void Run() - { - if (torch_npu::option::OptionsManager::CheckQueueEnable()) - { - ExecuteParas params; - aclCmd->ExportParams(params); - c10::npu::enCurrentNPUStream(¶ms); - aclCmd->releaseSource(false); - } - else - { - aclCmd->Run(); - aclCmd->releaseSource(); - } - aclCmds->Pop(); - } + Derived &Input(c10::IntArrayRef 
&dimListRef, + at::ScalarType toType = at::kLong) { + IF_GRAPH_MODE_THEN_RUN_WITH_RET_THIS( + graphCmd.AddInput(dimListRef, toType); + ) + at::Tensor &cpuTensor = CreateHostTensor((void *) dimListRef.data(), + dimListRef.size(), + c10::TensorOptions(at::kCPU).dtype(at::kLong), + toType); + return AddHostTensorInput(cpuTensor); + } - protected: - Derived &AddTensorInput(at::Tensor &tensor, - at::ScalarType forceScaleType = at::ScalarType::Undefined, - const string &descName = "", const string &realData = "", - c10::optional cpu_tensor = c10::nullopt) - { - std::tuple res; - if (commonType.has_value() && commonType.value() != tensor.scalar_type()) - { - tensor = NPUNativeFunctions::npu_dtype_cast(tensor, commonType.value()); - } - // 针对dim=0的场景,绝对不会有输入为uint16的情况,因为这个是TBE引入的,TBE没有dim=0的情况 - if (tensor.dim() == 0) - { - if (tensor.is_npu()) - { - res = OpCmdHelper::CovertNPUTensorWithZeroDimToAclInput(tensor, descName); - } - else - { - res = OpCmdHelper::CovertTensorWithZeroDimToAclInput(tensor, forceScaleType); - } - } - else - { - res = OpCmdHelper::CovertTensorToAclInput(tensor, cpu_tensor, descName, realData); - } - aclCmd->AddInput( - std::get<0>(res), std::get<1>(res), std::get<2>(res), std::get<3>(res)); - return static_cast(*this); - } - Derived &AddHostTensorInput(const at::Tensor &tensor, - CompileType compileType = CompileType::MEMORY_HOST_COMPILE_DEPENDENT) - { - std::tuple res; - res = OpCmdHelper::CovertHostTensorToAclInput(tensor, tensor.scalar_type(), compileType); - aclCmd->AddInput( - std::get<0>(res), std::get<1>(res), std::get<2>(res), std::get<3>(res), tensor); - return static_cast(*this); - } - Derived &AddNoneTensor() - { - AclTensorDescMaker desc; - auto aclDesc = desc.Create(ACL_DT_UNDEFINED, ACL_FORMAT_UNDEFINED).Get(); - AclTensorBufferMaker buffer(nullptr, 0); - aclCmd->AddInput(aclDesc, buffer.Get(), 0, ACL_FORMAT_UNDEFINED); - return static_cast(*this); - } - Derived &AddScalarInput(const c10::Scalar &input, - at::ScalarType type) - { - at::ScalarType type_bk = type; - if (commonType.has_value()) - { - type_bk = commonType.value(); + Derived &Input(const c10::Scalar &input, const at::ScalarType type, + CompileType compileType = CompileType::MEMORY_DEVICE_COMPILE) { + if ((compileType == MEMORY_DEVICE_COMPILE) && + (torch_npu::option::OptionsManager::CheckScalarToHostMemEnable())) { + compileType = MEMORY_HOST_COMPILE_INDEPENDENT; + } + IF_GRAPH_MODE_THEN_RUN_WITH_RET_THIS( + auto true_type = commonType.has_value() ? 
commonType.value() : type; + graphCmd.AddInput(input, true_type, compileType); + ) + if (compileType == CompileType::MEMORY_DEVICE_COMPILE) { + return AddScalarInput(input, type); + } else { + auto scalarTensor = CreateScalarTensor(input, type); + return AddHostTensorInput(scalarTensor, compileType); + } + } + + Derived &Output( + at::Tensor &output, + const string &descName = "", + const c10::optional &sensitive_format = c10::nullopt, + const string &realType = "") { + IF_GRAPH_MODE_THEN_RUN_WITH_RET_THIS( + if (sensitive_format.has_value() && + FormatHelper::GetBaseFormat(output) != sensitive_format.value()) { + output = NPUNativeFunctions::npu_format_cast(output, sensitive_format.value()); } - at::Tensor aclInput = CopyHostToDevice(input, type_bk); - auto res = OpCmdHelper::CovertScalarToAclInput(aclInput, type_bk); - aclCmd->AddInput( - std::get<0>(res), std::get<1>(res), std::get<2>(res), std::get<3>(res)); - return static_cast(*this); - } - Derived &AddOutput(at::Tensor &output, const string &realType = "") - { - if (resultTypeDefined == false && commonType.has_value() && commonType.value() != output.scalar_type()) - { + graphCmd.AddOutput(output, descName, realType, sensitive_format); + if (!resultTypeDefined && commonType.has_value() && + output.scalar_type() != commonType.value()) { output = NPUNativeFunctions::npu_dtype_cast(output, commonType.value()); } + ) + return AddOutput(output, realType); + } - const at::Tensor *tensor = &output; - auto res = OpCmdHelper::CovertToAclOutput(tensor, realType); - aclCmd->AddOutput( - std::get<0>(res), std::get<1>(res), std::get<2>(res), std::get<3>(res)); - return static_cast(*this); - } - protected: - // format_contiguous may create a new Tensor; it is kept in this object so it stays alive for the whole life cycle - // Likewise below, CopyScalarToDevice has the same problem - at::Tensor &Contiguous(const at::Tensor &input) - { - storage.emplace_back(NpuUtils::format_contiguous_add_copy_optimize(input)); - return storage.back(); - } - at::Tensor CopyHostToDevice(const c10::Scalar &scalar, at::ScalarType type) - { - auto tensor = scalar_to_tensor(scalar).to(type); - return CopyHostToDevice(tensor); - } - at::Tensor CopyHostToDevice(const at::Tensor &cpuTensor) - { - at::Tensor cpuPinMemTensor = cpuTensor.pin_memory(); - int deviceIndex = 0; - AT_NPU_CHECK(aclrtGetDevice(&deviceIndex)); - auto tensor = cpuPinMemTensor.to( - c10::Device(c10::DeviceType::NPU, deviceIndex), - cpuPinMemTensor.scalar_type(), - true, - true); - storage.emplace_back(tensor); - return storage.back(); + void Run() { + IF_GRAPH_MODE_THEN_RUN(return;) + if (torch_npu::option::OptionsManager::CheckQueueEnable()) { + ExecuteParas params; + aclCmd->ExportParams(params); + c10::npu::enCurrentNPUStream(&params); + aclCmd->releaseSource(false); + } else { + aclCmd->Run(); + aclCmd->releaseSource(); + } + aclCmds->Pop(); + } + +protected: + Derived &AddTensorInput(at::Tensor &tensor, + at::ScalarType forceScaleType = at::ScalarType::Undefined, + const string &descName = "", const string &realData = "", + c10::optional cpu_tensor = c10::nullopt) { + std::tuple < aclTensorDesc * , aclDataBuffer *, int64_t, aclFormat > res; + if (commonType.has_value() && commonType.value() != tensor.scalar_type()) { + tensor = NPUNativeFunctions::npu_dtype_cast(tensor, commonType.value()); + } + // For the dim=0 case the input can never be uint16, because uint16 is introduced by TBE and TBE has no dim=0 case + if (tensor.dim() == 0) { + if (tensor.is_npu()) { + res = OpCmdHelper::CovertNPUTensorWithZeroDimToAclInput(tensor, descName); + } else { + res = OpCmdHelper::CovertTensorWithZeroDimToAclInput(tensor, forceScaleType); + } + } else { 
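+ // tensors with dim > 0 take the general path: build the ACL tensor desc and buffer from the tensor (plus the optional host copy), its desc name and real dtype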
res = OpCmdHelper::CovertTensorToAclInput(tensor, cpu_tensor, descName, realData); + } + aclCmd->AddInput( + std::get<0>(res), std::get<1>(res), std::get<2>(res), std::get<3>(res)); + return static_cast(*this); + } + Derived &AddHostTensorInput(const at::Tensor &tensor, + CompileType compileType = CompileType::MEMORY_HOST_COMPILE_DEPENDENT) { + std::tuple < aclTensorDesc * , aclDataBuffer *, int64_t, aclFormat > res; + res = OpCmdHelper::CovertHostTensorToAclInput(tensor, tensor.scalar_type(), compileType); + aclCmd->AddInput( + std::get<0>(res), std::get<1>(res), std::get<2>(res), std::get<3>(res), tensor); + return static_cast(*this); + } + Derived &AddNoneTensor() { + AclTensorDescMaker desc; + auto aclDesc = desc.Create(ACL_DT_UNDEFINED, ACL_FORMAT_UNDEFINED).Get(); + AclTensorBufferMaker buffer(nullptr, 0); + aclCmd->AddInput(aclDesc, buffer.Get(), 0, ACL_FORMAT_UNDEFINED); + return static_cast(*this); + } + Derived &AddScalarInput(const c10::Scalar &input, + at::ScalarType type) { + at::ScalarType type_bk = type; + if (commonType.has_value()) { + type_bk = commonType.value(); + } + at::Tensor aclInput = CopyHostToDevice(input, type_bk); + auto res = OpCmdHelper::CovertScalarToAclInput(aclInput, type_bk); + aclCmd->AddInput( + std::get<0>(res), std::get<1>(res), std::get<2>(res), std::get<3>(res)); + return static_cast(*this); + } + Derived &AddOutput(at::Tensor &output, const string &realType = "") { + if (resultTypeDefined == false && commonType.has_value() && commonType.value() != output.scalar_type()) { + output = NPUNativeFunctions::npu_dtype_cast(output, commonType.value()); + } + const at::Tensor *tensor = &output; + auto res = OpCmdHelper::CovertToAclOutput(tensor, realType); + aclCmd->AddOutput( + std::get<0>(res), std::get<1>(res), std::get<2>(res), std::get<3>(res)); + return static_cast(*this); + } -protected: + // format_contiguous may create a new Tensor; it is kept in this object so it stays alive for the whole life cycle + // Likewise below, CopyScalarToDevice has the same problem + at::Tensor &Contiguous(const at::Tensor &input) { + storage.emplace_back(NpuUtils::format_contiguous_add_copy_optimize(input)); + return storage.back(); + } + at::Tensor CopyHostToDevice(const c10::Scalar &scalar, at::ScalarType type) { + auto tensor = scalar_to_tensor(scalar).to(type); + return CopyHostToDevice(tensor); + } + at::Tensor CopyHostToDevice(const at::Tensor &cpuTensor) { + at::Tensor cpuPinMemTensor = cpuTensor.pin_memory(); + int deviceIndex = 0; + AT_NPU_CHECK(aclrtGetDevice(&deviceIndex)); + auto tensor = cpuPinMemTensor.to( + c10::Device(c10::DeviceType::NPU, deviceIndex), + cpuPinMemTensor.scalar_type(), + true, + true); + storage.emplace_back(tensor); + return storage.back(); + } - at::Tensor &CreateHostTensor(void *data, size_t size, - const c10::TensorOptions &options, at::ScalarType toType) - {
maintain when Run() is called + AT_ASSERT(options.dtype() == at::kLong); + auto cpuTensor = at::empty(size, options); + AT_ASSERT(cpuTensor.is_contiguous()); + std::memcpy(cpuTensor.data_ptr(), data, sizeof(int64_t) * cpuTensor.numel()); + if (toType != at::kLong) { + cpuTensor = cpuTensor.to(toType); + } - protected: - OpCommandImpls *aclCmds = nullptr; // owned - OpCommandImpl *aclCmd = nullptr; + storage.emplace_back(std::move(cpuTensor)); + return storage.back(); + } + at::Tensor CreateScalarTensor(const c10::Scalar &scalar, at::ScalarType type) { + if (commonType.has_value()) { + type = commonType.value(); + } + storage.emplace_back(scalar_to_tensor(scalar).to(type)); + return storage.back(); + } + c10::SmallVector storage; // tensor's life cycle should maintain when Run() is called - private: - c10::optional commonType = c10::nullopt; - c10::optional commonShape = c10::nullopt; - bool resultTypeDefined = false; +protected: + OpCommandImpls *aclCmds = nullptr; // owned + OpCommandImpl *aclCmd = nullptr; + GraphCommandImpl graphCmd; - }; // class OpCommandBase +private: + c10::optional commonType = c10::nullopt; + c10::optional commonShape = c10::nullopt; + bool resultTypeDefined = false; - } // namespace native +}; // class OpCommandBase +} // namespace native } // namespace at_npu #endif \ No newline at end of file diff --git a/torch_npu/csrc/framework/OpParamMaker.h b/torch_npu/csrc/framework/OpParamMaker.h index 92633ab259150c3264b0988b5819cf0d720de926..4711a9d5f6a1739ef44fb828430b19920e3fd386 100644 --- a/torch_npu/csrc/framework/OpParamMaker.h +++ b/torch_npu/csrc/framework/OpParamMaker.h @@ -216,7 +216,7 @@ namespace at_npu // queue-enable } - void SetName(string &name) + void SetName(const string &name) { opName = name; } diff --git a/torch_npu/csrc/framework/graph/cache/GraphCacher.cpp b/torch_npu/csrc/framework/graph/cache/GraphCacher.cpp new file mode 100644 index 0000000000000000000000000000000000000000..16cadd4b59817f10355a22ac503dcb2cc54b0f18 --- /dev/null +++ b/torch_npu/csrc/framework/graph/cache/GraphCacher.cpp @@ -0,0 +1,78 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "GraphCacher.h" + +namespace at_npu { +namespace native { +hash_t GraphCache::GetGraphTopoHash( + const std::vector& inputs_topo_hash, + const std::vector& outputs_topo_hash) { + hash_t graph_topo_hash = multi_hash(inputs_topo_hash); + graph_topo_hash = multi_hash(graph_topo_hash, outputs_topo_hash); + return graph_topo_hash; +} + +hash_t GraphCache::GetGraphShapeHash( + const std::vector& inputs_shape_hash, + const std::vector& outputs_shape_hash) { + hash_t graph_shape_hash = multi_hash(inputs_shape_hash); + graph_shape_hash = multi_hash(graph_shape_hash, outputs_shape_hash); + return graph_shape_hash; +} + +hash_t GraphCache::GetTensorShapeHash( + const hash_t& topo_hash, + const ge::TensorDesc& tensor_desc) { + return multi_hash( + topo_hash, + tensor_desc.GetOriginShape().GetDimNum(), + tensor_desc.GetOriginShape().GetDims()); +} + +hash_t GraphCache::GetTensorTopoHash( + const Value& graph_value, + const ge::TensorDesc& tensor_desc) { + return multi_hash( + graph_value.GetValueHash(), + tensor_desc.GetDataType(), + tensor_desc.GetOriginFormat(), + tensor_desc.GetFormat()); +} + +c10::optional GraphCache::GetCacheGraphId( + const std::vector& inputs_topo_hash, + const std::vector& inputs_shape_hash, + const std::vector& outputs_topo_hash, + const std::vector& outputs_shape_hash, + uint32_t cur_graph_id) { + hash_t topo_hash = GetGraphTopoHash(inputs_topo_hash, outputs_topo_hash); + hash_t shape_hash = GetGraphShapeHash(inputs_shape_hash, outputs_shape_hash); + auto iter = graph_cache_.find(topo_hash); + if (iter != graph_cache_.end()) { + auto& shape_map = iter->second; + auto shape_iter = shape_map.find(shape_hash); + if (shape_iter != shape_map.end()) { + return shape_iter->second; + } else { + shape_map[shape_hash] = cur_graph_id; + } + } else { + graph_cache_[topo_hash] = {{shape_hash, cur_graph_id}}; + } + return c10::nullopt; +} +} // namespace native +} // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/framework/graph/cache/GraphCacher.h b/torch_npu/csrc/framework/graph/cache/GraphCacher.h new file mode 100644 index 0000000000000000000000000000000000000000..2e030beac772699fd65d6f719d4b0621d5af4d15 --- /dev/null +++ b/torch_npu/csrc/framework/graph/cache/GraphCacher.h @@ -0,0 +1,61 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include + +#include + +namespace at_npu { +namespace native { + +using c10::npu::graph::Value; +using c10::npu::hash_utils::hash_t; +using c10::npu::hash_utils::multi_hash; + +class GraphCache { +public: + c10::optional GetCacheGraphId( + const std::vector& inputs_topo_hash, + const std::vector& inputs_shape_hash, + const std::vector& outputs_topo_hash, + const std::vector& outputs_shape_hash, + uint32_t cur_graph_id); + + static hash_t GetTensorTopoHash( + const Value& graph_value, + const ge::TensorDesc& tensor_desc); + + static hash_t GetTensorShapeHash( + const hash_t& topo_hash, + const ge::TensorDesc& tensor_desc); + +private: + static hash_t GetGraphTopoHash( + const std::vector& inputs_topo_hash, + const std::vector& outputs_topo_hash); + + static hash_t GetGraphShapeHash( + const std::vector& inputs_shape_hash, + const std::vector& outputs_shape_hash); + + std::unordered_map> graph_cache_; +}; +} // namespace native +} // namespace at_npu diff --git a/torch_npu/csrc/framework/graph/construct/GraphConstructor.cpp b/torch_npu/csrc/framework/graph/construct/GraphConstructor.cpp new file mode 100644 index 0000000000000000000000000000000000000000..24da178417b40ec9325eab59a271f07683d45860 --- /dev/null +++ b/torch_npu/csrc/framework/graph/construct/GraphConstructor.cpp @@ -0,0 +1,190 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "GraphConstructor.h" +#include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" +#include "torch_npu/csrc/framework/graph/util/GraphUtils.h" +#include "torch_npu/csrc/framework/graph/scalar/ScalarMemoryOps.h" + +namespace at_npu { +namespace native { +using c10::npu::graph::NodeExtInfoType; +void GraphCommandImpl::SetName(const std::string& name) { + ir_node_ = std::make_shared(name); +} + +void GraphCommandImpl::AddInput() { + ++input_index_; +} + +void GraphCommandImpl::AddInput( + const at::Tensor& input, + const string& desc_name, + const string& real_dtype, + const c10::optional& sensitive_format) { + if (input.dim() == 0 && !input.is_npu()) { + return AddZeroDimInput(input, desc_name); + } + if (GraphUtils::IsTensorWithoutNode(input)) { + if (!input.storage().data()) { + auto storage_impl = input.storage().unsafeGetStorageImpl(); + size_t n_bytes = storage_impl->nbytes(); + auto data_ptr = c10_npu::NPUCachingAllocator::get()->allocate(n_bytes); + storage_impl->set_data_ptr(std::move(data_ptr)); + } + GraphUtils::SetDataOp(input.storage().unsafeGetStorageImpl()); + } + if (GraphUtils::IsDataTensor(input)) { + GraphUtils::RetainGraphDataTensor(input); + } + if (sensitive_format.has_value()) { + ir_node_->AddExtInfo( + NodeExtInfoType::SENSITIVE_FORMAT_OF_INPUT, + std::make_pair(desc_name, sensitive_format.value())); + } + + auto& cur_ir_value = GraphUtils::GetTensorIrValue(input); + if (!real_dtype.empty()) { + cur_ir_value.SetRealType(real_dtype); + } + ir_node_->AddInput( + input_index_++, cur_ir_value.GetCurNode(), cur_ir_value.GetValueIndex()); + ir_node_->UpdateNodeHash(GraphUtils::GetTensorIrValueHash(input), real_dtype); +} + +void GraphCommandImpl::AddInput( + const at::Scalar& input, + const at::ScalarType type, + CompileType compile_type) { + if (compile_type == CompileType::MEMORY_HOST_COMPILE_INDEPENDENT) { + uint32_t offset; + ReduceScalarValue(input, type, offset); + int deviceIndex = 0; + AT_NPU_CHECK(aclrtGetDevice(&deviceIndex)); + auto npu_scalar_tensor = at::empty({}, at::TensorOptions(at::kNPU, deviceIndex).dtype(type)); + GraphUtils::SetDataOp(npu_scalar_tensor.storage().unsafeGetStorageImpl()); + GraphUtils::RetainGraphDataTensor(npu_scalar_tensor); + auto& cur_ir_value = GraphUtils::GetTensorIrValue(npu_scalar_tensor); + cur_ir_value.SetScalarMemOffset(offset); + ir_node_->AddInput( + input_index_++, cur_ir_value.GetCurNode(), cur_ir_value.GetValueIndex()); + ir_node_->UpdateNodeHash(GraphUtils::GetTensorIrValueHash(npu_scalar_tensor)); + } else { + ir_node_->AddExtInfo( + NodeExtInfoType::INPUT_TYPE_SCALAR, + std::make_tuple(input_index_++, input, type)); + ir_node_->UpdateNodeHash(CalcuOpUtil::get_scalar_float_value(input), type); + } +} + +void GraphCommandImpl::AddInput( + const c10::IntArrayRef& dim_list, + const at::ScalarType to_type) { + vector val(dim_list.begin(), dim_list.end()); + ir_node_->AddExtInfo( + NodeExtInfoType::INPUT_TYPE_LIST_LONG, + std::make_tuple(input_index_++, std::move(val), to_type)); + ir_node_->UpdateNodeHash(dim_list, to_type); +} + +void GraphCommandImpl::AddOutput( + const at::Tensor& output, + const string& desc_name, + const string& real_type, + const c10::optional& sensitive_format) { + if (sensitive_format.has_value()) { + ir_node_->AddExtInfo( + NodeExtInfoType::SENSITIVE_FORMAT_OF_OUTPUT, + std::make_pair(desc_name, sensitive_format.value())); + } + if (!ir_node_->GetInputs().empty() || output_index_ != 0) { + Value value{ir_node_, output_index_++}; + if (!real_type.empty()) { + value.SetRealType(real_type); + } 
+ GraphUtils::SetTensorIrValue(output, value); + } else { + // op without input and has outputs should be treated as graph input + GraphUtils::SetTensorIrValue( + output, Value(ir_node_, ir_node_, output_index_++)); + GraphUtils::RetainGraphDataTensor(output); + } +} + +void GraphCommandImpl::AddDynamicInputRegFunc( + DynamicInputRegFunc func, + DyNumAndIndex num_and_index) { + ir_node_->AddExtInfo( + NodeExtInfoType::DYNAMIC_INPUT_FUNC, std::make_pair(func, num_and_index)); +} + +void GraphCommandImpl::ReduceScalarValue( + const at::Scalar& input, + const at::ScalarType type, + uint32_t& host_ptr_offset) { + if (at::ScalarType::Float == type) { + float value = input.toFloat(); + ScalarMemContext::GetContext().AppendToHostMem( + reinterpret_cast(&value), + sizeof(float), + host_ptr_offset); + } else if (at::ScalarType::Int == type) { + int value = input.toInt(); + ScalarMemContext::GetContext().AppendToHostMem( + reinterpret_cast(&value), + sizeof(int), + host_ptr_offset); + } else if (at::ScalarType::Long == type) { + int64_t value = input.toLong(); + ScalarMemContext::GetContext().AppendToHostMem( + reinterpret_cast(&value), + sizeof(int64_t), + host_ptr_offset); + } else if (at::ScalarType::Double == type) { + double value = input.toDouble(); + ScalarMemContext::GetContext().AppendToHostMem( + reinterpret_cast(&value), + sizeof(double), + host_ptr_offset); + } else if (at::ScalarType::Half == type) { + auto value = input.toHalf(); + ScalarMemContext::GetContext().AppendToHostMem( + reinterpret_cast(&value), + sizeof(at::ScalarType::Half), + host_ptr_offset); + } else { + AT_ERROR("scalar not support '", at::toString(type), "' type currently."); + } +} + +void GraphCommandImpl::AddZeroDimInput( + const at::Tensor& input, + const string& desc_name) { + at::ScalarType dtype = at::ScalarType::Undefined; + if (!input.unsafeGetTensorImpl()->is_wrapped_number()) { + dtype = input.scalar_type(); + } + TORCH_CHECK( + dtype != at::ScalarType::Undefined, "Cpu tensor scalar type is undefined"); + at::Scalar expect_scalar = CalcuOpUtil::ConvertTensorToScalar(input); + ir_node_->AddExtInfo( + NodeExtInfoType::INPUT_TYPE_SCALAR, + std::make_tuple(input_index_++, expect_scalar, dtype)); + ir_node_->UpdateNodeHash( + CalcuOpUtil::get_scalar_float_value(expect_scalar), dtype); +} + +} // namespace native +} // namespace at_npu diff --git a/torch_npu/csrc/framework/graph/construct/GraphConstructor.h b/torch_npu/csrc/framework/graph/construct/GraphConstructor.h new file mode 100644 index 0000000000000000000000000000000000000000..369aebe045e68215b366d73538f21f9cc632991c --- /dev/null +++ b/torch_npu/csrc/framework/graph/construct/GraphConstructor.h @@ -0,0 +1,142 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" +#include "torch_npu/csrc/framework/utils/NpuUtils.h" + +#include +#include +namespace at_npu { +namespace native { +using c10::npu::graph::DynamicInputRegFunc; +using c10::npu::graph::DyNumAndIndex; +using c10::npu::graph::NodeExtInfoType; +using c10::npu::graph::NodePtr; + +class OperatorAttrMaker { +public: + static void SetAttr(const string& attr_name, bool value, NodePtr node) { + node->AddExtInfo( + NodeExtInfoType::ATTR_TYPE_BOOL, std::make_pair(attr_name, value)); + node->UpdateNodeHash(value); + } + + static void SetAttr(const string& attr_name, float value, NodePtr node) { + node->AddExtInfo( + NodeExtInfoType::ATTR_TYPE_FLOAT, std::make_pair(attr_name, value)); + node->UpdateNodeHash(value); + } + + static void SetAttr(const string& attr_name, int64_t value, NodePtr node) { + node->AddExtInfo( + NodeExtInfoType::ATTR_TYPE_LONG, std::make_pair(attr_name, value)); + node->UpdateNodeHash(value); + } + + static void SetAttr( + const string& attr_name, + const string& value, + NodePtr node) { + node->AddExtInfo( + NodeExtInfoType::ATTR_TYPE_STRING, std::make_pair(attr_name, value)); + node->UpdateNodeHash(value); + } + + static void SetAttr( + const string& attr_name, + const c10::ArrayRef& value, + NodePtr node) { + vector val(value.begin(), value.end()); + node->AddExtInfo( + NodeExtInfoType::ATTR_TYPE_LIST_LONG, + std::make_pair(attr_name, std::move(val))); + node->UpdateNodeHash(val); + } + + static void SetAttr( + const string& attr_name, + const c10::ArrayRef& value, + NodePtr node) { + vector val(value.begin(), value.end()); + node->AddExtInfo( + NodeExtInfoType::ATTR_TYPE_LIST_FLOAT, + std::make_pair(attr_name, std::move(val))); + node->UpdateNodeHash(val); + } + + static void SetAttr( + const string& attr_name, + const c10::Scalar& value, + NodePtr node) { + float val = CalcuOpUtil::get_scalar_float_value(value); + node->AddExtInfo( + NodeExtInfoType::ATTR_TYPE_FLOAT, std::make_pair(attr_name, val)); + node->UpdateNodeHash(val); + } +}; + +class GraphCommandImpl { +public: + GraphCommandImpl() = default; + ~GraphCommandImpl() = default; + + void SetName(const std::string& name); + + void AddInput(); + + void AddInput( + const at::Tensor& input, + const string& desc_name, + const string& real_dtype, + const c10::optional& sensitive_format = c10::nullopt); + + void AddInput( + const c10::Scalar& input, + const at::ScalarType type, + CompileType compile_type); + + void AddInput(const c10::IntArrayRef& dim_list, const at::ScalarType to_type); + + void AddOutput( + const at::Tensor& output, + const string& desc_name = "", + const string& real_type = "", + const c10::optional& sensitive_format = c10::nullopt); + + void AddDynamicInputRegFunc( + DynamicInputRegFunc func, + DyNumAndIndex num_and_index); + + void ReduceScalarValue( + const at::Scalar& input, + const at::ScalarType type, + uint32_t& host_ptr_offset); + + template + void AddAttr(const string& attr_name, T value) { + OperatorAttrMaker::SetAttr(attr_name, value, ir_node_); + } + +private: + void AddZeroDimInput(const at::Tensor& input, const string& desc_name); + + uint32_t output_index_ = 0; + uint32_t input_index_ = 0; + NodePtr ir_node_ = nullptr; +}; +} // namespace native +} // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/framework/graph/execute/GraphExecutor.cpp b/torch_npu/csrc/framework/graph/execute/GraphExecutor.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..496457527ae7f43e56174505a0f965b2a5944cbc --- /dev/null +++ b/torch_npu/csrc/framework/graph/execute/GraphExecutor.cpp @@ -0,0 +1,411 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "GraphExecutor.h" + +#include +#include +#include "torch_npu/csrc/framework/graph/util/ATenGeBridge.h" +#include "torch_npu/csrc/framework/graph/util/GraphUtils.h" +#include "torch_npu/csrc/framework/interface/AclInterface.h" +#include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" +#include +#include +#include "torch_npu/csrc/core/npu/register/OptionRegister.h" +#include "torch_npu/csrc/framework/graph/scalar/ScalarMemoryOps.h" +#include + +#include + +// wait RECORD_HOST_FUNCTION to be added into plugin +#define RECORD_HOST_FUNCTION(a, b) ; +namespace at_npu { +namespace native { +namespace { +const char* kPytorchGraphName = "PytorchGraph"; +const std::string kDataNodeType = "Data"; +const char* kDataAttrIndex = "index"; + +static ge::Tensor MakeGeTensor( + const ge::TensorDesc& tensor_desc, + void* device_ptr, + const size_t nbytes) { + ge::Tensor ge_tensor{tensor_desc}; + ge_tensor.SetData( + reinterpret_cast(device_ptr), nbytes, [](uint8_t* device_ptr) { + return; + }); + return ge_tensor; +} +} // namespace + +uint32_t GraphExecutor::graph_id = 0; + +void GraphExecutor::RunGraph( + uint32_t graph_id, + CombinedInfo& inputs, + CombinedInfo& outputs) { + RECORD_HOST_FUNCTION("RunGraph", std::vector({})); + aclrtStream cal_stream = + const_cast(c10::npu::getCurrentNPUStream().stream()); + + auto ret = session_->RunGraphWithStreamAsync(graph_id, + cal_stream, + inputs.tensors, + outputs.tensors); + TORCH_CHECK(ret == 0, "Run Graph Failed!"); +} + +void GraphExecutor::ConstructAndExecuteGraph() { + RECORD_HOST_FUNCTION("ConstructAndExecuteGraph", std::vector({})); + auto ret = CheckDeviceIdAndInit(); + if (!ret) { + return; + } + TORCH_CHECK(session_ != nullptr, "Undefined session before run graph."); + // before construct graph and tensor, do H2D copy for scalar. 
+ ScalarMemContext::GetContext().ExecuteH2D(c10::npu::getCurrentNPUStream()); + CombinedInfo inputs = GetInputCombinedInfo(); + CombinedInfo outputs = GetOutputCombinedInfo(); + if (outputs.nodes.empty()) { + return; + } + + uint32_t cur_graph_id = graph_id + 1; + auto cached_graph_id = cacher_.GetCacheGraphId( + inputs.hash_of_topo_and_attr, + inputs.hash_of_shape, + outputs.hash_of_topo_and_attr, + outputs.hash_of_shape, + cur_graph_id); + + if (!cached_graph_id.has_value()) { + RECORD_HOST_FUNCTION("ConstructGraph", std::vector({})); + ConstructOps(outputs); + ge::Graph graph(kPytorchGraphName); + graph.SetInputs(GetInputOps()).SetOutputs(GetOutputOps()); + + TORCH_CHECK( + session_->AddGraph(cur_graph_id, graph) == 0, "AddGraph failed!"); + graph_id = cur_graph_id; + } else { + cur_graph_id = cached_graph_id.value(); + } + + RunGraph(cur_graph_id, inputs, outputs); + ScalarMemContext::GetContext().Reset(); + ResetGraphOutputs(); + if (!cached_graph_id.has_value()) { + // Data of new graph maybe inputs of old graphs, + // GE will change its attr + // so we need to refresh it + RefreshGraphInputs(); + } + ClearDataStore(); + return; +} + +void GraphExecutor::Init() { + auto device_id = std::to_string(init_device_id_); + std::map config = { + {ge::AscendString(ge::OPTION_EXEC_DEVICE_ID), + ge::AscendString(device_id.data())}, + {ge::AscendString(ge::OPTION_GRAPH_RUN_MODE), "0"}, + {ge::AscendString(ge::PRECISION_MODE.data()), "allow_fp32_to_fp16"}, + {ge::AscendString(ge::VARIABLE_MEMORY_MAX_SIZE), "1048576"} + }; + + static std::map + STRING_TO_COMPILE_OPT_MAP = { + {"ACL_OP_DEBUG_LEVEL", ge::OP_DEBUG_LEVEL}, + {"ACL_DEBUG_DIR", ge::DEBUG_DIR}, + {"ACL_OP_COMPILER_CACHE_MODE", ge::OP_COMPILER_CACHE_MODE}, + {"ACL_OP_COMPILER_CACHE_DIR", ge::OP_COMPILER_CACHE_DIR}, + {"ACL_OP_SELECT_IMPL_MODE", ge::OP_SELECT_IMPL_MODE}, + {"ACL_OPTYPELIST_FOR_IMPLMODE", ge::OPTYPELIST_FOR_IMPLMODE} + }; + + for (const auto& iter : STRING_TO_COMPILE_OPT_MAP) { + auto val = torch_npu::option::GetOption(iter.first); + if (val.has_value() && (!val.value().empty())) { + config.emplace(iter.second.data(), val.value().data()); + } + } + + // to be uncommented +// auto soc_name = c10::npu::acl::AclGetSocName(); +// if (soc_name != nullptr) { +// config.emplace(ge::AscendString(ge::SOC_VERSION.data()), soc_name); +// } + +// if (c10::npu::acl::IsExistQueryEventRecordedStatus()) { +// static const std::string HCOM_OPTIONS = "ge.exec.isUseHcom"; +// config.emplace(HCOM_OPTIONS.data(), "1"); +// } + + config["ge.session_device_id"] = ge::AscendString(device_id.data()); + config["ge.exec.reuseZeroCopyMemory"] = ge::AscendString("1"); + session_ = std::make_unique(config); + C10_NPU_CHECK(aclrtSetDevice(init_device_id_)); + if (session_ == nullptr) { + AT_ERROR("Create session failed!"); + } +} + +void GraphExecutor::Finalize() { + if (GraphExecutor::GetInstance().session_ != nullptr) { + session_.reset(); + session_ = nullptr; + } +} + +void GraphExecutor::ConstructOps(CombinedInfo& output) { + RECORD_HOST_FUNCTION("ConstructOps", std::vector({})); + std::set searched_nodes; + for (const auto& output_node : output.nodes) { + if (searched_nodes.find(output_node) != searched_nodes.end()) { + continue; + } + searched_nodes.insert(output_node); + std::stack stack_node; + stack_node.push(output_node); + while (!stack_node.empty()) { + auto top_node = stack_node.top(); + ATenGeBridge::CheckAndBuildGeOpForNode(top_node); + stack_node.pop(); + const auto& inputs = top_node->GetInputs(); + for (const auto& input : inputs) { + 
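+ // depth-first over producers: build the producer's GE op if it does not exist yet, wire it into this node's input slot, and push producers that have not been visited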
ATenGeBridge::CheckAndBuildGeOpForNode(input.peer_output_node); + top_node->GetGeOp()->SetInput( + input.input_index, + *(input.peer_output_node->GetGeOp()), + input.peer_output_index); + if (searched_nodes.find(input.peer_output_node) != + searched_nodes.end()) { + continue; + } + stack_node.push(input.peer_output_node); + searched_nodes.insert(input.peer_output_node); + } + } + } +} + +std::vector GraphExecutor::GetInputOps() { + std::vector ops; + auto input_storages = c10::npu::graph::NpuGraphContextManager::GetInstance() + .GetAllInputStorages(init_device_id_); + for (size_t index = 0; index < input_storages.size(); ++index) { + auto &graph_desc = input_storages[index]->get_mutable_npu_graph_desc(); + auto data_node = graph_desc.graph_value.GetDataNode(); + auto op_ptr = data_node.value()->GetGeOp(); + if (data_node.value()->GetOpType() == kDataNodeType) { + if (op_ptr == nullptr) { + data_node.value()->SetGeOp(std::make_shared()); + op_ptr = data_node.value()->GetGeOp(); + } + // storageImpl has no dtype since 1.8, need a solution +// auto op_desc = ATenGeBridge::InferGeTenosrDesc( +// input_storages[index]->get_npu_desc(), +// input_storages[index]->dtype(), +// graph_desc.graph_value.GetRealDtype(), +// true); +// // x and y are the input and output names of Data IR +// op_ptr->UpdateInputDesc("x", op_desc); +// op_ptr->UpdateOutputDesc("y", op_desc); +// op_ptr->SetAttr(kDataAttrIndex, static_cast(index)); + } + ops.push_back(*op_ptr); + } + return ops; +} + +GeOutPutOpType GraphExecutor::GetOutputOps() { + GeOutPutOpType ops_and_idx; + auto output_storages = c10::npu::graph::NpuGraphContextManager::GetInstance() + .GetAllStorageOfLiveTensors(init_device_id_); + for (auto& output_storage : output_storages) { + if (GraphUtils::IsTensorWithoutNode(output_storage) || + GraphUtils::IsDataTensor(output_storage)) { + continue; + } + const auto& graph_value = + output_storage->get_mutable_npu_graph_desc().graph_value; + auto op_ptr = graph_value.GetCurNode()->GetGeOp(); + ops_and_idx.emplace_back( + *op_ptr, std::vector{graph_value.GetValueIndex()}); + } + return ops_and_idx; +} + +CombinedInfo GraphExecutor::GetInputCombinedInfo() { + RECORD_HOST_FUNCTION("GetInputCombinedInfo", std::vector({})); + CombinedInfo input_infos; + auto input_storages = c10::npu::graph::NpuGraphContextManager::GetInstance() + .GetAllInputStorages(init_device_id_); + for (size_t index = 0; index < input_storages.size(); ++index) { + c10::NpuGraphDesc& graph_desc = + input_storages[index]->get_mutable_npu_graph_desc(); + auto data_node = graph_desc.graph_value.GetDataNode(); + TORCH_CHECK(data_node.has_value(), "Inputs Tensor must have data node"); + // storageImpl has no dtype since 1.8, need a solution +// ge::TensorDesc tensor_desc = ATenGeBridge::InferGeTenosrDesc( +// input_storages[index]->get_npu_desc(), +// input_storages[index]->dtype(), +// graph_desc.graph_value.GetRealDtype()); +// +// if (data_node.value()->GetOpType() == kDataNodeType) { +// ge::Tensor ge_tensor = +// PrepareInputTensor(input_storages[index], tensor_desc); +// input_infos.tensors.push_back(std::move(ge_tensor)); +// } +// hash_t topo_hash = +// GraphCache::GetTensorTopoHash(graph_desc.graph_value, tensor_desc); +// input_infos.hash_of_topo_and_attr.push_back(topo_hash); +// hash_t shape_hash = GraphCache::GetTensorShapeHash(topo_hash, tensor_desc); +// input_infos.hash_of_shape.push_back(shape_hash); + } + return input_infos; +} + +CombinedInfo GraphExecutor::GetOutputCombinedInfo() { + 
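ConstructOps walks the recorded IR from every graph output with an explicit stack: build the GE op for the node on top, then for each incoming edge build the producer's op, wire the consumer input to the producer output, and push producers that have not been visited yet. The same traversal reduced to a standalone sketch with a toy node type (ToyNode, BuildIfNeeded and the wired_from vector stand in for Node, CheckAndBuildGeOpForNode and SetInput):

#include <memory>
#include <set>
#include <stack>
#include <vector>

struct ToyNode {
  struct Edge { ToyNode* peer; int input_index; int peer_output_index; };
  std::vector<Edge> inputs;            // edges to producer nodes
  bool op_built = false;
  std::vector<const ToyNode*> wired_from;
};

void BuildIfNeeded(ToyNode* n) { n->op_built = true; }   // idempotent, like the real builder

// Same shape as GraphExecutor::ConstructOps: start from every graph output,
// walk producers depth-first, build each op, and wire consumer inputs to
// producer outputs as edges are visited.
void ConstructOps(const std::vector<ToyNode*>& outputs) {
  std::set<ToyNode*> searched;
  for (ToyNode* out : outputs) {
    if (!searched.insert(out).second) continue;
    std::stack<ToyNode*> stk;
    stk.push(out);
    while (!stk.empty()) {
      ToyNode* top = stk.top();
      BuildIfNeeded(top);
      stk.pop();
      for (auto& e : top->inputs) {
        BuildIfNeeded(e.peer);                 // producer op must exist before wiring
        top->wired_from.push_back(e.peer);     // stands in for SetInput(index, peer, idx)
        if (searched.insert(e.peer).second) {
          stk.push(e.peer);
        }
      }
    }
  }
}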
RECORD_HOST_FUNCTION("GetOutputCombinedInfo", std::vector({})); + CombinedInfo output_infos; + auto output_storages = c10::npu::graph::NpuGraphContextManager::GetInstance() + .GetAllStorageOfLiveTensors(init_device_id_); + for (auto& output_storage : output_storages) { + if (GraphUtils::IsTensorWithoutNode(output_storage) || + GraphUtils::IsDataTensor(output_storage)) { + c10::NpuGraphDesc graph_desc = output_storage->get_npu_graph_desc(); + // the tensor of scalar_merge_copy will enter here because is has't node, + // only the length of the out queue is increased, nothing else. + if ((output_storage->data() == nullptr) && + (!graph_desc.graph_value.GetScalarMemOffset().has_value())) { + size_t nbytes = output_storage->nbytes(); + auto data_ptr = c10_npu::NPUCachingAllocator::get()->allocate(nbytes); + output_storage->set_data_ptr(std::move(data_ptr)); + } + continue; + } + auto& graph_value = + output_storage->get_mutable_npu_graph_desc().graph_value; + TORCH_CHECK(graph_value.HashNode(), "output must have node!"); + output_infos.nodes.push_back(graph_value.GetCurNode()); + // storageImpl has no dtype since 1.8, need a solution +// ge::TensorDesc tensor_desc = ATenGeBridge::InferGeTenosrDesc( +// output_storage->get_npu_desc(), +// output_storage->dtype(), +// graph_value.GetRealDtype()); +// auto ge_tensor = PrepareOutputTenosr(output_storage, tensor_desc); +// output_infos.tensors.push_back(std::move(ge_tensor)); +// hash_t topo_hash = GraphCache::GetTensorTopoHash(graph_value, tensor_desc); +// output_infos.hash_of_topo_and_attr.emplace_back(topo_hash); +// +// hash_t shape_hash = GraphCache::GetTensorShapeHash(topo_hash, tensor_desc); +// output_infos.hash_of_shape.push_back(shape_hash); + } + return output_infos; +} + +ge::Tensor GraphExecutor::PrepareInputTensor( + const c10::StorageImpl* const storage, + const ge::TensorDesc& desc) { + c10::NpuGraphDesc& graph_desc = storage->get_mutable_npu_graph_desc(); + auto device_ptr = storage->data(); + size_t nbytes = storage->nbytes(); + auto addr_offset = graph_desc.graph_value.GetScalarMemOffset(); + if (addr_offset.has_value()) { + device_ptr = ScalarMemContext::GetContext().GetDeviceMemBuffer() + addr_offset.value(); + } + return MakeGeTensor(desc, device_ptr, nbytes); +} + +ge::Tensor GraphExecutor::PrepareOutputTenosr( + c10::StorageImpl* storage, + const ge::TensorDesc& desc) { + c10::NpuGraphDesc& graph_desc = storage->get_mutable_npu_graph_desc(); + TORCH_CHECK( + graph_desc.graph_value.HashNode(), + "graph desc in storage must have node"); + size_t nbytes = storage->nbytes(); + c10::DataPtr data_ptr; + + // In the case of in-place operator + // we can not call set_data_ptr + // for this will cause the old data ptr to be released + // and if one value have data node which has no device memory + // we should malloc for it + if (!(graph_desc.graph_value.GetDataNode().has_value() && + storage->data() != nullptr)) { + data_ptr = c10_npu::NPUCachingAllocator::get()->allocate(nbytes); + storage->set_data_ptr(std::move(data_ptr)); + } + return MakeGeTensor(desc, storage->data(), nbytes); +} + +void GraphExecutor::ResetGraphOutputs() { + RECORD_HOST_FUNCTION("ResetGraphOutputs", std::vector({})); + auto output_storages = c10::npu::graph::NpuGraphContextManager::GetInstance() + .GetAllStorageOfLiveTensors(init_device_id_); + std::for_each( + output_storages.begin(), output_storages.end(), [](c10::StorageImpl* x) { + if (!GraphUtils::IsTensorWithoutNode(x) && + !GraphUtils::IsDataTensor(x)) { + GraphUtils::ResetOp(x); + } + }); +} + +void 
GraphExecutor::RefreshGraphInputs() { + RECORD_HOST_FUNCTION("RefreshGraphInputs", std::vector({})); + auto input_storages = c10::npu::graph::NpuGraphContextManager::GetInstance() + .GetAllInputStorages(init_device_id_); + std::for_each( + input_storages.begin(), input_storages.end(), [&](c10::StorageImpl* x) { + GraphUtils::SetDataOp(x); + }); +} + +void GraphExecutor::ClearDataStore() { + RECORD_HOST_FUNCTION("ClearDataStore", std::vector({})); + c10::npu::graph::NpuGraphContextManager::GetInstance().EraseInputStorage( + init_device_id_); +} + +bool GraphExecutor::CheckDeviceIdAndInit() { + RECORD_HOST_FUNCTION("CheckDeviceIdAndInit", std::vector({})); + auto devices_has_input = + c10::npu::graph::NpuGraphContextManager::GetInstance() + .GetDevicesHasLiveTensor(); + if (devices_has_input.empty()) { + return false; + } else if (devices_has_input.size() > 1) { + AT_ERROR("In graph mode, you can not construct graph in different device"); + } + + init_device_id_ = devices_has_input.front(); + if (session_ == nullptr) { + Init(); + } + + if (init_device_id_ != devices_has_input.front()) { + AT_ERROR( + "In graph mode, you can not change " + "device id after first graph launch"); + } + return true; +} +} // namespace native +} // namespace at_npu diff --git a/torch_npu/csrc/framework/graph/execute/GraphExecutor.h b/torch_npu/csrc/framework/graph/execute/GraphExecutor.h new file mode 100644 index 0000000000000000000000000000000000000000..41e01e5e5be6125c92b5d0b274d68fe8ea6a336a --- /dev/null +++ b/torch_npu/csrc/framework/graph/execute/GraphExecutor.h @@ -0,0 +1,119 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "torch_npu/csrc/framework/graph/cache/GraphCacher.h" +#include +#include +#include + +#ifdef SUCCESS +#undef SUCCESS +#endif +#ifdef FAILED +#undef FAILED +#endif +#include + +#include + +namespace at_npu { +namespace native { +using c10::npu::graph::NodePtr; +using c10::npu::hash_utils::hash_t; + +using GeOutPutOpType = + std::vector>>; + +struct CombinedInfo { + std::vector nodes; + std::vector tensors; + std::vector hash_of_topo_and_attr; + std::vector hash_of_shape; +}; + +class TORCH_API GraphExecutor { +public: + GraphExecutor(const GraphExecutor&) = delete; + GraphExecutor(GraphExecutor&&) = delete; + GraphExecutor& operator=(const GraphExecutor&) = delete; + GraphExecutor& operator=(GraphExecutor&&) = delete; + ~GraphExecutor() = default; + + void ConstructAndExecuteGraph(); + + static GraphExecutor& GetInstance() { + static GraphExecutor instance; + return instance; + } + + void Finalize(); + + private: + GraphExecutor() = default; + + void Init(); + + /** + * NB + * Currently, in graph mode, there are two limitations + * 1, after your first graph launching, you can not change device, + * the init_device_id_ will be the id + * of first device which has input tensor. 
+ * + * 2, you can not construct graph in two different device. + */ + bool CheckDeviceIdAndInit(); + + void RunGraph( + uint32_t graph_id, + CombinedInfo& inputs, + CombinedInfo& outputs); + + static void ConstructOps(CombinedInfo& output); + + std::vector GetInputOps(); + + GeOutPutOpType GetOutputOps(); + + CombinedInfo GetInputCombinedInfo(); + + CombinedInfo GetOutputCombinedInfo(); + + static ge::Tensor PrepareInputTensor( + const c10::StorageImpl* const storage, + const ge::TensorDesc& desc); + + static ge::Tensor PrepareOutputTenosr( + c10::StorageImpl* storage, + const ge::TensorDesc& desc); + + void ResetGraphOutputs(); + + void RefreshGraphInputs(); + + void ClearDataStore(); + + static uint32_t graph_id; + + c10::DeviceIndex init_device_id_ = -1; + + std::unique_ptr session_ = nullptr; + + GraphCache cacher_; +}; +} // namespace native +} // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/framework/graph/scalar/ScalarMemoryOps.cpp b/torch_npu/csrc/framework/graph/scalar/ScalarMemoryOps.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d32e54954c51ed605cb5df06bffe55ef12a87af8 --- /dev/null +++ b/torch_npu/csrc/framework/graph/scalar/ScalarMemoryOps.cpp @@ -0,0 +1,86 @@ +#include "ScalarMemoryOps.h" + +namespace at_npu { +namespace native { + +void ScalarMemContext::Init() { + cpu_tensor_ = at::empty( + {HOST_MEM_INIT_SIZE}, + at::TensorOptions().pinned_memory(true).device(at::kCPU).dtype(at::kByte)); + host_mem_valid_len_ = 0; + inited_ = true; +} + +void ScalarMemContext::ExecuteH2D(c10::npu::NPUStream stream) { + if (!inited_) { + return; + } + + if (CHECK_MEM_MAX_SIZE <= host_mem_valid_len_) { + AT_ERROR("Checked the device memory size >= 64M."); + return; + } + int deviceIndex = 0; + AT_NPU_CHECK(aclrtGetDevice(&deviceIndex)); + npu_tensor_ = at::empty( + {host_mem_valid_len_}, + at::TensorOptions().device(at::kNPU, deviceIndex).dtype(at::kByte)); + + AT_NPU_CHECK( + aclrtMemcpyAsync( + npu_tensor_.data_ptr(), + host_mem_valid_len_, + cpu_tensor_.data_ptr(), + host_mem_valid_len_, + ACL_MEMCPY_HOST_TO_DEVICE, + stream)); + AT_NPU_CHECK(THNPUCachingHostAllocator_recordEvent(cpu_tensor_.data_ptr(), stream)); + + // reset pin memory + cpu_tensor_.reset(); +} + +void ScalarMemContext::CheckForExpand(uint32_t input_valid_len) { + if (input_valid_len <= (cpu_tensor_.nbytes() - host_mem_valid_len_)) { + return; + } + + auto tmp_tensor = cpu_tensor_; + uint32_t expand_tensor_size = tmp_tensor.nbytes() + HOST_MEM_INIT_SIZE; + cpu_tensor_ = at::empty( + {expand_tensor_size}, + at::TensorOptions().pinned_memory(true).device(at::kCPU).dtype(at::kByte)); + + AT_NPU_CHECK( + aclrtMemcpy( + cpu_tensor_.data_ptr(), + host_mem_valid_len_, + tmp_tensor.data_ptr(), + host_mem_valid_len_, + ACL_MEMCPY_HOST_TO_HOST)); +} + +void ScalarMemContext::AppendToHostMem( + uint8_t* host_ptr, + uint32_t data_len, + uint32_t& data_offset) { + if (!inited_) { + Init(); + } + + uint32_t valid_len = DEVICE_VALID_LEN(data_len); + CheckForExpand(valid_len); + data_offset = host_mem_valid_len_; + std::memcpy( + reinterpret_cast(cpu_tensor_.data_ptr()) + data_offset, + host_ptr, data_len); + host_mem_valid_len_ += valid_len; +} + +void ScalarMemContext::Reset() { + npu_tensor_.reset(); + inited_ = false; +} + +} // namespace native +} // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/framework/graph/scalar/ScalarMemoryOps.h b/torch_npu/csrc/framework/graph/scalar/ScalarMemoryOps.h new file mode 100644 index 
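ScalarMemContext above batches host scalars: AppendToHostMem copies each value into a growable pinned host tensor at an aligned offset and hands that offset back (PrepareInputTensor later turns the offset into a device address), and ExecuteH2D ships the whole buffer with a single aclrtMemcpyAsync per graph launch instead of one copy per scalar. A rough standalone sketch of the staging pattern (ToyScalarStager and RoundUpToSlot are illustrative; the real code uses a pinned CPU tensor, ACL copies and a recorded host-allocator event to keep the pinned buffer alive until the async copy finishes):

#include <cstdint>
#include <cstring>
#include <vector>

class ToyScalarStager {
 public:
  // Copy `len` bytes in and return the slot offset the caller should remember;
  // graph nodes store this offset rather than a raw pointer.
  uint32_t Append(const void* src, uint32_t len) {
    const uint32_t slot = RoundUpToSlot(len);     // DEVICE_VALID_LEN, see the header below
    const uint32_t offset = used_;
    while (used_ + slot > buf_.size()) {
      buf_.resize(buf_.size() + kChunk);          // grow, preserving already-staged data
    }
    std::memcpy(buf_.data() + offset, src, len);
    used_ += slot;
    return offset;
  }

  // Stand-in for ExecuteH2D: one bulk device copy for the whole batch.
  const std::vector<uint8_t>& Flush() const { return buf_; }

 private:
  static uint32_t RoundUpToSlot(uint32_t len) { return ((len + 32 + 511) / 512) * 512; }
  static constexpr uint32_t kChunk = 512 * 10240; // HOST_MEM_INIT_SIZE (5 MB)
  std::vector<uint8_t> buf_ = std::vector<uint8_t>(kChunk);
  uint32_t used_ = 0;
};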
0000000000000000000000000000000000000000..700d74e37741c93efdb44ca74d7bd111c666b377 --- /dev/null +++ b/torch_npu/csrc/framework/graph/scalar/ScalarMemoryOps.h @@ -0,0 +1,68 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "torch_npu/csrc/framework/allocator/THNPUCachingHostAllocator.h" +#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" + +namespace at_npu { +namespace native { + +#define HOST_MEM_INIT_SIZE (512 * 10240) // 5M +#define CHECK_MEM_MAX_SIZE (65536 * 1024) // 64M +#define DEVICE_VALID_LEN(a) ((((a) + 32 + 511) / 512) * 512) + +class C10_API ScalarMemContext { +public: + static ScalarMemContext &GetContext() { + static ScalarMemContext ctx; + return ctx; + } + + ScalarMemContext(const ScalarMemContext&) = delete; + ScalarMemContext(ScalarMemContext&&) = delete; + ScalarMemContext& operator=(const ScalarMemContext&) = delete; + ScalarMemContext& operator=(ScalarMemContext&&) = delete; + + uint8_t* GetDeviceMemBuffer() { + return reinterpret_cast(npu_tensor_.data_ptr()); + } + + void AppendToHostMem( + uint8_t* host_ptr, + uint32_t data_len, + uint32_t& data_offset); + + void ExecuteH2D(c10::npu::NPUStream stream); + + void Reset(); + +private: + void Init(); + + void CheckForExpand(uint32_t input_valid_len); + + ScalarMemContext() = default; + + bool inited_ = false; + at::Tensor cpu_tensor_; + at::Tensor npu_tensor_; + uint32_t host_mem_valid_len_ = 0; +}; + +} // namespace native +} // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/framework/graph/util/ATenGeBridge.cpp b/torch_npu/csrc/framework/graph/util/ATenGeBridge.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c4dc1e20bdc76daa0e1d96b49334941832165a93 --- /dev/null +++ b/torch_npu/csrc/framework/graph/util/ATenGeBridge.cpp @@ -0,0 +1,280 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
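The DEVICE_VALID_LEN macro in the header above reserves 32 bytes of headroom per entry and then rounds up to the next multiple of 512, so every staged scalar occupies at least one 512-byte slot (the patch does not say what the 32-byte headroom is for, so that part is left as-is here). The arithmetic restated as a constexpr with a few sanity checks:

#include <cstdint>

// Same arithmetic as DEVICE_VALID_LEN(a): ((a + 32 + 511) / 512) * 512.
constexpr uint32_t DeviceValidLen(uint32_t a) {
  return ((a + 32 + 511) / 512) * 512;
}

static_assert(DeviceValidLen(1)   == 512,  "a 1-byte scalar still takes one 512B slot");
static_assert(DeviceValidLen(480) == 512,  "480 + 32 bytes fit exactly in one slot");
static_assert(DeviceValidLen(481) == 1024, "481 + 32 bytes spill into a second slot");
static_assert(DeviceValidLen(512) == 1024, "a 512-byte payload needs two slots once headroom is added");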
+ +#include "ATenGeBridge.h" +#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" +#include + +#include + +namespace at_npu { +namespace native { +namespace { +std::map kScalarTypeToGeDType{ + {at::ScalarType::Byte, ge::DataType::DT_UINT8}, + {at::ScalarType::Char, ge::DataType::DT_INT8}, + {at::ScalarType::Bool, ge::DataType::DT_BOOL}, + {at::ScalarType::Double, ge::DataType::DT_DOUBLE}, + {at::ScalarType::Float, ge::DataType::DT_FLOAT}, + {at::ScalarType::Half, ge::DataType::DT_FLOAT16}, + {at::ScalarType::Short, ge::DataType::DT_INT16}, + {at::ScalarType::Int, ge::DataType::DT_INT32}, + {at::ScalarType::Long, ge::DataType::DT_INT64}, +}; + +std::map kRealDtypeToGeType { + {"uint16", ge::DataType::DT_UINT16}, +}; + +at::Tensor ConstructCpuTenosr(const c10::Scalar& scalar_input, c10::ScalarType type) { + return scalar_to_tensor(scalar_input).to(type); +} + +at::Tensor ConstructCpuTenosr( + const std::vector& list_input, + c10::ScalarType dtype) { + auto cpu_tensor = at::from_blob( + const_cast(reinterpret_cast(list_input.data())), + {list_input.size()}, + c10::TensorOptions(at::kCPU).dtype(at::kLong)); + if (dtype != at::kLong) { + return cpu_tensor.to(dtype); + } + return cpu_tensor; +} +} // namespace + +template <> +void ATenGeBridge::SetGeOpAttr> + (const c10::any& attr_val, ge::OperatorPtr ge_op) { + auto attr = TryToGetAnyValue>(attr_val); + ge_op->SetAttr(attr.first.c_str(), ge::AscendString(attr.second.c_str())); +} + +ge::DataType ATenGeBridge::GetGeDType(c10::ScalarType type) { + auto iter = kScalarTypeToGeDType.find(type); + if (iter == kScalarTypeToGeDType.end()) { + AT_ERROR("Unsupported convert this ATen DType: %s to Ge DType", type); + } + return iter->second; +} + +ge::DataType ATenGeBridge::GetGeDType(caffe2::TypeMeta type_meta) { + auto aten_dtype = c10::typeMetaToScalarType(type_meta); + return GetGeDType(aten_dtype); +} + +ge::DataType ATenGeBridge::GetGeDType(const std::string& real_dtype) { + auto iter = kRealDtypeToGeType.find(real_dtype); + if (iter == kRealDtypeToGeType.end()) { + AT_ERROR("Unsupported convert this ATen DType: %s to Ge DType", real_dtype); + } + return iter->second; +} + +ge::Shape ATenGeBridge::GetGeShape(c10::ArrayRef vec) { + return ge::Shape(std::vector(vec.begin(), vec.end())); +} + +ge::TensorDesc ATenGeBridge::InferGeTenosrDesc( + const c10::NPUStorageDesc& storage_desc, + const caffe2::TypeMeta& type_meta, + const c10::optional& real_dtype, + bool is_op_desc) { + ge::TensorDesc desc; + + if (real_dtype.has_value()) { + desc.SetDataType(ATenGeBridge::GetGeDType(real_dtype.value())); + } else { + desc.SetDataType(ATenGeBridge::GetGeDType(type_meta)); + } + + desc.SetPlacement(ge::kPlacementDevice); + desc.SetOriginShape( + ATenGeBridge::GetGeShape(storage_desc.base_sizes_)); + desc.SetOriginFormat(ge::Format(storage_desc.origin_format_)); + + /* + * NB + * AOE does not support inner format + * So we set Operator description as origin format and shape + * Then we can dump ge graph to begin offline auto tune + * + * data1 data2 data1 data2 + * (nchw/nchw) (nchw/nchw) (nchw/5hd) (nchw/fz) + * \ / \ / + * \ / Param:input_tensors{tensor1(nchw/5hd), tensor2(nchw/fz)} \ / + * \ / -----------------RunGraphWithStreamAsync-----------------> \ / + * conv2d conv2d + * | | + * | | + * netoutput netoutput + * + * In graph, we set data node as data1:nchw(origin format) / nchw (format) + * and data2: nchw(origin format) / nchw (format) + * when we run graph, we give input tensors as tensor1:nchw(origin format) / 5hd(format) + * and tensor2:nchw(origin 
format) / fz(format) + * In interface RunGraphWithStreamAsync, ge will refresh data description with input tensor description + * to support inner format + * In aoe scene, we dump raw graph without inner format + */ + if (is_op_desc) { + desc.SetShape(ATenGeBridge::GetGeShape(storage_desc.base_sizes_)); + desc.SetFormat(ge::Format(storage_desc.origin_format_)); + } else { + desc.SetShape(ATenGeBridge::GetGeShape(storage_desc.storage_sizes_)); + desc.SetFormat(ge::Format(storage_desc.npu_format_)); + } + + return desc; +} + +template +void ATenGeBridge::SetGeOpConstInput( + const c10::any& const_input, + ge::OperatorPtr ge_op) { + auto const_input_tuple = + ATenGeBridge::TryToGetAnyValue(const_input); + at::Tensor cpu_tensor = ConstructCpuTenosr( + std::get<1>(const_input_tuple), std::get<2>(const_input_tuple)); + auto ge_data_type = GetGeDType(std::get<2>(const_input_tuple)); + ge::TensorDesc ge_tensor_desc{ + ge::Shape(cpu_tensor.sizes().vec()), ge::Format::FORMAT_ND, ge_data_type}; + ge::Tensor ge_tenosr{ + ge_tensor_desc, + reinterpret_cast(cpu_tensor.data_ptr()), + cpu_tensor.nbytes()}; + + auto const_op = std::make_shared(); + const_op->set_attr_value(ge_tenosr); + ge_op->SetInput(std::get<0>(const_input_tuple), *const_op, 0); +} + +void ATenGeBridge::SetSensitiveFormat( + const c10::any& sensitive_format, + ge::OperatorPtr ge_op, + NodeExtInfoType ext_type) { + auto sensitive_format_pair = + TryToGetAnyValue>(sensitive_format); + if (ext_type == NodeExtInfoType::SENSITIVE_FORMAT_OF_INPUT) { + auto tmp_desc = + ge_op->GetInputDescByName(sensitive_format_pair.first.c_str()); + tmp_desc.SetFormat(ge::Format(sensitive_format_pair.second)); + tmp_desc.SetOriginFormat(ge::Format(sensitive_format_pair.second)); + ge_op->UpdateInputDesc(sensitive_format_pair.first.c_str(), tmp_desc); + } else { + auto tmp_desc = + ge_op->GetOutputDescByName(sensitive_format_pair.first.c_str()); + tmp_desc.SetFormat(ge::Format(sensitive_format_pair.second)); + tmp_desc.SetOriginFormat(ge::Format(sensitive_format_pair.second)); + ge_op->UpdateOutputDesc(sensitive_format_pair.first.c_str(), tmp_desc); + } +} + +void ATenGeBridge::AddNodeExtInfoIntoGeOp( + c10::ArrayRef> ext_info, + ge::OperatorPtr ge_op) { + for (const auto& info : ext_info) { + switch (info.first) { + case NodeExtInfoType::ATTR_TYPE_BOOL: + SetGeOpAttr>(info.second, ge_op); + break; + case NodeExtInfoType::ATTR_TYPE_LONG: + SetGeOpAttr>(info.second, ge_op); + break; + case NodeExtInfoType::ATTR_TYPE_FLOAT: + SetGeOpAttr>(info.second, ge_op); + break; + case NodeExtInfoType::ATTR_TYPE_STRING: + SetGeOpAttr>(info.second, ge_op); + break; + case NodeExtInfoType::ATTR_TYPE_LIST_LONG: + SetGeOpAttr>>(info.second, ge_op); + break; + case NodeExtInfoType::ATTR_TYPE_LIST_FLOAT: + SetGeOpAttr>>(info.second, ge_op); + break; + case NodeExtInfoType::INPUT_TYPE_SCALAR: + SetGeOpConstInput>( + info.second, ge_op); + break; + case NodeExtInfoType::INPUT_TYPE_LIST_LONG: + SetGeOpConstInput, c10::ScalarType>>( + info.second, ge_op); + break; + case NodeExtInfoType::SENSITIVE_FORMAT_OF_INPUT: + SetSensitiveFormat( + info.second, ge_op, NodeExtInfoType::SENSITIVE_FORMAT_OF_INPUT); + break; + case NodeExtInfoType::SENSITIVE_FORMAT_OF_OUTPUT: + SetSensitiveFormat( + info.second, ge_op, NodeExtInfoType::SENSITIVE_FORMAT_OF_OUTPUT); + break; + default: + AT_ERROR( + "Has no method to process node ext info type: %d", + static_cast::type>( + info.first)); + } + } +} + +void ATenGeBridge::PorcessDynamicInputReg( + NodePtr node, + ge::OperatorPtr& ge_op, + std::string 
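AddNodeExtInfoIntoGeOp above replays attributes and const inputs that were captured as (NodeExtInfoType, c10::any) pairs while the op was being recorded; the enum tag tells it which concrete pair type to any_cast back out before calling SetAttr or SetInput. The pattern in miniature with std::any and a toy operator type (ExtTag, ToyOp and ReplayAttr are illustrative names, not the patch's API):

#include <any>
#include <cstdint>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

enum class ExtTag { AttrBool, AttrLong, AttrString };
using ExtInfo = std::pair<ExtTag, std::any>;

struct ToyOp {  // stands in for ge::OperatorPtr->SetAttr(...)
  template <typename T>
  void SetAttr(const std::string& name, const T&) { std::cout << "SetAttr(" << name << ")\n"; }
};

template <typename Pair>
void ReplayAttr(const std::any& boxed, ToyOp& op) {
  const auto& attr = std::any_cast<const Pair&>(boxed);   // throws on tag/type mismatch
  op.SetAttr(attr.first, attr.second);
}

// The tag is the single source of truth for the boxed type, exactly like the
// switch over NodeExtInfoType in AddNodeExtInfoIntoGeOp.
void Replay(const std::vector<ExtInfo>& infos, ToyOp& op) {
  for (const auto& [tag, boxed] : infos) {
    switch (tag) {
      case ExtTag::AttrBool:   ReplayAttr<std::pair<std::string, bool>>(boxed, op); break;
      case ExtTag::AttrLong:   ReplayAttr<std::pair<std::string, int64_t>>(boxed, op); break;
      case ExtTag::AttrString: ReplayAttr<std::pair<std::string, std::string>>(boxed, op); break;
    }
  }
}

int main() {
  ToyOp op;
  Replay({{ExtTag::AttrLong, std::make_pair(std::string("axis"), int64_t{1})},
          {ExtTag::AttrBool, std::make_pair(std::string("keep_dims"), true)}},
         op);
}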
op_name) { + auto& ext_info = node->GetExtInfo(); + auto it = std::find_if( + ext_info.begin(), + ext_info.end(), + [](const std::pair& item) { + return item.first == NodeExtInfoType::DYNAMIC_INPUT_FUNC; + }); + if (it != ext_info.end()) { + auto func_and_para = + TryToGetAnyValue>( + it->second); + ge_op = func_and_para.first(func_and_para.second, op_name); + + // no need to process it anymore + ext_info.erase(it); + } + return; +} + +void ATenGeBridge::CheckAndBuildGeOpForNode(NodePtr node) { + if (node->GetGeOp() != nullptr) { + return; + } + static uint64_t op_index = 0; + const std::string op_type = node->GetOpType(); + TORCH_CHECK( + ge::OperatorFactory::IsExistOp(op_type.c_str()), + "Cur op type: %s is not exit", + op_type); + std::string op_name = op_type + std::to_string(op_index++); + ge::OperatorPtr ge_op = nullptr; + PorcessDynamicInputReg(node, ge_op, op_name); + if (ge_op == nullptr) { + ge_op = std::make_shared( + ge::OperatorFactory::CreateOperator(op_name.c_str(), op_type.c_str())); + } + AddNodeExtInfoIntoGeOp(node->GetExtInfo(), ge_op); + node->SetGeOp(ge_op); + return; +} + +} // namespace native +} // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/framework/graph/util/ATenGeBridge.h b/torch_npu/csrc/framework/graph/util/ATenGeBridge.h new file mode 100644 index 0000000000000000000000000000000000000000..f2faf60507c75a1406cc212fd3f0d3a1d36b70f1 --- /dev/null +++ b/torch_npu/csrc/framework/graph/util/ATenGeBridge.h @@ -0,0 +1,89 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
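CheckAndBuildGeOpForNode above builds a node's ge::Operator at most once (the node caches it) and names every built operator "<op type><global counter>" so no two operators in a session collide, while PorcessDynamicInputReg only pre-creates the operator when a dynamic-input registration function was recorded. The build-once-and-name part as a small sketch (ToyGeOp and ToyNode are stand-ins for the real node and operator types):

#include <cstdint>
#include <memory>
#include <string>

struct ToyGeOp { std::string name; };

struct ToyNode {
  std::string op_type;
  std::shared_ptr<ToyGeOp> ge_op;    // null until the first build
};

void CheckAndBuildOp(ToyNode& node) {
  if (node.ge_op != nullptr) {
    return;                          // already built on an earlier visit
  }
  static uint64_t op_index = 0;      // process-wide counter, never reused
  node.ge_op = std::make_shared<ToyGeOp>(ToyGeOp{node.op_type + std::to_string(op_index++)});
}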
+ +#pragma once + +#include +#include +#include +#include +#include + +namespace at_npu { +namespace native { + +using c10::npu::graph::NodeExtInfoType; +using c10::npu::graph::DyNumAndIndex; +using c10::npu::graph::DynamicInputRegFunc; +using c10::npu::graph::NodePtr; + +class ATenGeBridge { +public: + static ge::DataType GetGeDType(c10::ScalarType type); + + static ge::DataType GetGeDType(caffe2::TypeMeta type_meta); + + static ge::DataType GetGeDType(const std::string& real_dtype); + + static ge::Shape GetGeShape(c10::ArrayRef vec); + + static ge::TensorDesc InferGeTenosrDesc( + const c10::NPUStorageDesc& storage_desc, + const caffe2::TypeMeta& type_meta, + const c10::optional& real_dtype, + bool is_op_desc = false); + + static void CheckAndBuildGeOpForNode(NodePtr node); + +private: + template + static T TryToGetAnyValue(const c10::any& any_val) { + T val; + try { + val = c10::any_cast(any_val); + } catch (c10::bad_any_cast &bd) { + AT_ERROR(bd.what(), typeid(T).name()); + } + return val; + } + + template + static void SetGeOpConstInput( + const c10::any& const_input, + ge::OperatorPtr ge_op); + + static void SetSensitiveFormat( + const c10::any& sensitive_format, + ge::OperatorPtr ge_op, + NodeExtInfoType ext_type); + + static void PorcessDynamicInputReg( + NodePtr node, + ge::OperatorPtr& ge_op, + std::string op_name); + + template + static void SetGeOpAttr(const c10::any& attr_val, ge::OperatorPtr ge_op) { + AttrType attr = TryToGetAnyValue(attr_val); + ge_op->SetAttr(attr.first.c_str(), attr.second); + } + + static void AddNodeExtInfoIntoGeOp( + c10::ArrayRef> ext_info, + ge::OperatorPtr ge_op); +}; +} // namespace native +} // namespace at_npu diff --git a/torch_npu/csrc/framework/graph/util/GraphModeGuard.h b/torch_npu/csrc/framework/graph/util/GraphModeGuard.h new file mode 100644 index 0000000000000000000000000000000000000000..c8eadd73b34c2059bf4e0a55b7953544a5df13c8 --- /dev/null +++ b/torch_npu/csrc/framework/graph/util/GraphModeGuard.h @@ -0,0 +1,52 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "torch_npu/csrc/framework/graph/execute/GraphExecutor.h" +#include "torch_npu/csrc/core/npu/NPURunMode.h" + +namespace at_npu { +namespace native { +class GraphModeGuard { +public: + GraphModeGuard() = delete; + GraphModeGuard(const GraphModeGuard& other) = delete; + GraphModeGuard(GraphModeGuard&& other) = delete; + GraphModeGuard& operator=(const GraphModeGuard& other) = delete; + GraphModeGuard& operator=(GraphModeGuard&& other) = delete; + + explicit GraphModeGuard(c10_npu::ModeKind mode) : mode_(mode) { + ori_mode_ = c10_npu::NpuRunMode::IsGraphMode() + ? 
c10_npu::ModeKind::GRAPH_MODE + : c10_npu::ModeKind::SINGLE_OP_MODE; + if ((ori_mode_ == c10_npu::ModeKind::GRAPH_MODE) && + (mode_ == c10_npu::ModeKind::SINGLE_OP_MODE)) { + GraphExecutor::GetInstance().ConstructAndExecuteGraph(); + } + c10_npu::NpuRunMode::SetNpuRunMode(mode_); + } + + ~GraphModeGuard() { + c10_npu::NpuRunMode::SetNpuRunMode(ori_mode_); + } + +private: + c10_npu::ModeKind ori_mode_; + c10_npu::ModeKind mode_; +}; +} // namespace native +} // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/framework/graph/util/GraphUtils.cpp b/torch_npu/csrc/framework/graph/util/GraphUtils.cpp new file mode 100644 index 0000000000000000000000000000000000000000..81a3430d5d2640f433842cb88326f3163364eb71 --- /dev/null +++ b/torch_npu/csrc/framework/graph/util/GraphUtils.cpp @@ -0,0 +1,94 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "GraphUtils.h" + +#include + +namespace at_npu { +namespace native { +Value& GraphUtils::GetTensorIrValue(const at::Tensor& tensor) { + auto storage = tensor.storage().unsafeGetStorageImpl(); + TORCH_CHECK(storage != nullptr, "Storage is null"); + return storage->get_mutable_npu_graph_desc().graph_value; +} + +hash_t GraphUtils::GetTensorIrValueHash(const at::Tensor& tensor) { + return GetTensorIrValue(tensor).GetValueHash(); +} + +void GraphUtils::SetTensorIrValue(c10::StorageImpl* storage, const Value& value) { + TORCH_CHECK(storage != nullptr, "Storage is null"); + auto& npu_graph_desc = storage->get_mutable_npu_graph_desc(); + npu_graph_desc.graph_value.UpdateFromOther(value); + return; +} + +void GraphUtils::SetTensorIrValue( + const at::Tensor& tensor, + const Value& value) { + SetTensorIrValue(tensor.storage().unsafeGetStorageImpl(), value); + return; +} + +void GraphUtils::SetDataOp(c10::StorageImpl* storage) { + TORCH_CHECK(storage != nullptr, "Storage is null"); + auto data_node = std::make_shared("Data"); + auto data_value = Value(data_node, data_node, 0); + SetTensorIrValue(storage, data_value); +} + +void GraphUtils::SetDataOp(const at::Tensor& tensor) { + SetDataOp(tensor.storage().unsafeGetStorageImpl()); +} + +void GraphUtils::ResetOp(c10::StorageImpl* storage) { + TORCH_CHECK(storage != nullptr, "Storage is null"); + storage->get_mutable_npu_graph_desc().graph_value.ResetValue(); +} +void GraphUtils::ResetOp(at::Tensor& tensor) { + ResetOp(tensor.storage().unsafeGetStorageImpl()); +} + +bool GraphUtils::IsDataTensor(const c10::StorageImpl* storage) { + TORCH_CHECK(storage != nullptr, "Storage is null"); + auto& value = storage->get_mutable_npu_graph_desc().graph_value; + auto cur_node = value.GetCurNode(); + TORCH_CHECK(cur_node != nullptr, "Cur storage does not have node"); + return (cur_node->GetOpType() == "Data"); +} + +bool GraphUtils::IsDataTensor(const at::Tensor& tensor) { + return IsDataTensor(tensor.storage().unsafeGetStorageImpl()); +} + +bool GraphUtils::IsTensorWithoutNode(const c10::StorageImpl* storage) { + 
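GraphModeGuard above gives call sites such as copy_d2h and _local_scalar_dense a scoped way to drop to single-op execution: entering the guard from graph mode first flushes the pending graph, and the destructor restores whatever mode was active before. The same contract as a self-contained sketch (ToyModeGuard, Mode and the flush callback are illustrative):

#include <functional>

enum class Mode { SingleOp, Graph };

class ToyModeGuard {
 public:
  ToyModeGuard(Mode& current, Mode wanted, std::function<void()> flush_graph)
      : current_(current), saved_(current) {
    if (saved_ == Mode::Graph && wanted == Mode::SingleOp) {
      flush_graph();               // the recorded graph must run before any eager op
    }
    current_ = wanted;
  }
  ~ToyModeGuard() { current_ = saved_; }

  ToyModeGuard(const ToyModeGuard&) = delete;
  ToyModeGuard& operator=(const ToyModeGuard&) = delete;

 private:
  Mode& current_;
  Mode saved_;
};

// Typical call site, analogous to the d2h copy path: force single-op execution
// for the duration of a host read, then fall back to the previous mode.
void ReadBackToHost(Mode& run_mode) {
  ToyModeGuard guard(run_mode, Mode::SingleOp, [] { /* ConstructAndExecuteGraph() */ });
  // ... issue the synchronous device-to-host copy here ...
}

Because the flush happens in the constructor, the guard is also what makes reads like tensor.cpu() observe up-to-date values while graph mode is on.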
TORCH_CHECK(storage != nullptr, "Storage is null"); + return !storage->get_npu_graph_desc().graph_value.HashNode(); +} + +bool GraphUtils::IsTensorWithoutNode(const at::Tensor& tensor) { + return IsTensorWithoutNode(tensor.storage().unsafeGetStorageImpl()); +} + +void GraphUtils::RetainGraphDataTensor(const at::Tensor& data_tensor) { + auto storage = data_tensor.storage().unsafeGetStorageImpl(); + auto storage_ptr = c10::intrusive_ptr::reclaim(storage); + c10::npu::graph::NpuGraphContextManager::GetInstance().AddInputStorage( + storage_ptr); + storage_ptr.release(); +} +} // namespace native +} // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/framework/graph/util/GraphUtils.h b/torch_npu/csrc/framework/graph/util/GraphUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..682d5ac018ef9e6576dd91f975263c7f147a18ba --- /dev/null +++ b/torch_npu/csrc/framework/graph/util/GraphUtils.h @@ -0,0 +1,53 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +namespace at_npu { +namespace native { + +using c10::npu::graph::Value; +using c10::npu::hash_utils::hash_t; +class GraphUtils { +public: + static Value& GetTensorIrValue(const at::Tensor& tensor); + + static hash_t GetTensorIrValueHash(const at::Tensor& tensor); + + static void SetTensorIrValue(c10::StorageImpl* storage, const Value& value); + static void SetTensorIrValue(const at::Tensor& tensor, const Value& value); + + static void SetDataOp(c10::StorageImpl* storage); + + static void SetDataOp(const at::Tensor& tensor); + + static void ResetOp(c10::StorageImpl* storage); + static void ResetOp(at::Tensor& tensor); + + static bool IsDataTensor(const c10::StorageImpl* storage); + static bool IsDataTensor(const at::Tensor& tensor); + + static bool IsTensorWithoutNode(const c10::StorageImpl* storage); + static bool IsTensorWithoutNode(const at::Tensor& tensor); + + static void RetainGraphDataTensor(const at::Tensor& data_tensor); +}; +} // namespace native +} // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index aeac53150dbc31ac8dca0edff0c826a9528d9ee9..cf6b6da127821cc997f01517e3ed85c5cef6b37a 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -40,8 +40,10 @@ #include "third_party/acl/inc/acl/acl.h" #include "torch_npu/csrc/core/npu/register/OptionRegister.h" +#include "torch_npu/csrc/framework/graph/execute/GraphExecutor.h" #include "torch_npu/csrc/profiler/cann_profiling.h" #include "torch_npu/csrc/profiler/e2e_profiler.h" +#include "torch_npu/csrc/core/npu/NPURunMode.h" static PyObject* THNPModule_initExtension(PyObject* self, PyObject* noargs) { HANDLE_TH_ERRORS @@ -175,6 +177,44 @@ PyObject * THNPModule_setStream_wrap(PyObject *self, PyObject *obj) END_HANDLE_TH_ERRORS } +PyObject* 
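RetainGraphDataTensor above borrows the raw StorageImpl* as an intrusive_ptr just long enough to register it (reclaim adopts one reference, release hands it back, so the refcount is unchanged), and the context manager keeps only weak references; that is what lets GetAllStorageOfLiveTensors enumerate whatever is still alive at launch time without the registry extending any tensor's lifetime. The ownership shape, sketched with std::weak_ptr standing in for c10::weak_intrusive_ptr (ToyStorage and ToyGraphContext are illustrative):

#include <cstddef>
#include <memory>
#include <vector>

struct ToyStorage { std::size_t nbytes = 0; };

class ToyGraphContext {
 public:
  void AddStorage(const std::shared_ptr<ToyStorage>& s) {
    storages_.push_back(s);                     // weak: does not keep the storage alive
  }

  // Equivalent of GetAllStorageOfLiveTensors: drop expired entries and hand
  // back strong pointers to whatever is still alive at graph-launch time.
  std::vector<std::shared_ptr<ToyStorage>> LiveStorages() {
    std::vector<std::shared_ptr<ToyStorage>> live;
    for (auto it = storages_.begin(); it != storages_.end();) {
      if (auto s = it->lock()) { live.push_back(std::move(s)); ++it; }
      else { it = storages_.erase(it); }
    }
    return live;
  }

 private:
  std::vector<std::weak_ptr<ToyStorage>> storages_;
};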
THNPModule_enable_graph_mode_wrap(PyObject* self, PyObject* noargs) { + HANDLE_TH_ERRORS + pybind11::gil_scoped_release no_gil; + c10_npu::NpuRunMode::SetNpuRunMode(c10_npu::ModeKind::GRAPH_MODE); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject* THNPModule_disable_graph_mode_wrap(PyObject* self, PyObject* noargs) { + HANDLE_TH_ERRORS + pybind11::gil_scoped_release no_gil; + at_npu::native::GraphExecutor::GetInstance().ConstructAndExecuteGraph(); + c10_npu::NpuRunMode::SetNpuRunMode(c10_npu::ModeKind::SINGLE_OP_MODE); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject* THNPModule_launch_graph_wrap(PyObject* self, PyObject* noargs) { + HANDLE_TH_ERRORS + pybind11::gil_scoped_release no_gil; + at_npu::native::GraphExecutor::GetInstance().ConstructAndExecuteGraph(); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject* THNPModule_is_graph_mode_wrap(PyObject* self, PyObject* noargs) { + HANDLE_TH_ERRORS + pybind11::gil_scoped_release no_gil; + auto is_graph_mode = c10_npu::NpuRunMode::IsGraphMode(); + if (is_graph_mode) { + Py_RETURN_TRUE; + } else { + Py_RETURN_FALSE; + } + END_HANDLE_TH_ERRORS +} + + PyObject * THNPModule_emptyCache(PyObject *_unused, PyObject *noargs) { HANDLE_TH_ERRORS @@ -476,6 +516,11 @@ static struct PyMethodDef THNPModule_methods[] = { {"_npu_getCurrentStream", (PyCFunction)THNPModule_getCurrentStream_wrap, METH_O, nullptr}, {"_npu_getDefaultStream", (PyCFunction)THNPModule_getDefaultStream_wrap, METH_O, nullptr}, {"_npu_setStream", (PyCFunction)THNPModule_setStream_wrap, METH_O, nullptr}, + {"_npu_setStream", (PyCFunction)THNPModule_setStream_wrap, METH_O, nullptr}, + {"_npu_enable_graph_mode", (PyCFunction)THNPModule_enable_graph_mode_wrap, METH_NOARGS, nullptr}, + {"_npu_disable_graph_mode", (PyCFunction)THNPModule_disable_graph_mode_wrap, METH_NOARGS, nullptr}, + {"_npu_launch_graph", (PyCFunction)THNPModule_launch_graph_wrap, METH_NOARGS, nullptr}, + {"_npu_is_graph_mode", (PyCFunction)THNPModule_is_graph_mode_wrap, METH_NOARGS, nullptr}, {"_npu_emptyCache", (PyCFunction) THNPModule_emptyCache, METH_NOARGS, nullptr}, {"_npu_memoryStats", (PyCFunction) THNPModule_memoryStats, METH_O, nullptr}, {"_npu_resetAccumulatedMemoryStats", (PyCFunction) THNPModule_resetAccumulatedMemoryStats, METH_O, nullptr}, diff --git a/torch_npu/npu/__init__.py b/torch_npu/npu/__init__.py index c370fd20ea8bd8df2d507c6400d4fb9d99d0cc74..5b2686b359ab187fa4d522dc5d5c2d5728282122 100644 --- a/torch_npu/npu/__init__.py +++ b/torch_npu/npu/__init__.py @@ -44,6 +44,7 @@ from .memory import (_free_mutex, caching_allocator_alloc, caching_allocator_del max_memory_allocated, memory_reserved, max_memory_reserved, memory_cached, max_memory_cached, memory_snapshot, memory_summary) from .streams import Stream, Event +from .graph import is_graph_mode, disable_graph_mode, enable_graph_mode, launch_graph from . 
import profiler from .npu_frontend_enhance import (set_option, set_aoe, profile, prof_init, prof_start, prof_stop, prof_finalize, profileConfig) \ No newline at end of file diff --git a/torch_npu/npu/graph.py b/torch_npu/npu/graph.py new file mode 100644 index 0000000000000000000000000000000000000000..439ec26fc255d213260a6bfc66ec6eadf382e57b --- /dev/null +++ b/torch_npu/npu/graph.py @@ -0,0 +1,22 @@ +import torch_npu +from .utils import _lazy_init + + +def enable_graph_mode(): + torch_npu._C._npu_enable_graph_mode() + + +def disable_graph_mode(): + _lazy_init() + torch_npu._C._npu_disable_graph_mode() + + +def is_graph_mode() -> bool: + return torch_npu._C._npu_is_graph_mode() + + +def launch_graph(): + _lazy_init() + if not is_graph_mode(): + raise RuntimeError("Npu run mode must be graph mode when launch graph") + torch_npu._C._npu_launch_graph() \ No newline at end of file diff --git a/torch_npu/utils/module.py b/torch_npu/utils/module.py index 332995804fd56d4517d4ebb128740d828caaf6cf..c9a295ef213d1122625b6f2ff14d694e75031061 100644 --- a/torch_npu/utils/module.py +++ b/torch_npu/utils/module.py @@ -35,8 +35,14 @@ def npu(self, device=None): if device is None: device = torch.device("npu") if torch_npu.npu.is_available(): + # Ref [cast weight in single op mode] + is_graph_mode = torch_npu.npu.is_graph_mode() + if is_graph_mode: + torch_npu.npu.disable_graph_mode() with torch.no_grad(): self.cast_weight(device) + if is_graph_mode: + torch_npu.npu.enable_graph_mode() return self._apply(lambda t: t.npu(device)) @@ -55,7 +61,14 @@ def to(self, *args, **kwargs): "if a complex module does not work as expected.") if torch_npu.npu.is_available(): with torch.no_grad(): - self.cast_weight(device) + # Ref [cast weight in single op mode] + is_graph_mode = torch_npu.npu.is_graph_mode() + if is_graph_mode: + torch_npu.npu.disable_graph_mode() + with torch.no_grad(): + self.cast_weight(device) + if is_graph_mode: + torch_npu.npu.enable_graph_mode(); def convert(t): if convert_to_format is not None and t.dim() == 4:
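Put together, the Python surface added in graph.py and Module.cpp is meant to be driven roughly as follows. This is a usage sketch rather than code from the patch: which ops actually lower to graph nodes depends on the operator plumbing elsewhere in the plugin, and host reads such as .cpu() force a flush of their own through GraphModeGuard.

import torch
import torch_npu

torch_npu.npu.enable_graph_mode()      # subsequent NPU ops are recorded as IR nodes
assert torch_npu.npu.is_graph_mode()

x = torch.randn(2, 3).npu()
y = x * 2 + 1                          # queued into the graph, not yet executed

torch_npu.npu.launch_graph()           # build (or reuse a cached) GE graph and run it
print(y.cpu())                         # d2h copy drops to single-op mode via GraphModeGuard

torch_npu.npu.disable_graph_mode()     # flush anything pending, back to eager single-op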