diff --git a/tf_adapter_2.x/CI_build b/tf_adapter_2.x/CI_build new file mode 100644 index 0000000000000000000000000000000000000000..9e1d1c2c12443a5ea607dac25825d012f655c2de --- /dev/null +++ b/tf_adapter_2.x/CI_build @@ -0,0 +1,12 @@ +#!/bin/bash + +set -e +set -o pipefail + +cd $(cd "$(dirname $0)"; pwd) + +rm -rf build +mkdir build +cd build +cmake .. -DPYTHON_BIN_PATH=$(which python3) -DPYTHON_INCLUDE_DIR=$(python3 -c "from distutils.sysconfig import get_python_inc; print(get_python_inc())") -DASCEND_CI_BUILD_DIR=$(cd $(pwd)/../../../; pwd) +make -j8 diff --git a/tf_adapter_2.x/CMakeLists.txt b/tf_adapter_2.x/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..ab3aa1e3153c0c4f7ddd30650c6856b26ea65b92 --- /dev/null +++ b/tf_adapter_2.x/CMakeLists.txt @@ -0,0 +1,65 @@ +cmake_minimum_required(VERSION 3.14) +project(NpuDevice) + +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_C_FLAGS "-O2 -DNDEBUG -Wno-deprecated-declarations -Wall -fPIC -fstack-protector-all -Wl,-z,relro,-z,now,-z,noexecstack -s -pipe ${CMAKE_C_FLAGS}") +set(CMAKE_CXX_FLAGS "-O2 -DNDEBUG -Wno-deprecated-declarations -Wall -fPIC -fstack-protector-all -Wl,-z,relro,-z,now,-z,noexecstack -s -pipe ${CMAKE_CXX_FLAGS}") + + +if (DEFINED ASCEND_CI_BUILD_DIR) + set(CMAKE_C_FLAGS "-D_GLIBCXX_USE_CXX11_ABI=0 ${CMAKE_C_FLAGS}") + set(CMAKE_CXX_FLAGS "-D_GLIBCXX_USE_CXX11_ABI=0 ${CMAKE_CXX_FLAGS}") + include_directories(${PYTHON_INCLUDE_DIR}) +else() + if (NOT EXISTS ${CMAKE_CURRENT_LIST_DIR}/tools/COMPILE_FLAGS OR NOT EXISTS + ${CMAKE_CURRENT_LIST_DIR}/tools/TF_INSTALLED_PATH OR NOT EXISTS + ${CMAKE_CURRENT_LIST_DIR}/tools/ASCEND_INSTALLED_PATH OR NOT EXISTS + ${CMAKE_CURRENT_LIST_DIR}/tools/PYTHON_BIN_PATH) + message(FATAL_ERROR "No validate configuration found. Did you forget to configure first?") + endif () + + file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/tools/TF_INSTALLED_PATH" TF_INSTALLED_PATH) + file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/tools/ASCEND_INSTALLED_PATH" ASCEND_INSTALLED_PATH) + file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/tools/PYTHON_BIN_PATH" PYTHON_BIN_PATH) + + file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/tools/COMPILE_FLAGS" CUSTOM_COMPILE_FLAGS) + foreach (COMPILE_FLAG ${CUSTOM_COMPILE_FLAGS}) + set(CMAKE_C_FLAGS "${COMPILE_FLAG} ${CMAKE_C_FLAGS}") + set(CMAKE_CXX_FLAGS "${COMPILE_FLAG} ${CMAKE_CXX_FLAGS}") + endforeach (COMPILE_FLAG) +endif () + +include(${CMAKE_CURRENT_LIST_DIR}/cmake/acl/module.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/cmake/tensorflow/module.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/cmake/graph_engine/module.cmake) + +include_directories(${CMAKE_CURRENT_LIST_DIR}/npu_device/core) + +file(COPY ${CMAKE_CURRENT_LIST_DIR}/npu_device/python DESTINATION ${CMAKE_BINARY_DIR}/dist) + +file(GLOB_RECURSE SOURCES ${CMAKE_CURRENT_LIST_DIR}/npu_device/*.cpp) + +IF (${CMAKE_CXX_COMPILER_ID} STREQUAL GNU) + add_definitions(-Wno-builtin-macro-redefined) +ENDIF () + +foreach (CPP_SOURCE ${SOURCES}) + file(RELATIVE_PATH RELATIVE_CPP_SOURCE ${CMAKE_CURRENT_LIST_DIR} ${CPP_SOURCE}) + set_property(SOURCE ${CPP_SOURCE} PROPERTY COMPILE_DEFINITIONS __FILE__=\"${RELATIVE_CPP_SOURCE}\") +endforeach (CPP_SOURCE) + +add_library(_npu_device_backends SHARED ${SOURCES}) +set_target_properties(_npu_device_backends PROPERTIES PREFIX "") +set(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR}/dist/python/npu_device) + +target_link_libraries(_npu_device_backends PRIVATE + tensorflow_libs + ge_libs + acl_libs) + +add_custom_command(TARGET _npu_device_backends + POST_BUILD + COMMAND cd ${CMAKE_BINARY_DIR}/dist/python/ && 
${PYTHON_BIN_PATH} setup.py bdist_wheel
+        VERBATIM)
+
+install(CODE "execute_process(COMMAND ${PYTHON_BIN_PATH} -m pip install ${CMAKE_BINARY_DIR}/dist/python/dist/npu_device-0.1-py3-none-any.whl --upgrade)")
diff --git a/tf_adapter_2.x/README.md b/tf_adapter_2.x/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b988f7feca8a8556102dc25ba76e1f77b86be173
--- /dev/null
+++ b/tf_adapter_2.x/README.md
@@ -0,0 +1,99 @@
+# Ascend Adapter for TF2.X
+## Installation
+
+### Installing from source
+
+You can build the Ascend Adapter package from source and install it on an Ascend AI processor environment.
+> The Ascend Adapter is strictly matched to the Tensorflow version. Before building from source, make sure [Tensorflow v2.4.0](https://www.tensorflow.org/install) is correctly installed.
+
+
+The system must also meet the following requirements:
+- Linux OS
+- GCC >= 7.3.0
+- CMake >= 3.14.0
+
+#### Downloading the source
+
+```
+git clone ssh://git@10.95.128.221:2222/x00373192/AscendTF2.git
+cd AscendTF2
+```
+
+#### Configuring the build environment
+```BASH
+./configure
+```
+By default, running the command above starts an interactive session like the following
+> Your session may look different.
+
+```BASH
+Please specify the location of python with available tensorflow v2.4.0 installed. [Default is /usr/bin/python3]
+(You can make this quiet by set env [ADAPTER_TARGET_PYTHON_PATH]):
+```
+Here you are asked for the path of a python interpreter with Tensorflow v2.4.0 installed. If the default path is correct, just press Enter; otherwise enter the correct python
+interpreter path.
+> You can suppress this prompt by setting the ADAPTER_TARGET_PYTHON_PATH environment variable, but the path must be valid; otherwise you will still be asked for a correct python interpreter path.
+
+After you press Enter, validating your input takes a few seconds, and then the next prompt appears
+```
+Please specify the location of ascend. [Default is /usr/local/Ascend]
+(You can make this quiet by set env [ASCEND_INSTALLED_PATH]):
+```
+Here you are asked for the installation path of the Ascend processor development kit. If the default path is correct, just press Enter; otherwise enter the correct installation path.
+
+> You can suppress this prompt by setting the ASCEND_INSTALLED_PATH environment variable, but the path must be valid; otherwise you will still be asked for a correct installation path.
+
+After you press Enter, wait for the configuration to finish.
+#### Configuring cmake
+> Depending on your network, downloading the Ascend Adapter's dependencies to complete the configuration may take several minutes.
+
+```
+mkdir build
+cd build
+cmake ..
+```
+
+#### Building
+> You should choose a parallel build level suited to your build machine to speed up compilation.
+
+```BASH
+make -j8
+```
+
+After the build finishes, the package is generated at
+```
+./dist/python/dist/npu_device-0.1-py3-none-any.whl
+```
+
+#### Installing
+You can then run
+```BASH
+make install
+```
+to install the Ascend Adapter into the package directory of the python interpreter chosen during configuration, or use pip3 to install the Ascend Adapter wherever you prefer.
+```
+pip3 install ./dist/python/dist/npu_device-0.1-py3-none-any.whl --upgrade
+```
+Note that the installation location must match the search path of the python interpreter you built against.
+
+#### Basic functionality test
+Before running any script, add the library directory of the Ascend processor development kit to the library search path. For example, if it is installed under /usr/local/Ascend, run
+```
+export LD_LIBRARY_PATH=/usr/local/Ascend/fwkacllib/lib64/
+```
+Then run the example script to check your installation
+```
+python3 examples/basic_tests.py
+```
+A minimal usage sketch is also shown after the release notes below.
+
+## Contributing
+
+Before pushing code, make sure the basic functionality tests and network tests have passed!
+
+## Release Notes
+
+See [RELEASE](RELEASE.md) for the release notes.
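+
+## Minimal usage example
+
+Beyond `examples/basic_tests.py`, a short script along the following lines can serve as a quick smoke test. This is only a sketch based on the `npu_device.open().as_default()` API used in the bundled examples; adapt it to your environment as needed.
+
+```python
+import tensorflow as tf
+import npu_device
+
+# Open the NPU device and make it the default target for eager and tf.function execution.
+npu = npu_device.open().as_default()
+
+@tf.function
+def add(a, b):
+    return a + b
+
+# Runs on the NPU device opened above; the expected result is a tensor holding 3.
+print(add(tf.constant(1), tf.constant(2)))
+```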
+ +## License + +[Apache License 2.0](LICENSE) diff --git a/tf_adapter_2.x/RELEASE.md b/tf_adapter_2.x/RELEASE.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tf_adapter_2.x/cmake/acl/module.cmake b/tf_adapter_2.x/cmake/acl/module.cmake new file mode 100644 index 0000000000000000000000000000000000000000..8d81abded26ec1b358e4ff72379af045d06a2fe5 --- /dev/null +++ b/tf_adapter_2.x/cmake/acl/module.cmake @@ -0,0 +1,22 @@ +add_library(acl_libs INTERFACE) + +if(DEFINED ASCEND_INSTALLED_PATH) + include_directories(${ASCEND_INSTALLED_PATH}/fwkacllib/include) + target_link_libraries(acl_libs INTERFACE + ${ASCEND_INSTALLED_PATH}/fwkacllib/lib64/libascendcl.so + ${ASCEND_INSTALLED_PATH}/fwkacllib/lib64/libacl_op_compiler.so) +else() + include_directories(${ASCEND_CI_BUILD_DIR}/inc/external) + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/_fake.cc + COMMAND touch ${CMAKE_CURRENT_BINARY_DIR}/_fake.cc + ) + + set(fake_sources ${CMAKE_CURRENT_BINARY_DIR}/_fake.cc) + + add_library(ascendcl SHARED ${fake_sources}) + add_library(acl_op_compiler SHARED ${fake_sources}) + target_link_libraries(acl_libs INTERFACE + ascendcl + acl_op_compiler) +endif() \ No newline at end of file diff --git a/tf_adapter_2.x/cmake/graph_engine/module.cmake b/tf_adapter_2.x/cmake/graph_engine/module.cmake new file mode 100644 index 0000000000000000000000000000000000000000..0985d7110ea3aefbff78e6d6e20c27e93650d44a --- /dev/null +++ b/tf_adapter_2.x/cmake/graph_engine/module.cmake @@ -0,0 +1,28 @@ +add_library(ge_libs INTERFACE) + +if(DEFINED ASCEND_INSTALLED_PATH) + include_directories(${CMAKE_CURRENT_LIST_DIR}/../../../inc/graphengine/inc) + include_directories(${CMAKE_CURRENT_LIST_DIR}/../../../inc/graphengine/inc/external) + include_directories(${CMAKE_CURRENT_LIST_DIR}/../../../inc/metadef/inc) + include_directories(${CMAKE_CURRENT_LIST_DIR}/../../../inc/metadef/inc/external) + target_link_libraries(ge_libs INTERFACE + ${ASCEND_INSTALLED_PATH}/fwkacllib/lib64/libge_runner.so + ${ASCEND_INSTALLED_PATH}/fwkacllib/lib64/libfmk_parser.so) +else() + include_directories(${ASCEND_CI_BUILD_DIR}/graphengine/inc) + include_directories(${ASCEND_CI_BUILD_DIR}/graphengine/inc/external) + include_directories(${ASCEND_CI_BUILD_DIR}/metadef/inc) + include_directories(${ASCEND_CI_BUILD_DIR}/metadef/inc/external) + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/_fake.cc + COMMAND touch ${CMAKE_CURRENT_BINARY_DIR}/_fake.cc + ) + + set(fake_sources ${CMAKE_CURRENT_BINARY_DIR}/_fake.cc) + + add_library(ge_runner SHARED ${fake_sources}) + add_library(fmk_parser SHARED ${fake_sources}) + target_link_libraries(ge_libs INTERFACE + ge_runner + fmk_parser) +endif() \ No newline at end of file diff --git a/tf_adapter_2.x/cmake/tensorflow/module.cmake b/tf_adapter_2.x/cmake/tensorflow/module.cmake new file mode 100644 index 0000000000000000000000000000000000000000..72b39d7e6205c664c12828b0ded1fa727e72f605 --- /dev/null +++ b/tf_adapter_2.x/cmake/tensorflow/module.cmake @@ -0,0 +1,30 @@ +add_library(tensorflow_libs INTERFACE) + +if(DEFINED TF_INSTALLED_PATH) + SET(TF_INCLUDE_DIR ${TF_INSTALLED_PATH}) + target_link_libraries(tensorflow_libs INTERFACE + ${TF_INSTALLED_PATH}/python/_pywrap_tensorflow_internal.so + ${TF_INSTALLED_PATH}/libtensorflow_framework.so.2) +else() + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/_fake.cc + COMMAND touch ${CMAKE_CURRENT_BINARY_DIR}/_fake.cc + ) + + set(fake_sources ${CMAKE_CURRENT_BINARY_DIR}/_fake.cc) + + 
add_library(tensorflow_framework SHARED ${fake_sources}) + set_target_properties(tensorflow_framework PROPERTIES VERSION 2) + + add_library(pywrap_tensorflow_internal SHARED ${fake_sources}) + set_target_properties(pywrap_tensorflow_internal PROPERTIES PREFIX _) + + SET(TF_INCLUDE_DIR ${ASCEND_CI_BUILD_DIR}/third_party/tensorflow/compile_deps/tf-2.4.0) + target_link_libraries(tensorflow_libs INTERFACE + tensorflow_framework + pywrap_tensorflow_internal) +endif() + +include_directories(${TF_INCLUDE_DIR}/include) +include_directories(${TF_INCLUDE_DIR}/include/external/farmhash_archive/src) +include_directories(${TF_INCLUDE_DIR}/include/external/pybind11/_virtual_includes/pybind11) \ No newline at end of file diff --git a/tf_adapter_2.x/compile b/tf_adapter_2.x/compile new file mode 100644 index 0000000000000000000000000000000000000000..2f24227dacb29366b4d4a2927ae65e675fcc035f --- /dev/null +++ b/tf_adapter_2.x/compile @@ -0,0 +1,10 @@ +#!/bin/bash + +set -e +set -o pipefail + +rm -rf build +mkdir build +cd build +cmake .. +make -j8 diff --git a/tf_adapter_2.x/configure b/tf_adapter_2.x/configure new file mode 100644 index 0000000000000000000000000000000000000000..a953879ec96b3860b016b111c38c2b1ad419ef84 --- /dev/null +++ b/tf_adapter_2.x/configure @@ -0,0 +1,14 @@ +#!/bin/bash + +set -e +set -o pipefail + +if [ -z "$PYTHON_BIN_PATH" ]; then + PYTHON_BIN_PATH=$(which python3 || which python || true) +fi + +# Set all env variables +CONFIGURE_DIR=$(dirname "$0") +"$PYTHON_BIN_PATH" "${CONFIGURE_DIR}/configure.py" "$@" + +echo "Configuration finished" diff --git a/tf_adapter_2.x/configure.py b/tf_adapter_2.x/configure.py new file mode 100644 index 0000000000000000000000000000000000000000..393d457923426f04dffaaf12fb4e70f99fa277e4 --- /dev/null +++ b/tf_adapter_2.x/configure.py @@ -0,0 +1,136 @@ +# Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import subprocess +import sys + +try: + from shutil import which +except ImportError: + from distutils.spawn import find_executable as which + +_COMPAT_TENSORFLOW_VERSION = "2.4.0" +_PYTHON_BIN_PATH_ENV = "ADAPTER_TARGET_PYTHON_PATH" +_ASCEND_INSTALLED_PATH_ENV = "ASCEND_INSTALLED_PATH" + +def run_command(cmd): + output = subprocess.check_output(cmd) + return output.decode('UTF-8').strip() + + +def get_input(question): + try: + try: + answer = raw_input(question) + except NameError: + answer = input(question) + except EOFError: + answer = '' + return answer + + +def real_config_path(file): + return os.path.join("tools", file) + + +def setup_python(env_path): + """Get python install path.""" + default_python_bin_path = sys.executable + ask_python_bin_path = ('Please specify the location of python with valid ' + 'tensorflow 2.4.0 site-packages installed. 
[Default ' + 'is %s]\n(You can make this quiet by set env [ADAPTER_TARGET_PYTHON_PATH]): ') % default_python_bin_path + custom_python_bin_path = env_path + while True: + if not custom_python_bin_path: + python_bin_path = get_input(ask_python_bin_path) + else: + python_bin_path = custom_python_bin_path + custom_python_bin_path = None + if not python_bin_path: + python_bin_path = default_python_bin_path + pass + # Check if the path is valid + if os.path.isfile(python_bin_path) and os.access(python_bin_path, os.X_OK): + pass + elif not os.path.exists(python_bin_path): + print('Invalid python path: %s cannot be found.' % python_bin_path) + continue + else: + print('%s is not executable. Is it the python binary?' % python_bin_path) + continue + + try: + compile_args = run_command([ + python_bin_path, '-c', + 'import distutils.sysconfig; import tensorflow as tf; print(tf.__version__ + "|" + tf.sysconfig.get_lib(' + ') + "|" + "|".join(tf.sysconfig.get_compile_flags()) + "|" + distutils.sysconfig.get_python_inc())' + ]).split("|") + if not compile_args[0].startswith(_COMPAT_TENSORFLOW_VERSION): + print('Invalid python path: %s compat tensorflow version is %s' + ' got %s.' % (python_bin_path, _COMPAT_TENSORFLOW_VERSION, + compile_args[0])) + continue + except subprocess.CalledProcessError: + print('Invalid python path: %s tensorflow not installed.' % + python_bin_path) + continue + # Write tools/python_bin_path.sh + with open(real_config_path('PYTHON_BIN_PATH'), 'w') as f: + f.write(python_bin_path) + with open(real_config_path('COMPILE_FLAGS'), 'w') as f: + for flag in compile_args[2:-1]: + f.write(flag + '\n') + f.write("-I" + compile_args[-1] + '\n') + with open(real_config_path('TF_INSTALLED_PATH'), 'w') as f: + f.write(compile_args[1]) + break + + +def setup_ascend(env_path): + """Get ascend install path.""" + default_ascend_path = "/usr/local/Ascend" + ask_ascend_path = ('Please specify the location of ascend. [Default is ' + '%s]\n(You can make this quiet by set env [ASCEND_INSTALLED_PATH]): ') % default_ascend_path + custom_ascend_path = env_path + while True: + if not custom_ascend_path: + ascend_path = get_input(ask_ascend_path) + else: + ascend_path = custom_ascend_path + custom_ascend_path = None + if not ascend_path: + ascend_path = default_ascend_path + # Check if the path is valid + if os.path.isdir(ascend_path) and os.access(ascend_path, os.X_OK): + break + elif not os.path.exists(ascend_path): + print('Invalid ascend path: %s cannot be found.' 
% ascend_path) + + with open(real_config_path('ASCEND_INSTALLED_PATH'), 'w') as f: + f.write(ascend_path) + + +def main(): + env_snapshot = dict(os.environ) + setup_python(env_snapshot.get(_PYTHON_BIN_PATH_ENV)) + setup_ascend(env_snapshot.get(_ASCEND_INSTALLED_PATH_ENV)) + + +if __name__ == '__main__': + main() diff --git a/tf_adapter_2.x/docs/framework.jpg b/tf_adapter_2.x/docs/framework.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e7729bfb507a80ad8423c67d7dd41d6a16ff691d Binary files /dev/null and b/tf_adapter_2.x/docs/framework.jpg differ diff --git a/tf_adapter_2.x/examples/basic_tests.py b/tf_adapter_2.x/examples/basic_tests.py new file mode 100644 index 0000000000000000000000000000000000000000..998302458d78d5a10de801b9b226c1583988742a --- /dev/null +++ b/tf_adapter_2.x/examples/basic_tests.py @@ -0,0 +1,440 @@ +import unittest +import os +import tensorflow as tf +from tensorflow.python.eager import context +from tensorflow.python.ops import gen_resource_variable_ops + +import npu_device + +npu = npu_device.open().as_default() + + +def tensor_equal(t1, t2): + return (t1.numpy() == t2.numpy()).all() + + +@tf.function +def foo_add(v1, v2): + return v1 + v2 + + +@tf.function +def foo_add_(v): + return v.assign_add(1) + + +@tf.function +def foo_cpu_add_(v): + with context.device("/job:localhost/replica:0/task:0/device:CPU:0"): + return v.assign_add(1) + + +class RaiseTest(unittest.TestCase): + def test_raise1(self): + with context.device("/job:localhost/replica:0/task:0/device:CPU:0"): + x = tf.Variable(1) + y = tf.Variable(1) + self.assertRaises(tf.errors.InvalidArgumentError, foo_add, x, y) + + def test_basic1(self): + self.assertTrue(tensor_equal(foo_add(1, 2), tf.constant(3))) + + def test_basic2(self): + self.assertTrue(tensor_equal(tf.add(1, 2), tf.constant(3))) + + def test_basic3(self): + x = tf.Variable(1) + self.assertTrue(tensor_equal(foo_add_(x), tf.constant(2))) + + def test_basic4(self): + with context.device("/job:localhost/replica:0/task:0/device:CPU:0"): + x = tf.Variable(1) + self.assertTrue(tensor_equal(foo_add_(x), tf.constant(2))) + + def test_basic5(self): + with context.device("/job:localhost/replica:0/task:0/device:CPU:0"): + x = tf.Variable(1) + self.assertTrue(tensor_equal(foo_cpu_add_(x), tf.constant(2))) + + def test_basic6(self): # Force run on npu by tensorflow + x = tf.Variable(1) + self.assertTrue(tensor_equal(foo_cpu_add_(x), tf.constant(2))) + + def test_basic7(self): # Force run on npu by tensorflow + x = tf.Variable(1) + self.assertTrue(x.device == npu.name()) + self.assertTrue(foo_cpu_add_(x).device == "/job:localhost/replica:0/task:0/device:CPU:0") + with context.device("/job:localhost/replica:0/task:0/device:CPU:0"): + x = tf.Variable(1) + self.assertTrue(foo_add_(x).device == "/job:localhost/replica:0/task:0/device:CPU:0") + + def test_shared_variable(self): + x = gen_resource_variable_ops.var_handle_op(dtype=tf.float32, shape=(1, 2), shared_name="variable_1") + gen_resource_variable_ops.assign_variable_op(x, tf.constant([[1.0, 2.0]])) + y = gen_resource_variable_ops.var_handle_op(dtype=tf.float32, shape=(1, 2), shared_name="variable_1") + gen_resource_variable_ops.assign_variable_op(y, tf.constant([[2.0, 3.0]])) + read_x = gen_resource_variable_ops.read_variable_op(x, dtype=tf.float32) + read_y = gen_resource_variable_ops.read_variable_op(y, dtype=tf.float32) + self.assertTrue(tensor_equal(read_x, read_y)) + + x = gen_resource_variable_ops.var_handle_op(dtype=tf.float32, shape=(1, 2), shared_name=context.shared_name()) 
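+        # context.shared_name() generates a unique anonymous name on each call, so x (above)
+        # and y (below) refer to two distinct resources, unlike the "variable_1" handles that
+        # share one variable; their reads are therefore expected to differ.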
+ gen_resource_variable_ops.assign_variable_op(x, tf.constant([[1.0, 2.0]])) + y = gen_resource_variable_ops.var_handle_op(dtype=tf.float32, shape=(1, 2), shared_name=context.shared_name()) + gen_resource_variable_ops.assign_variable_op(y, tf.constant([[2.0, 3.0]])) + read_x = gen_resource_variable_ops.read_variable_op(x, dtype=tf.float32) + read_y = gen_resource_variable_ops.read_variable_op(y, dtype=tf.float32) + self.assertFalse(tensor_equal(read_x, read_y)) + + def test_anonymous_variable(self): + x = tf.Variable([[1.0, 2.0]], dtype=tf.float32, name="x") + y = tf.Variable([[1.0, 2.0]], dtype=tf.float32, name="x") + x.assign_add([[1.0, 1.0]]) + self.assertFalse(tensor_equal(x, y)) + + def test_matmul(self): + input = tf.constant([[1.0], [2.0]]) + weight = tf.Variable([[2.0, 1.0]], dtype=tf.float32) + logit = tf.matmul(input, weight) + self.assertTrue(tensor_equal(logit, tf.constant([[2., 1.], [4., 2.]]))) + + def test_unique(self): + x = tf.constant([1, 1, 2, 4, 4, 4, 7, 8, 8]) + y, idx = tf.unique(x) + self.assertTrue(tensor_equal(y, tf.constant([1, 2, 4, 7, 8]))) + + def test_dataset(self): + dataset = tf.data.Dataset.from_tensor_slices(tf.constant([2])) + iterator = iter(dataset) + self.assertTrue(tensor_equal(next(iterator), tf.constant(2))) + try: + next(iterator) + except Exception as e: + self.assertTrue(isinstance(e, StopIteration)) + + def test_dataset_function(self): + dataset = tf.data.Dataset.from_tensor_slices(tf.constant([2])) + iterator = iter(dataset) + + @tf.function + def f(iterator): + return next(iterator) + + self.assertTrue(tensor_equal(f(iterator), tf.constant(2))) + self.assertRaises(tf.errors.OutOfRangeError, f, iterator) + + def test_checkpoint(self): + step = tf.Variable(0, name="step") # 0 + checkpoint = tf.train.Checkpoint(step=step) + checkpoint.write("./ckpt") + step.assign_add(1) # 1 + checkpoint.read("./ckpt") + self.assertTrue(tensor_equal(step, tf.constant(0))) + + def test_same_python_name_function(self): + def f1(): + @tf.function + def f(x): + return x + 1 + + return f(tf.constant(1)) + + def f2(): + @tf.function + def f(x): + return x + 2 + + return f(tf.constant(1)) + + self.assertTrue(tensor_equal(f1(), tf.constant(2))) + self.assertTrue(tensor_equal(f2(), tf.constant(3))) + + def test_cond1(self): + cond = tf.Variable(1.0) + x = tf.Variable(1.0) + y = tf.Variable(2.0) + + @tf.function + def f(): + tf.cond(cond < tf.constant(2.0), lambda: x.assign_add(y), lambda: y.assign_add(x)) + return x, y + + v1, v2 = f() + self.assertTrue(tensor_equal(v1, tf.constant(3.0))) + self.assertTrue(tensor_equal(v2, tf.constant(2.0))) + + def test_cond2(self): + cond = tf.Variable(1.0) + x = tf.Variable(0.0) + y = tf.Variable(0.0) + + @tf.function + def f(): + tf.cond(cond < tf.constant(2.0), lambda: x.assign_add(1.0), lambda: y.assign_add(1.0)) + return x, y + + v1, v2 = f() + self.assertTrue(tensor_equal(v1, tf.constant(1.0))) + self.assertTrue(tensor_equal(v2, tf.constant(0.0))) + + def test_cond3(self): + v = tf.Variable(1.0) + x = tf.Variable(0.0) + y = tf.Variable(0.0) + + def x_add(): + return x.assign_add(1.0) + + def y_add(): + return y.assign_add(1.0) + + @tf.function + def f(): + tf.cond(v < tf.constant(2.0), x_add, y_add) + return x, y + + v1, v2 = f() + self.assertTrue(tensor_equal(v1, tf.constant(1.0))) + self.assertTrue(tensor_equal(v2, tf.constant(0.0))) + + def test_cond4(self): + v = tf.Variable(1.0) + x = tf.Variable(0.0) + y = tf.Variable(0.0) + + @tf.function + def x_add(): + return x.assign_add(1.0) + + @tf.function + def y_add(): + return 
y.assign_add(1.0) + + @tf.function + def f(): + tf.cond(v < tf.constant(2.0), x_add, y_add) + return x, y + + v1, v2 = f() + self.assertTrue(tensor_equal(v1, tf.constant(1.0))) + self.assertTrue(tensor_equal(v2, tf.constant(0.0))) + + def test_cond5(self): + v = tf.Variable(1.0) + x = tf.Variable(0.0) + y = tf.Variable(0.0) + + c = tf.constant(1.0) + + @tf.function + def x_add(): + return x.assign_add(c) + + @tf.function + def y_add(): + return y.assign_add(c) + + @tf.function + def f(): + tf.cond(v < tf.constant(2.0), x_add, y_add) + return x, y + + v1, v2 = f() + self.assertTrue(tensor_equal(v1, tf.constant(1.0))) + self.assertTrue(tensor_equal(v2, tf.constant(0.0))) + + def test_cond6(self): + cond = tf.Variable(1.0) + x = tf.Variable(1.0) + y = tf.Variable(2.0) + + @tf.function + def f(): + return tf.cond(cond < tf.constant(2.0), lambda: x.assign_add(y), lambda: y.assign_add(x)) + + self.assertTrue(tensor_equal(f(), tf.constant(3.0))) + + def test_while(self): + v = tf.Variable(1.0) + + @tf.function + def f(): + for i in tf.range(10): + v.assign_add(1.0) + return v + + self.assertTrue(tensor_equal(f(), tf.constant(11.0))) + + def test_variable_need_different_format_in_subgraph_with_control(self): + x = tf.Variable(tf.constant([[[[0.0]]]]), dtype=tf.float32, shape=(1, 1, 1, 1)) + + @tf.function + def f(): + xv = tf.cond(x < tf.constant([[[[2.0]]]]), lambda: x.assign(tf.constant([[[[10.0]]]])), + lambda: x.assign(tf.constant([[[[20.0]]]]))) + return tf.nn.conv2d(xv, tf.constant([[[[3.0]]], ], dtype=tf.float32), strides=[1, 1, 1, 1], padding='VALID') + self.assertTrue(tensor_equal(f(), tf.constant([[[[30.0]]], ], dtype=tf.float32))) + self.assertTrue(tensor_equal(x, tf.constant([[[[10.0]]], ], dtype=tf.float32))) + + def test_variable_need_different_format_in_subgraph(self): + x = tf.Variable(tf.constant([[[[0.0]]]]), dtype=tf.float32, shape=(1, 1, 1, 1)) + + @tf.function + def f(): + tf.cond(x < tf.constant([[[[2.0]]]]), lambda: x.assign(tf.constant([[[[10.0]]]])), + lambda: x.assign(tf.constant([[[[20.0]]]]))) + return tf.nn.conv2d(x, tf.constant([[[[3.0]]], ], dtype=tf.float32), strides=[1, 1, 1, 1], padding='VALID') + self.assertTrue(tensor_equal(f(), tf.constant([[[[30.0]]], ], dtype=tf.float32))) + self.assertTrue(tensor_equal(x, tf.constant([[[[10.0]]], ], dtype=tf.float32))) + + def test_variable_need_different_format_in_subgraph_cross(self): + x = tf.Variable(tf.constant([[[[10.0]]]]), dtype=tf.float32, shape=(1, 1, 1, 1)) + + @tf.function + def f(): + c1 = tf.nn.conv2d(x, tf.constant([[[[3.0]]], ], dtype=tf.float32), strides=[1, 1, 1, 1], padding='VALID') + tf.cond(x < tf.constant([[[[2.0]]]]), lambda: x.assign(tf.constant([[[[10.0]]]])), + lambda: x.assign(tf.constant([[[[20.0]]]]))) + return c1, tf.nn.conv2d(x, tf.constant([[[[3.0]]], ], dtype=tf.float32), strides=[1, 1, 1, 1], padding='VALID') + c1, c2 = f() + self.assertTrue(tensor_equal(c1, tf.constant([[[[30.0]]], ], dtype=tf.float32))) + self.assertTrue(tensor_equal(c2, tf.constant([[[[60.0]]], ], dtype=tf.float32))) + self.assertTrue(tensor_equal(x, tf.constant([[[[20.0]]], ], dtype=tf.float32))) + + def test_variable_need_different_format_in_subgraph_trans_merge(self): + x = tf.Variable(tf.constant([[[[10.0]]]]), dtype=tf.float32, shape=(1, 1, 1, 1)) + + @tf.function + def f(): + c1 = tf.nn.conv2d(x, tf.constant([[[[3.0]]], ], dtype=tf.float32), strides=[1, 1, 1, 1], padding='VALID') + c2 = tf.nn.conv2d(x, tf.constant([[[[3.0]]], ], dtype=tf.float32), strides=[1, 1, 1, 1], padding='VALID') + tf.cond(x < 
tf.constant([[[[2.0]]]]), lambda: x.assign(tf.constant([[[[10.0]]]])), + lambda: x.assign(tf.constant([[[[20.0]]]]))) + return c1, c2 + c1, c2 = f() + self.assertTrue(tensor_equal(c1, tf.constant([[[[30.0]]], ], dtype=tf.float32))) + self.assertTrue(tensor_equal(c2, tf.constant([[[[30.0]]], ], dtype=tf.float32))) + self.assertTrue(tensor_equal(x, tf.constant([[[[20.0]]], ], dtype=tf.float32))) + + def test_bert_dp_under_one_device_distribute_strategy(self): + def decode_record(record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.io.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. + for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.cast(t, tf.int32) + example[name] = t + + return example + + def dataset_fn(ctx=None): + """Creates input dataset from (tf)records files for pretraining.""" + input_patterns = [os.path.join(os.path.dirname(os.path.abspath(__file__)), "bert_examples.tfrecord")] + seq_length = 128 + max_predictions_per_seq = 20 + batch_size = 32 + is_training = True + input_pipeline_context = None + use_next_sentence_label = True + use_position_id = False + output_fake_labels = True + + name_to_features = { + 'input_ids': + tf.io.FixedLenFeature([seq_length], tf.int64), + 'input_mask': + tf.io.FixedLenFeature([seq_length], tf.int64), + 'segment_ids': + tf.io.FixedLenFeature([seq_length], tf.int64), + 'masked_lm_positions': + tf.io.FixedLenFeature([max_predictions_per_seq], tf.int64), + 'masked_lm_ids': + tf.io.FixedLenFeature([max_predictions_per_seq], tf.int64), + 'masked_lm_weights': + tf.io.FixedLenFeature([max_predictions_per_seq], tf.float32), + } + if use_next_sentence_label: + name_to_features['next_sentence_labels'] = tf.io.FixedLenFeature([1], + tf.int64) + if use_position_id: + name_to_features['position_ids'] = tf.io.FixedLenFeature([seq_length], + tf.int64) + for input_pattern in input_patterns: + if not tf.io.gfile.glob(input_pattern): + raise ValueError('%s does not match any files.' % input_pattern) + + dataset = tf.data.Dataset.list_files(input_patterns, shuffle=is_training) + + if input_pipeline_context and input_pipeline_context.num_input_pipelines > 1: + dataset = dataset.shard(input_pipeline_context.num_input_pipelines, + input_pipeline_context.input_pipeline_id) + if is_training: + dataset = dataset.repeat() + + # We set shuffle buffer to exactly match total number of + # training files to ensure that training data is well shuffled. + input_files = [] + for input_pattern in input_patterns: + input_files.extend(tf.io.gfile.glob(input_pattern)) + dataset = dataset.shuffle(len(input_files)) + + # # In parallel, create tf record dataset for each train files. + # # cycle_length = 8 means that up to 8 files will be read and deserialized in + # # parallel. You may want to increase this number if you have a large number of + # # CPU cores. 
+ dataset = dataset.interleave( + tf.data.TFRecordDataset, + cycle_length=8, + num_parallel_calls=tf.data.experimental.AUTOTUNE) + + if is_training: + dataset = dataset.shuffle(100) + + decode_fn = lambda record: decode_record(record, name_to_features) + dataset = dataset.map( + decode_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE) + + def _select_data_from_record(record): + """Filter out features to use for pretraining.""" + x = { + 'input_word_ids': record['input_ids'], + 'input_mask': record['input_mask'], + 'input_type_ids': record['segment_ids'], + 'masked_lm_positions': record['masked_lm_positions'], + 'masked_lm_ids': record['masked_lm_ids'], + 'masked_lm_weights': record['masked_lm_weights'], + } + if use_next_sentence_label: + x['next_sentence_labels'] = record['next_sentence_labels'] + if use_position_id: + x['position_ids'] = record['position_ids'] + + # TODO(hongkuny): Remove the fake labels after migrating bert pretraining. + if output_fake_labels: + return (x, record['masked_lm_weights']) + else: + return x + + dataset = dataset.map( + _select_data_from_record, + num_parallel_calls=tf.data.experimental.AUTOTUNE) + dataset = dataset.batch(batch_size, drop_remainder=is_training) + dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) + return dataset + + strategy = tf.distribute.OneDeviceStrategy("device:CPU:0") + dataset = strategy.experimental_distribute_datasets_from_function(dataset_fn) + iterator = iter(dataset) + + @tf.function + def bert_step(iterator): + return next(iterator) + + bert_step(iterator) + + +if __name__ == '__main__': + unittest.main() diff --git a/tf_adapter_2.x/examples/bert_examples.tfrecord b/tf_adapter_2.x/examples/bert_examples.tfrecord new file mode 100644 index 0000000000000000000000000000000000000000..ea54c17bc279cc7e8027a9b95831dcf9221539a4 Binary files /dev/null and b/tf_adapter_2.x/examples/bert_examples.tfrecord differ diff --git a/tf_adapter_2.x/npu_device/core/npu_cache_spec.h b/tf_adapter_2.x/npu_device/core/npu_cache_spec.h new file mode 100644 index 0000000000000000000000000000000000000000..9f4787b45e18fe9fe288b69de2be41f7d826398b --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_cache_spec.h @@ -0,0 +1,194 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. 
* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#ifndef TENSORFLOW_NPU_CACHE_SPEC_H +#define TENSORFLOW_NPU_CACHE_SPEC_H + +#include "tensorflow/c/eager/c_api.h" + +#include +#include +#include +#include +#include +#include + +#include "absl/algorithm/container.h" +#include "absl/memory/memory.h" +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/c_api_internal.h" + +#include "npu_logger.h" +#include "npu_parser.h" +#include "npu_types.h" + +namespace npu { + +class TaskSpec { + public: + virtual bool IsFunctionOp() const = 0; + bool ShouldFallback() const { return !fallback_reason_.empty(); }; + std::string FallbackReason() const { return fallback_reason_; }; + std::string Op() const { return ndef_.op(); } + virtual std::string DebugString() const = 0; + tensorflow::NodeDef NodeDef() const { return ndef_; } + const TensorDataTypes &InputTypes() const { return input_dtypes_; } + const TensorShapes &InputShapes() const { return input_shapes_; } + const TensorDataTypes &OutputTypes() const { return output_dtypes_; } + virtual const tensorflow::OpRegistrationData *OpRegistrationData() const { return op_spec_; } + + protected: + TaskSpec() : op_spec_(nullptr){}; + ~TaskSpec() = default; + const tensorflow::OpRegistrationData *op_spec_; // Registered IR info of the operator, not a concrete instance + tensorflow::NodeDef ndef_; // The node's NodeDef, mainly holding its instantiated attributes + TensorDataTypes input_dtypes_; + TensorShapes input_shapes_; + TensorDataTypes output_dtypes_; + std::string fallback_reason_; +}; + +class OpSpec : public TaskSpec { + public: + OpSpec(const tensorflow::OpRegistrationData *op_spec, tensorflow::NodeDef ndef, TensorShapes input_shapes, + TensorPartialShapes output_shapes, std::string reason) + : always_infer_shape_(false), partial_output_shapes_(output_shapes) { + TensorDataTypes input_dtypes; + TensorDataTypes output_dtypes; + tensorflow::InOutTypesForNode(ndef, op_spec->op_def, &input_dtypes, &output_dtypes); + op_spec_ = op_spec; + ndef_ = std::move(ndef); + input_dtypes_ = std::move(input_dtypes); + input_shapes_ = std::move(input_shapes); + output_dtypes_ = std::move(output_dtypes); + + fallback_reason_ = std::move(reason); + if (ShouldFallback()) { return; } + TensorShapes shapes; + shapes.resize(output_shapes.size()); + for (size_t i = 0; i < output_shapes.size(); i++) { + // For non-function ops, shape inference must yield fully-defined output shapes + if (!output_shapes[i].AsTensorShape(&shapes[i])) { + fallback_reason_ = tensorflow::strings::StrCat("output", i, " unknown shape ", output_shapes[i].DebugString()); + break; + } + } + + if (!ShouldFallback()) { + output_shapes_ = shapes; + AssembleInputDesc(input_shapes_, input_dtypes_, &attached_attrs_); + AssembleOutputDesc(output_shapes_, output_dtypes_, &attached_attrs_); + } + } + + OpSpec(const tensorflow::OpRegistrationData *op_spec, tensorflow::NodeDef ndef, TensorShapes input_shapes, + std::string reason) + : always_infer_shape_(true) { + TensorDataTypes input_dtypes; + TensorDataTypes output_dtypes; + tensorflow::InOutTypesForNode(ndef, op_spec->op_def, &input_dtypes, &output_dtypes); + + op_spec_ = op_spec; + ndef_ = std::move(ndef); + input_dtypes_ = std::move(input_dtypes); + input_shapes_ = std::move(input_shapes); + output_dtypes_ = std::move(output_dtypes); + fallback_reason_ = std::move(reason); + + if (!ShouldFallback()) { AssembleInputDesc(input_shapes_, input_dtypes_, &attached_attrs_); } + } + + ~OpSpec() = default; + bool IsFunctionOp() const override {
return false; } + bool ShouldInferShape() const { return always_infer_shape_; } + const TensorShapes &OutputShapes() const { return output_shapes_; } + const TensorPartialShapes &OutputPartialShapes() const { return partial_output_shapes_; } + tensorflow::NodeDef ParserNodeDef() const { + tensorflow::NodeDef ndef; + ndef.MergeFrom(ndef_); + ndef.MergeFrom(attached_attrs_); + return ndef; + } + std::string DebugString() const override { + std::stringstream ss; + ss << NodeDef().DebugString() << std::endl; + ss << attached_attrs_.DebugString() << std::endl; + ss << OpRegistrationData()->op_def.DebugString() << std::endl; + for (size_t i = 0; i < output_dtypes_.size(); i++) { + if (always_infer_shape_ || ShouldFallback()) { + ss << "output " << i << " " << tensorflow::DataTypeString(output_dtypes_[i]) << " " << std::endl; + } else { + ss << "output " << i << " " << tensorflow::DataTypeString(output_dtypes_[i]) << " " + << partial_output_shapes_[i].DebugString() << std::endl; + } + } + if (ShouldFallback()) { ss << "Fallback reason " << fallback_reason_; } + return ss.str(); + } + + private: + bool always_infer_shape_; + TensorShapes output_shapes_; + TensorPartialShapes partial_output_shapes_; + tensorflow::NodeDef attached_attrs_; +}; + +class FuncSpec : public TaskSpec { + using TensorDataTypes = tensorflow::gtl::InlinedVector; + + public: + using PruneInputsFunc = + std::function &)>; + FuncSpec(const tensorflow::OpRegistrationData *op_spec, tensorflow::NodeDef ndef, uint64_t ge_graph_id, + std::unique_ptr graph, PruneInputsFunc prune_func, + std::vector dependent_host_resources, std::string reason = "") + : ge_graph_id_(ge_graph_id), graph_(std::move(graph)), prune_func_(std::move(prune_func)), + dependent_host_resources_(std::move(dependent_host_resources)) { + + TensorDataTypes input_dtypes; + TensorDataTypes output_dtypes; + tensorflow::InOutTypesForNode(ndef, op_spec->op_def, &input_dtypes, &output_dtypes); + + op_spec_ = op_spec; + ndef_ = std::move(ndef); + input_dtypes_ = std::move(input_dtypes); + output_dtypes_ = std::move(output_dtypes); + fallback_reason_ = std::move(reason); + } + ~FuncSpec() = default; + bool IsFunctionOp() const override { return true; } + + uint64_t GeGraphId() const { return ge_graph_id_; } + + const std::vector& DependentHostResources() const { return dependent_host_resources_; } + + const tensorflow::Graph *Graph() const { return graph_.get(); } + + void PruneInputs(int num_inputs, TFE_TensorHandle **inputs, std::vector &pruned) const { + prune_func_(num_inputs, inputs, pruned); + } + std::string DebugString() const override { + std::stringstream ss; + ss << NodeDef().DebugString() << std::endl; + ss << OpRegistrationData()->op_def.DebugString() << std::endl; + ss << "Ge graph id " << ge_graph_id_ << std::endl; + for (size_t i = 0; i < output_dtypes_.size(); i++) { + ss << "output " << i << " " << tensorflow::DataTypeString(output_dtypes_[i]) << std::endl; + } + if (ShouldFallback()) { ss << "Fallback reason " << fallback_reason_; } + return ss.str(); + } + + private: + uint64_t ge_graph_id_; + std::unique_ptr graph_; + PruneInputsFunc prune_func_; + const std::vector dependent_host_resources_; +}; +} // namespace npu + +#endif //TENSORFLOW_NPU_CACHE_SPEC_H diff --git a/tf_adapter_2.x/npu_device/core/npu_custom_kernel.h b/tf_adapter_2.x/npu_device/core/npu_custom_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..49aed1b99f40adc24526b41ccb97ac5ca0b4ecbf --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_custom_kernel.h @@ 
-0,0 +1,122 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#ifndef TENSORFLOW_NPU_CUSTOM_KERNEL_H +#define TENSORFLOW_NPU_CUSTOM_KERNEL_H + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/platform/logging.h" + +#include "absl/algorithm/container.h" +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/tfe_context_internal.h" +#include "tensorflow/c/eager/tfe_op_internal.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" + +#include "npu_device.h" +#include "npu_logger.h" +#include "npu_micros.h" +#include "npu_parser.h" +#include "npu_unwrap.h" +#include "npu_utils.h" + +using NpuCustomKernelFunc = + std::function; + +using NpuFallbackHookFunc = std::function; + +class CustomKernelRegistry { + public: + static CustomKernelRegistry &Instance() { + static CustomKernelRegistry inst; + return inst; + } + void Register(const std::string &op_name, const NpuCustomKernelFunc &func) { + std::lock_guard lk(mu_); + DCHECK(specific_kernels_.find(op_name) == specific_kernels_.end()); + specific_kernels_.emplace(std::make_pair(op_name, func)); + } + void RegisterHook(const std::string &op_name, const NpuFallbackHookFunc &func) { + std::lock_guard lk(mu_); + DCHECK(specific_kernels_.find(op_name) == specific_kernels_.end()); + specific_hooks_.emplace(std::make_pair(op_name, func)); + } + + bool GetCustomKernelFunc(const std::string &op_name, NpuCustomKernelFunc **func) { + DLOG() << "NPU Looking up custom kernel for " << op_name; + std::lock_guard lk(mu_); + if (specific_kernels_.find(op_name) == specific_kernels_.end()) { + DLOG() << "NPU Looking up kernel not found for op " << op_name; + return false; + } + *func = &specific_kernels_[op_name]; + return true; + } + + bool GetFallbackHookFunc(const std::string &op_name, NpuFallbackHookFunc **func) { + DLOG() << "NPU Looking up callback hook for " << op_name; + std::lock_guard lk(mu_); + if (specific_hooks_.find(op_name) == specific_hooks_.end()) { + DLOG() << "NPU Callback hook not found for op " << op_name; + return false; + } + *func = &specific_hooks_[op_name]; + return true; + } + + private: + CustomKernelRegistry() = default; + std::mutex mu_; + std::map specific_kernels_; + std::map specific_hooks_; +}; + +class CustomKernelSpec { + public: + CustomKernelSpec(std::string name, NpuCustomKernelFunc custom_func) + : op(std::move(name)), func(std::move(custom_func)) {} + std::string op; + NpuCustomKernelFunc func; +}; + +class FallbackHookSpec { + public: + FallbackHookSpec(std::string name, NpuFallbackHookFunc custom_func) + : op(std::move(name)), func(std::move(custom_func)) {} + std::string op; + NpuFallbackHookFunc func; +}; + +class CustomKernelReceiver { + public: + CustomKernelReceiver(const CustomKernelSpec &spec) { // NOLINT(google-explicit-constructor) + DLOG() << "NPU Register custom kernel for " << spec.op; + CustomKernelRegistry::Instance().Register(spec.op, spec.func); + } + + CustomKernelReceiver(const FallbackHookSpec &spec) { // NOLINT(google-explicit-constructor) + DLOG() << "NPU Register fallback hook for " << spec.op; + CustomKernelRegistry::Instance().RegisterHook(spec.op, spec.func); + } +}; + +#define 
NPU_REGISTER_CUSTOM_KERNEL(name, func) NPU_REGISTER_CUSTOM_KERNEL_1(__COUNTER__, name, func) +#define NPU_REGISTER_CUSTOM_KERNEL_1(ctr, name, func) NPU_REGISTER_CUSTOM_KERNEL_2(ctr, name, func) +#define NPU_REGISTER_CUSTOM_KERNEL_2(ctr, name, func) \ + static CustomKernelReceiver __preserved_op##ctr = CustomKernelSpec(name, func) + +#define NPU_REGISTER_FALLBACK_HOOK(name, func) NPU_REGISTER_FALLBACK_HOOK_1(__COUNTER__, name, func) +#define NPU_REGISTER_FALLBACK_HOOK_1(ctr, name, func) NPU_REGISTER_FALLBACK_HOOK_2(ctr, name, func) +#define NPU_REGISTER_FALLBACK_HOOK_2(ctr, name, func) \ + static CustomKernelReceiver __preserved_op##ctr = FallbackHookSpec(name, func) + +#endif //TENSORFLOW_NPU_CUSTOM_KERNEL_H diff --git a/tf_adapter_2.x/npu_device/core/npu_device.cpp b/tf_adapter_2.x/npu_device/core/npu_device.cpp new file mode 100644 index 0000000000000000000000000000000000000000..76206b16e854c393b335be57dc003a8748c91191 --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_device.cpp @@ -0,0 +1,1625 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#include +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/platform/logging.h" + +#include "absl/algorithm/container.h" +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/tfe_context_internal.h" +#include "tensorflow/c/eager/tfe_op_internal.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" + +#include "npu_custom_kernel.h" +#include "npu_device.h" +#include "npu_dp.h" +#include "npu_env.h" +#include "npu_logger.h" +#include "npu_micros.h" +#include "npu_parser.h" +#include "npu_unwrap.h" +#include "npu_utils.h" + +#include "framework/common/ge_inner_error_codes.h" +#include "framework/omg/parser/model_parser.h" +#include "framework/omg/parser/parser_factory.h" + +using Format = ge::Format; + +namespace { +template +class NpuHostFixedAllocator : public tensorflow::Allocator { + public: + static tensorflow::Allocator *Create(std::unique_ptr ptr) { + return new (std::nothrow) NpuHostFixedAllocator(std::move(ptr)); + } + + private: + explicit NpuHostFixedAllocator(std::unique_ptr ptr) : ptr_(std::move(ptr)) { + DLOG() << "Zero copied ge tensor " << reinterpret_cast(ptr_.get()); + } + ~NpuHostFixedAllocator() override { + DLOG() << "Release zero copied ge tensor " << reinterpret_cast(ptr_.get()); + }; + std::string Name() override { return "NpuHostFixedAllocator"; } + void *AllocateRaw(size_t alignment, size_t num_bytes) override { return ptr_.get(); } + void DeallocateRaw(void *ptr) override { delete this; } + std::unique_ptr ptr_; +}; +} // namespace + +tensorflow::Status NpuDevice::ConsumeIteratorAsync(const tensorflow::ResourceHandle &resource, int64_t nums, + const DoneCallback &done) { + auto iter = iterator_providers_.find(resource); + if (iter == iterator_providers_.end()) { + return tensorflow::errors::Internal("Iterator resource provider not found for resource ", resource.name()); + } + auto provider = iter->second; + return provider->Consume(nums, done); +} + +tensorflow::Status NpuDevice::ConsumeIteratorSync(const tensorflow::ResourceHandle &resource, int64_t nums) { + tensorflow::Notification done; + auto status = 
tensorflow::Status::OK(); + ConsumeIteratorAsync(resource, nums, [&status, &done](tensorflow::Status s) { + status = std::move(s); + done.Notify(); + }); + done.WaitForNotification(); + return status; +} + +void NpuDevice::CreateIteratorProvider(TFE_Context *context, const tensorflow::Tensor *tensor, + std::vector device_ids, TF_Status *status) { + auto resource = tensor->scalar()(); + TensorPartialShapes shapes; + TensorDataTypes types; + NPU_CTX_REQUIRES_OK(status, GetMirroredIteratorShapesAndTypes(resource, shapes, types)); + auto dp_provider = + IteratorResourceProvider::GetFunctionDef(resource.name(), std::move(device_ids), shapes, types, status); + if (TF_GetCode(status) != TF_OK) return; + + tensorflow::FunctionLibraryDefinition *lib_def = npu::UnwrapCtx(context)->FuncLibDef(); + NPU_CTX_REQUIRES_OK(status, lib_def->AddFunctionDef(dp_provider)); + tensorflow::ProcessFunctionLibraryRuntime *pflr = npu::UnwrapCtx(context)->pflr(); + tensorflow::FunctionLibraryRuntime *flr = pflr->GetFLR(underlying_device); + tensorflow::FunctionLibraryRuntime::Handle f_handle; + NPU_CTX_REQUIRES_OK(status, flr->Instantiate(dp_provider.signature().name(), tensorflow::AttrSlice{}, &f_handle)); + + tensorflow::Tensor captured_tensor = *tensor; + auto consume_func = [flr, f_handle, captured_tensor]() -> tensorflow::Status { + std::vector get_next_outputs; + return flr->RunSync(tensorflow::FunctionLibraryRuntime::Options{}, f_handle, {captured_tensor}, &get_next_outputs); + }; + auto destroy_func = [resource, flr, f_handle]() -> tensorflow::Status { + LOG(INFO) << "Stopping iterator resource provider for " << resource.name(); + return flr->ReleaseHandle(f_handle); + }; + + auto provider = std::make_shared(resource.name(), consume_func, destroy_func); + LOG(INFO) << "Iterator resource provider for " << resource.name() << " created"; + + NPU_CTX_REQUIRES(status, provider != nullptr, + tensorflow::errors::Internal("Failed create iterator reosurce provider for ", resource.name())); + + iterator_providers_[resource] = provider; + + if (kDumpExecutionDetail || kDumpGraph) { + std::unique_ptr fbody; + tensorflow::AttrSlice attr_slice; + tensorflow::FunctionDefToBodyHelper(dp_provider, attr_slice, lib_def, &fbody); + std::string file_name = "dp_provider_" + resource.name() + ".pbtxt"; + WriteTextProto(tensorflow::Env::Default(), file_name, fbody->graph->ToGraphDefDebug()); + } +} + +std::string NpuDevice::CreateDevice(const char *name, int device_index, + const std::map &session_options, NpuDevice **device) { + auto *ge_session = new (std::nothrow) ge::Session(session_options); + if (ge_session == nullptr) { return "Failed init graph engine: create new session failed"; } + + std::shared_ptr parser = + domi::ModelParserFactory::Instance()->CreateModelParser(domi::FrameworkType::TENSORFLOW); + if (parser == nullptr) { return "Failed init graph engine: create tensorflow model parser failed"; } + + std::unique_ptr status(TF_NewStatus(), TF_DeleteStatus); + + *device = new (std::nothrow) NpuDevice(); + if (*device == nullptr) { return "Failed create new npu device instance"; } + (*device)->device_id = device_index; + (*device)->device_name = name; + (*device)->underlying_device = "/job:localhost/replica:0/task:0/device:CPU:0"; + (*device)->ge_session_ = ge_session; + return ""; +} + +void NpuDevice::ReleaseResource() { + for (auto &iterator_provider : iterator_providers_) { iterator_provider.second->Destroy(); } +} + +void NpuDevice::DeleteDevice(void *device) { + DLOG() << "Start destroy npu device instance"; + if 
(device == nullptr) { return; } + auto npu_device = reinterpret_cast(device); + delete npu_device->ge_session_; + delete npu_device; +} + +tensorflow::Status NpuDevice::ValidateResourcePlacement(const char *op_name, int num_inputs, TFE_TensorHandle **inputs, + bool &cpu_resource) { + bool has_cpu = false; + int cpu_index = 0; + bool has_npu = false; + int npu_index = 0; + for (int i = 0; i < num_inputs; i++) { + auto data_type = npu::UnwrapHandle(inputs[i])->DataType(); + if (data_type == tensorflow::DT_RESOURCE) { + const tensorflow::Tensor *tensor; + (void) npu::UnwrapTensor(inputs[i], &tensor); + if (IsNpuTensorHandle(npu::UnwrapHandle(inputs[i]))) { + has_npu = true; + npu_index = i; + if (has_cpu) { + const tensorflow::Tensor *cpu_tensor; + (void) npu::UnwrapTensor(inputs[cpu_index], &cpu_tensor); + return tensorflow::errors::InvalidArgument( + op_name, " resource input ", i, " ", tensor->scalar()().name(), + " on NPU but resource input ", cpu_index, " ", cpu_tensor->scalar()().name(), + " on CPU"); + } + } else if (!Mirrored(tensor->scalar()())) { + has_cpu = true; + cpu_index = i; + if (has_npu) { + const tensorflow::Tensor *npu_tensor; + (void) npu::UnwrapTensor(inputs[npu_index], &npu_tensor); + return tensorflow::errors::InvalidArgument( + op_name, " resource input ", i, " ", tensor->scalar()().name(), + " on CPU but resource input ", npu_index, " ", npu_tensor->scalar()().name(), + " on NPU"); + } + } + } + } + cpu_resource = has_cpu; + return tensorflow::Status::OK(); +} + +tensorflow::Status NpuDevice::ValidateInput(const char *op_name, int num_inputs, TFE_TensorHandle **inputs) { + for (int i = 0; i < num_inputs; i++) { + auto data_type = npu::UnwrapHandle(inputs[i])->DataType(); + if (data_type == tensorflow::DT_RESOURCE) { + const tensorflow::Tensor *tensor; + NPU_REQUIRES_OK(npu::UnwrapTensor(inputs[i], &tensor)); + if (!IsNpuTensorHandle(npu::UnwrapHandle(inputs[i]))) { + if (!Mirrored(tensor->scalar()())) { + tensorflow::Status status; + std::string src_name = npu::UnwrapHandle(inputs[i])->DeviceName(&status); + if (!status.ok()) { src_name = status.ToString(); } + return tensorflow::errors::Unimplemented("Op ", op_name, " input ", i, " resource from ", src_name); + } else { + DLOG() << "Op" << op_name << " input " << i << " resource mirrored from " + << tensor->scalar()().DebugString(); + } + } + } else if (!tensorflow::DataTypeCanUseMemcpy(data_type)) { + return tensorflow::errors::Unimplemented("Op ", op_name, " input ", i, " unsupported type ", + tensorflow::DataTypeString(data_type)); + } + } + return tensorflow::Status::OK(); +} + +tensorflow::Status NpuDevice::ValidateOutput(const char *op_name, const TensorDataTypes &data_types) { + for (size_t i = 0; i < data_types.size(); i++) { + auto data_type = data_types[i]; + if (data_type == tensorflow::DT_RESOURCE) { + if (!SupportedResourceGenerator(op_name)) { + return tensorflow::errors::Unimplemented("Op ", op_name, " unsupported resource generator by NPU"); + } + } else if (!tensorflow::DataTypeCanUseMemcpy(data_type)) { + return tensorflow::errors::Unimplemented("Op ", op_name, " output ", i, " unsupported type ", + tensorflow::DataTypeString(data_type)); + } + } + return tensorflow::Status::OK(); +} + +void NpuDevice::PruneFunction(const tensorflow::FunctionDef &fdef, tensorflow::Graph *g, bool keep_signature) { + std::unordered_set control_ret_nodes; + for (const auto &control_ret : fdef.control_ret()) { control_ret_nodes.insert(control_ret.second); } + + std::unordered_set nodes; + for (auto n : g->nodes()) { + 
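+    // Collect the nodes to keep as roots for reverse-reachability pruning: control-flow ops,
+    // stateful ops and control_ret targets, except resource producers (VarHandleOp/IteratorV2)
+    // and, unless keep_signature is set, the function's Arg nodes and resource-typed Retval nodes.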
if (n->IsControlFlow() || n->op_def().is_stateful() + || (control_ret_nodes.find(n->name()) != control_ret_nodes.end())) { + if (n->type_string() == "VarHandleOp" || n->type_string() == "IteratorV2") { continue; } + if (!keep_signature) { + if (n->IsArg()) { continue; } + if (n->IsRetval() && n->attrs().Find("T")->type() == tensorflow::DT_RESOURCE) { continue; } + } + nodes.insert(n); + } + } + bool changed = PruneForReverseReachability(g, std::move(nodes)); + if (changed) { FixupSourceAndSinkEdges(g); } +} + +void NpuDevice::FixGraphArgRetvalIndex(tensorflow::Graph *graph) { + std::map indexed_args; + std::map indexed_retvals; + for (auto node : graph->nodes()) { + if (node->IsArg()) { indexed_args[node->attrs().Find("index")->i()] = node; } + if (node->IsRetval()) { indexed_retvals[node->attrs().Find("index")->i()] = node; } + } + int current_arg_index = 0; + for (auto indexed_arg : indexed_args) { indexed_arg.second->AddAttr("index", current_arg_index++); } + + int current_retval_index = 0; + for (auto indexed_retval : indexed_retvals) { indexed_retval.second->AddAttr("index", current_retval_index++); } +} + +tensorflow::Status +NpuDevice::TransResourceInput2GraphNode(TFE_Context *context, tensorflow::Graph *graph, int num_inputs, + TFE_TensorHandle **inputs, + std::vector &dependent_host_resources) { + std::set arg_is_variable; + std::set arg_is_iterator; + + std::map arg_resource_handles; + + VecTensorDataTypes arg_handle_dtyes(num_inputs); + VecTensorPartialShapes arg_handle_shapes(num_inputs); + + for (int i = 0; i < num_inputs; i++) { + if (inputs[i] == nullptr) { continue; }; + const tensorflow::Tensor *tensor; + NPU_REQUIRES_OK(npu::UnwrapTensor(inputs[i], &tensor)); + if (tensor->dtype() == tensorflow::DT_RESOURCE) { + auto handle = tensor->flat()(0); + arg_resource_handles[i] = handle; + if (MirroredIterator(handle)) { + GetMirroredIteratorShapesAndTypes(handle, arg_handle_shapes[i], arg_handle_dtyes[i]); + arg_is_iterator.insert(i); + } else { + const auto &dtypes_and_shapes = handle.dtypes_and_shapes(); + for (auto &dtype_and_shape : dtypes_and_shapes) { + arg_handle_dtyes[i].push_back(dtype_and_shape.dtype); + arg_handle_shapes[i].push_back(dtype_and_shape.shape); + } + arg_is_variable.insert(i); + } + } + } + + std::map arg_substitutes; + for (auto node : graph->op_nodes()) { + if (node->IsArg()) { + auto index = node->attrs().Find("index")->i(); + if (arg_is_iterator.count(index)) { + NPU_REQUIRES_OK(tensorflow::NodeBuilder(WrapResourceName(arg_resource_handles[index].name()), "IteratorV2") + .Attr("container", arg_resource_handles[index].container()) + .Attr("shared_name", arg_resource_handles[index].name()) + .Attr("output_types", arg_handle_dtyes[index]) + .Attr("output_shapes", arg_handle_shapes[index]) + .Attr("_arg_name", node->name()) + .Attr("_arg_index", int(index)) + .Finalize(graph, &arg_substitutes[node])); + + } else if (arg_is_variable.count(index)) { + tensorflow::Node *variable = nullptr; + NPU_REQUIRES_OK(tensorflow::NodeBuilder(WrapResourceName(arg_resource_handles[index].name()), "VarHandleOp") + .Attr("container", arg_resource_handles[index].container()) + .Attr("shared_name", arg_resource_handles[index].name()) + .Attr("dtype", arg_handle_dtyes[index][0]) + .Attr("shape", arg_handle_shapes[index][0]) + .Attr("_arg_name", node->name()) + .Attr("_arg_index", int(index)) + .Finalize(graph, &arg_substitutes[node])); + } + } + } + + // The resource inputs of any functions involved must be substituted here as well + std::vector nodes_to_remove; + std::vector control_flow_nodes; + std::set
unique_dependent_resources; + for (auto node : graph->op_nodes()) { + if (node->IsRetval() && node->input_type(0) == tensorflow::DT_RESOURCE) { + nodes_to_remove.push_back(node); + continue; + } + if (node->IsIfNode() || node->IsCaseNode() || node->IsWhileNode() || node->IsFunctionCall()) { + std::string func_input_name = node->IsFunctionCall() ? "args" : "input"; + bool need_trans_resource = false; + for (auto edge : node->in_edges()) { + if (edge->src()->IsArg() && arg_substitutes.find(edge->src()) != arg_substitutes.end()) { + need_trans_resource = true; + } + } + if (!need_trans_resource) continue; + + control_flow_nodes.push_back(node); + + tensorflow::FunctionLibraryDefinition *lib_def = npu::UnwrapCtx(context)->FuncLibDef(); + const tensorflow::OpRegistrationData *op_reg_data; + NPU_REQUIRES_OK(lib_def->LookUp(node->type_string(), &op_reg_data)); + int func_input_start = 0; + int func_input_end = 0; + for (const auto &in_arg : op_reg_data->op_def.input_arg()) { + func_input_start = func_input_end; + if (in_arg.type_list_attr().empty()) { + func_input_end++; + } else { + func_input_end += node->attrs().Find(in_arg.type_list_attr())->list().type_size(); + } + DLOG() << node->name() << " input arg " << in_arg.name() << " range [" << func_input_start << ", " + << func_input_end << ")"; + if (in_arg.name() == func_input_name) { break; } + } + + std::vector func_inputs; + for (int i = func_input_start; i < func_input_end; i++) { + const tensorflow::Edge *edge; + NPU_REQUIRES_OK(node->input_edge(i, &edge)); + if (edge->src()->IsArg() && arg_substitutes.find(edge->src()) != arg_substitutes.end()) { + func_inputs.push_back(inputs[edge->src()->attrs().Find("index")->i()]); + } else { + func_inputs.push_back(nullptr); + } + } + + for (auto &attr : node->attrs()) { + if (attr.second.has_func()) { + static std::atomic uuid{0}; + std::string func_name = node->type_string() + "_" + attr.first + "_" + attr.second.func().name() + "_" + + std::to_string(uuid.fetch_add(1)); + const tensorflow::FunctionDef *fdef = lib_def->Find(attr.second.func().name()); + std::unique_ptr fbody; + FunctionDefToBodyHelper(*fdef, tensorflow::AttrSlice{}, lib_def, &fbody); + std::vector unused_host_resources; + TransResourceInput2GraphNode(context, fbody->graph, func_inputs.size(), func_inputs.data(), + unused_host_resources); + + // Arg nodes may have been optimized away, so the remaining indexes need to be re-numbered + std::vector remain_indexes; + for (auto n : fbody->graph->nodes()) { + if (n->IsArg()) { remain_indexes.push_back(n->attrs().Find("index")->i()); } + } + FixGraphArgRetvalIndex(fbody->graph); + DLOG() << func_name << " remained input index (0-" << func_inputs.size() - 1 << ") -> " + << VecToString(remain_indexes); + + tensorflow::FunctionDef optimized_fdef; + auto lookup = [&fdef](const tensorflow::Node *node) -> absl::optional { + for (const auto &control_ret : fdef->control_ret()) { + if (control_ret.second == node->name()) { return absl::make_optional(node->name()); } + } + return absl::nullopt; + }; + NPU_REQUIRES_OK(tensorflow::GraphToFunctionDef(*fbody->graph, func_name, lookup, &optimized_fdef)); + NPU_REQUIRES_OK(lib_def->AddFunctionDef(optimized_fdef)); + DLOG() << "Change " << node->name() << " attr " << attr.first << " func name " << attr.second.func().name() + << " to " << func_name; + const_cast(node->attrs().Find(attr.first))->mutable_func()->set_name(func_name); + } + } + } + + std::vector edges; + for (auto edge : node->in_edges()) { edges.emplace_back(edge); } // You can never modify an EdgeSet while iterating over it + for (auto edge : edges) { + if
(edge->src()->IsArg()) { + auto iter = arg_substitutes.find(edge->src()); + if (iter != arg_substitutes.end()) { + int index = edge->src()->attrs().Find("index")->i(); + if (arg_is_iterator.count(index)) { unique_dependent_resources.insert(arg_resource_handles[index]); } + graph->AddEdge(iter->second, 0, node, edge->dst_input()); + graph->RemoveEdge(edge); + } + } + } + } + + for (const auto &resource : unique_dependent_resources) { dependent_host_resources.push_back(resource); } + + for (auto node : control_flow_nodes) { + if (node->IsWhileNode() || node->IsCaseNode() || node->IsWhileNode() || node->IsFunctionCall()) { + tensorflow::NodeDef ndef = node->def(); + if (node->IsWhileNode()) { + int removed_nums = 0; + for (int i = 0; i < node->num_inputs(); i++) { + if (node->input_type(i) == tensorflow::DT_RESOURCE) { + int index = i - removed_nums; + removed_nums++; + + ndef.mutable_input()->erase(ndef.mutable_input()->begin() + index); + + auto type = ndef.mutable_attr()->at("T").mutable_list()->mutable_type(); + type->erase(type->begin() + index); + + auto shape = ndef.mutable_attr()->at("output_shapes").mutable_list()->mutable_shape(); + shape->erase(shape->begin() + index); + } + } + } else if (node->IsCaseNode() || node->IsWhileNode() || node->IsFunctionCall()) { + int removed_nums = 0; + int arg_start_index = node->IsFunctionCall() ? 0 : 1; + for (int i = arg_start_index; i < node->num_inputs(); i++) { + if (node->input_type(i) == tensorflow::DT_RESOURCE) { + int index = i - removed_nums; + removed_nums++; + + ndef.mutable_input()->erase(ndef.mutable_input()->begin() + index); + + auto type = ndef.mutable_attr()->at("Tin").mutable_list()->mutable_type(); + type->erase(type->begin() + index - arg_start_index); + } + } + } + DLOG() << "Pruned control flow op " << ndef.DebugString(); + tensorflow::Status status; + auto pruned_node = graph->AddNode(ndef, &status); + NPU_REQUIRES_OK(status); + int pruned_input_index = 0; + for (int i = 0; i < node->num_inputs(); i++) { + const tensorflow::Edge *edge; + NPU_REQUIRES_OK(node->input_edge(i, &edge)); + if (node->input_type(i) != tensorflow::DT_RESOURCE) { + graph->AddEdge(edge->src(), edge->src_output(), pruned_node, pruned_input_index++); + } + } + for (auto n : graph->op_nodes()) { + for (auto edge : n->in_edges()) { + if (edge->src() == node) { graph->AddEdge(pruned_node, edge->src_output(), edge->dst(), edge->dst_input()); } + } + } + graph->RemoveNode(node); + } + } + for (auto node : nodes_to_remove) { graph->RemoveNode(node); } + for (auto arg_substitute : arg_substitutes) { graph->RemoveNode(arg_substitute.first); } + return tensorflow::Status::OK(); +} + +tensorflow::Status NpuDevice::MarkGraphNodeInOutDesc(TFE_Context *context, tensorflow::Graph *graph, int num_inputs, + TFE_TensorHandle **inputs) { + + tensorflow::ShapeRefiner shape_refiner(graph->versions(), npu::UnwrapCtx(context)->FuncLibDef()); + VecTensorShapes arg_shapes; + VecTensorDataTypes arg_handle_dtyes; + VecTensorPartialShapes arg_handle_shapes; + for (int i = 0; i < num_inputs; i++) { + const tensorflow::Tensor *tensor; + NPU_REQUIRES_OK(npu::UnwrapTensor(inputs[i], &tensor)); + arg_shapes.push_back({tensor->shape()}); + TensorDataTypes handle_dtyes; + TensorPartialShapes handle_shapes; + if (tensor->dtype() == tensorflow::DT_RESOURCE) { + auto handle = tensor->flat()(0); + const auto &dtypes_and_shapes = handle.dtypes_and_shapes(); + for (auto &dtype_and_shape : dtypes_and_shapes) { + handle_dtyes.push_back(dtype_and_shape.dtype); + 
handle_shapes.push_back(dtype_and_shape.shape); + } + } + arg_handle_dtyes.push_back(handle_dtyes); + arg_handle_shapes.push_back(handle_shapes); + } + + auto node_shape_inference_lambda = [&shape_refiner, num_inputs, inputs, &arg_shapes, &arg_handle_dtyes, + &arg_handle_shapes](tensorflow::Node *node) { + AssembleOpDef(node); + if (node->IsArg() && node->attrs().Find("index")) { + auto index = node->attrs().Find("index")->i(); + if (index < num_inputs && !node->attrs().Find("_output_shapes")) { + node->AddAttr("_output_shapes", arg_shapes[index]); + } + if (index < num_inputs && npu::UnwrapHandle(inputs[index])->DataType() == tensorflow::DT_RESOURCE) { + if (!node->attrs().Find("_handle_shapes")) { node->AddAttr("_handle_shapes", arg_handle_shapes[index]); } + if (!node->attrs().Find("_handle_dtypes")) { node->AddAttr("_handle_dtypes", arg_handle_dtyes[index]); } + } + } + auto status = shape_refiner.AddNode(node); + if (!status.ok()) { + LOG(INFO) << " " << node->name() << "[" << node->type_string() << "] Skip infer " << status.error_message(); + return; + } + auto node_ctx = shape_refiner.GetContext(node); + + DLOG() << "Shape of node " << node->DebugString(); + if (kDumpExecutionDetail) { + TensorDataTypes input_types; + tensorflow::InputTypesForNode(node->def(), node->op_def(), &input_types); + TensorPartialShapes input_shapes; + for (int i = 0; i < node_ctx->num_inputs(); ++i) { + tensorflow::TensorShapeProto proto; + node_ctx->ShapeHandleToProto(node_ctx->input(i), &proto); + input_shapes.emplace_back(proto); + LOG(INFO) << " input " << i << ": " << tensorflow::DataTypeString(input_types[i]) + << node_ctx->DebugString(node_ctx->input(i)); + } + } + + TensorDataTypes input_types; + TensorDataTypes output_types; + tensorflow::InOutTypesForNode(node->def(), node->op_def(), &input_types, &output_types); + + if (!input_types.empty()) { + tensorflow::AttrValue input_desc_attrs; + bool input_desc_incomplete = false; + for (auto edge : node->in_edges()) { + if (!edge->IsControlEdge()) { + auto input_attr = edge->src()->attrs().Find(kOutputDesc); + if (input_attr == nullptr) { + input_desc_incomplete = true; + LOG(WARNING) << node->DebugString() << " input node " << edge->src()->DebugString() + << " has no desc for output " << edge->src_output(); + break; + } + *input_desc_attrs.mutable_list()->add_func() = + edge->src()->attrs().Find(kOutputDesc)->list().func(edge->src_output()); + } + } + if (!input_desc_incomplete) { + node->AddAttr(kInputDesc, input_desc_attrs); + } else { + TensorPartialShapes input_shapes; + for (int i = 0; i < node_ctx->num_inputs(); ++i) { + tensorflow::TensorShapeProto proto; + node_ctx->ShapeHandleToProto(node_ctx->input(i), &proto); + input_shapes.emplace_back(proto); + } + AssembleInputDesc(input_shapes, input_types, node); + } + } + + if (!output_types.empty()) { + TensorPartialShapes output_shapes; + for (int i = 0; i < node_ctx->num_outputs(); ++i) { + tensorflow::TensorShapeProto proto; + node_ctx->ShapeHandleToProto(node_ctx->output(i), &proto); + output_shapes.emplace_back(proto); + DLOG() << " output " << i << ": " << tensorflow::DataTypeString(output_types[i]) + << node_ctx->DebugString(node_ctx->output(i)); + } + AssembleOutputDesc(output_shapes, output_types, node); + } + }; + tensorflow::ReverseDFS(*graph, {}, node_shape_inference_lambda); + return tensorflow::Status::OK(); +} + +TFE_TensorHandle *NpuDevice::NewDeviceTensorHandle(TFE_Context *context, Format fmt, + const tensorflow::TensorShape &shape, tensorflow::DataType type, + TF_Status *status) { 
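+  // Note: this allocates an NPU-managed buffer for the requested format/shape/dtype and wraps it in an eager
+  // tensor handle backed by NPU memory; NpuManagedBufferDeallocator frees the buffer when the handle is released.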
+ NpuManagedBuffer *npu_managed_buffer; + NPU_CTX_REQUIRES_OK_RETURN(status, NpuManagedBuffer::Create(fmt, shape, type, &npu_managed_buffer), nullptr); + std::vector dims; + for (auto dim_size : shape.dim_sizes()) { dims.emplace_back(dim_size); } + return TFE_NewTensorHandleFromDeviceMemory(context, device_name.c_str(), static_cast(type), dims.data(), + dims.size(), npu_managed_buffer, sizeof(npu_managed_buffer), + &NpuManagedBufferDeallocator, nullptr, status); +} + +TFE_TensorHandle *NpuDevice::NewDeviceResourceHandle(TFE_Context *context, const tensorflow::TensorShape &shape, + TF_Status *status) { + tensorflow::Tensor tensor(tensorflow::DT_RESOURCE, shape); + tensorflow::CustomDevice *custom_device = nullptr; + NPU_CTX_REQUIRES_RETURN(status, npu::UnwrapCtx(context)->FindCustomDeviceFromName(device_name, &custom_device), + tensorflow::errors::Internal("No custom device registered with name ", device_name), nullptr); + return tensorflow::wrap( + tensorflow::TensorHandle::CreateLocalHandle(std::move(tensor), custom_device, npu::UnwrapCtx(context))); +} + +TFE_TensorHandle *NpuDevice::CopyTensorD2H(TFE_Context *context, TFE_TensorHandle *tensor, TF_Status *status) { + const tensorflow::Tensor *npu_tensor; + NPU_CTX_REQUIRES_OK_RETURN(status, npu::UnwrapTensor(tensor, &npu_tensor), nullptr); + + if (npu_tensor->dtype() == tensorflow::DT_RESOURCE) { + tensorflow::ResourceHandle handle = npu_tensor->scalar()(); + status->status = + tensorflow::errors::Internal("Resources ", handle.DebugString(), " cannot be copied across devices[NPU->CPU]"); + return nullptr; + } + + const tensorflow::Tensor *local_tensor; + TFE_TensorHandle *local_handle = tensorflow::wrap( + tensorflow::TensorHandle::CreateLocalHandle(tensorflow::Tensor(npu_tensor->dtype(), npu_tensor->shape()))); + NPU_CTX_REQUIRES_RETURN(status, local_handle != nullptr, tensorflow::errors::Internal("Failed create local handle"), + nullptr); + NPU_CTX_REQUIRES_OK_RETURN(status, npu::UnwrapTensor(local_handle, &local_tensor), nullptr); + NPU_CTX_REQUIRES_OK_RETURN(status, npu::Unwrap(npu_tensor)->AssembleTo(local_tensor), local_handle); + return local_handle; +} + +TFE_TensorHandle *NpuDevice::CopyTensorH2D(TFE_Context *context, TFE_TensorHandle *tensor, TF_Status *status) { + return CopyTensorH2D(context, tensor, Format::FORMAT_ND, status); +} + +TFE_TensorHandle *NpuDevice::CopyTensorH2D(TFE_Context *context, TFE_TensorHandle *tensor, Format fmt, + TF_Status *status) { + TFE_TensorHandle *local_handle = tensor; + std::vector copied_tensor_handles; + if (!IsCpuTensorHandle(npu::UnwrapHandle(tensor))) { + local_handle = TFE_TensorHandleCopyToDevice(tensor, context, underlying_device.c_str(), status); + copied_tensor_handles.push_back(local_handle); + } + + if (TF_GetCode(status) != TF_OK) return nullptr; + const tensorflow::Tensor *local_tensor = nullptr; + NPU_CTX_REQUIRES_OK_RETURN(status, npu::UnwrapTensor(local_handle, &local_tensor), nullptr); + if (local_tensor->dtype() == tensorflow::DT_RESOURCE) { + tensorflow::ResourceHandle handle = local_tensor->scalar()(); + status->status = + tensorflow::errors::Internal("Resources ", handle.DebugString(), " cannot be copied across devices[CPU->NPU]"); + return nullptr; + } + + TFE_TensorHandle *npu_handle = + NewDeviceTensorHandle(context, fmt, local_tensor->shape(), local_tensor->dtype(), status); + if (TF_GetCode(status) != TF_OK) return nullptr; + const tensorflow::Tensor *npu_tensor = nullptr; + + NPU_CTX_REQUIRES_OK_RETURN(status, npu::UnwrapTensor(npu_handle, &npu_tensor), nullptr); + 
NPU_CTX_REQUIRES_OK_RETURN(status, npu::Unwrap(npu_tensor)->AssembleFrom(local_tensor), npu_handle);
+  for (auto handle : copied_tensor_handles) { TFE_DeleteTensorHandle(handle); }
+  return npu_handle;
+}
+
+tensorflow::Status NpuDevice::InferShape(TFE_Context *context, const tensorflow::OpRegistrationData *op_reg_data,
+                                         const tensorflow::NodeDef &ndef, int num_inputs, TFE_TensorHandle **inputs,
+                                         TensorPartialShapes &shapes, bool &requested_input_value) {
+  requested_input_value = false;
+  NPU_REQUIRES(op_reg_data->shape_inference_fn,
+               tensorflow::errors::Unimplemented("No infer shape function registered for op ", ndef.op()));
+
+  tensorflow::shape_inference::InferenceContext ic(TF_GRAPH_DEF_VERSION, ndef, op_reg_data->op_def,
+                                                   std::vector<tensorflow::shape_inference::ShapeHandle>(num_inputs),
+                                                   {}, {}, {});
+  NPU_REQUIRES_OK(ic.construction_status());
+  for (int i = 0; i < num_inputs; i++) {
+    auto input = npu::UnwrapHandle(inputs[i]);
+    tensorflow::shape_inference::ShapeHandle shape;
+    NPU_REQUIRES_OK(input->InferenceShape(&ic, &shape));
+    ic.SetInput(i, shape);
+  }
+
+  for (int i = 0; i < num_inputs; i++) {
+    auto input = inputs[i];
+    if (npu::UnwrapHandle(input)->DataType() == tensorflow::DT_RESOURCE) {
+      const tensorflow::Tensor *tensor;
+      NPU_REQUIRES_OK(npu::UnwrapTensor(input, &tensor));
+      auto handle = tensor->flat<tensorflow::ResourceHandle>()(0);
+      const auto &dtypes_and_shapes = handle.dtypes_and_shapes();
+      std::vector<tensorflow::shape_inference::ShapeAndType> inference_shapes_and_types;
+      for (auto &dtype_and_shape : dtypes_and_shapes) {
+        std::vector<tensorflow::shape_inference::DimensionHandle> dims_handle(dtype_and_shape.shape.dims());
+        for (size_t j = 0; j < dims_handle.size(); j++) {
+          dims_handle[j] = ic.MakeDim(dtype_and_shape.shape.dim_size(j));
+        }
+        inference_shapes_and_types.emplace_back(ic.MakeShape(dims_handle), dtype_and_shape.dtype);
+      }
+      ic.set_input_handle_shapes_and_types(i, inference_shapes_and_types);
+      requested_input_value = true;
+    }
+  }
+  // We may need to feed actual input tensors: TensorFlow first runs shape inference using only the input shapes.
+  // If an op's shape function depends on the value of an input tensor, that input is marked as requested on the
+  // first pass, and a second pass is run below with the real tensor values.
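+  // For example, Reshape's shape function needs the value (not just the shape) of its "shape" input, so that
+  // input is reported as requested after the first inference pass and is fed as a real tensor in the second pass.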
+ NPU_REQUIRES_OK(ic.Run(op_reg_data->shape_inference_fn)); + + std::vector input_tensors; + input_tensors.resize(num_inputs); + std::vector copied_tensor_handles; + bool input_requested = false; + for (int i = 0; i < num_inputs; i++) { + auto input = inputs[i]; + if (ic.requested_input_tensor(i)) { // If requested, this must be a normal tensor + if (IsNpuTensorHandle(npu::UnwrapHandle(input))) { + auto s = TF_NewStatus(); + if (s == nullptr) { continue; } + input = CopyTensorD2H(context, input, s); + if (TF_GetCode(s) != TF_OK) { + TF_DeleteStatus(s); + continue; + } + DLOG() << "Copying " << ndef.op() << " input:" << i << " from NPU to CPU for infer shape"; + copied_tensor_handles.push_back(input); + } + const tensorflow::Tensor *tensor; + NPU_REQUIRES_OK(npu::UnwrapTensor(input, &tensor)); + input_tensors[i] = tensor; + input_requested = true; + requested_input_value = true; + } + } + if (input_requested) { + ic.set_input_tensors(input_tensors); + NPU_REQUIRES_OK(ic.Run(op_reg_data->shape_inference_fn)); + } + + for (auto handle : copied_tensor_handles) { TFE_DeleteTensorHandle(handle); } + + for (int i = 0; i < ic.num_outputs(); i++) { + shapes.emplace_back(tensorflow::PartialTensorShape()); + tensorflow::shape_inference::ShapeHandle shape_handle = ic.output(i); + auto num_dims = ic.Rank(shape_handle); + std::vector dims; + if (num_dims == tensorflow::shape_inference::InferenceContext::kUnknownRank) { continue; } + for (auto j = 0; j < num_dims; ++j) { dims.emplace_back(ic.Value(ic.Dim(shape_handle, j))); } + NPU_REQUIRES_OK(tensorflow::PartialTensorShape::MakePartialShape(dims.data(), num_dims, &shapes[i])); + } + return tensorflow::Status::OK(); +} + +void NpuDevice::GetOrCreateSpec(TFE_Context *context, const char *op_name, const TFE_OpAttrs *attributes, + int num_inputs, TFE_TensorHandle **inputs, std::shared_ptr *spec, + TF_Status *s) { + tensorflow::NodeDef ndef; + ndef.set_op(op_name); + tensorflow::unwrap(attributes)->FillAttrValueMap(ndef.mutable_attr()); + bool request_shape = false; + GetCachedTaskSpec(ndef, spec, request_shape); + if (request_shape) { + TensorShapes input_shapes; + input_shapes.resize(num_inputs); + for (int i = 0; i < num_inputs; i++) { + NPU_CTX_REQUIRES_OK(s, npu::UnwrapHandle(inputs[i])->Shape(&input_shapes[i])); + } + GetCachedTaskSpec(ndef, input_shapes, spec); + } + if (*spec != nullptr) { + DLOG() << "Found cached task spec for " << op_name; + DLOG() << (*spec)->DebugString(); + return; + } + DLOG() << "No cached task spec for " << op_name << ", start create and cache"; + // 上面校验resource源头的,都是不可以cache的,因为resource可能在多次调用中来自不同的设备,下面的部分是可以cache的 + // NodeDef保存节点的属性,比较重要的,对于单算子,则会保存T属性,表达输入输出的type + // OpRegistrationData保存算子的IR注册信息,对于单算子,则和RegisterOp传递的信息一致,对于function,则是确定了输入的dataType的 + tensorflow::FunctionLibraryDefinition *lib_def = npu::UnwrapCtx(context)->FuncLibDef(); + const tensorflow::OpRegistrationData *op_reg_data; + NPU_CTX_REQUIRES_OK(s, lib_def->LookUp(op_name, &op_reg_data)); + bool is_function_op = op_reg_data->is_function_op; + // 判断当前算子是否是NPU Device声明支持的算子 + if (!is_function_op && !Supported(op_name)) { + *spec = CacheOpSpec(op_name, op_reg_data, ndef, {}, tensorflow::strings::StrCat("Op unsupported by NPU")); + return; + } + bool is_stateful = op_reg_data->op_def.is_stateful(); + // 这里获取输出的dataType,对于常规算子,通过NodeDef的T属性确定,对于function op,则是在ret上自带 + TensorDataTypes data_types; + NPU_CTX_REQUIRES_OK(s, tensorflow::OutputTypesForNode(ndef, op_reg_data->op_def, &data_types)); + // 如果输出的dataType不支持,或者不是支持的ResourceGenerator,则fallback + 
tensorflow::Status compat_status = ValidateOutput(op_name, data_types);
+  if (!compat_status.ok()) {
+    if (is_function_op) {
+      const static uint64_t kInvalidGeGraphId = -1;
+      *spec = CacheFuncSpec(op_name, op_reg_data, ndef, kInvalidGeGraphId, {}, {}, {}, compat_status.error_message());
+      return;
+    } else {
+      *spec = CacheOpSpec(op_name, op_reg_data, ndef, {}, compat_status.error_message());
+      return;
+    }
+  }
+  // For function ops the graph must be optimized first, and only then can NPU compatibility be judged
+  if (is_function_op) {  // Optimize the function op's graph and cache it; if a fallback is needed, the reason is also recorded in the spec
+    const tensorflow::FunctionDef *fdef = lib_def->Find(op_name);
+    std::unique_ptr<tensorflow::Graph> optimize_graph = std::make_unique<tensorflow::Graph>(lib_def);
+    std::unique_ptr<tensorflow::FunctionBody> fbody;
+    tensorflow::ProcessFunctionLibraryRuntime *pflr = npu::UnwrapCtx(context)->pflr();
+    tensorflow::FunctionLibraryRuntime *flr = pflr->GetFLR("/job:localhost/replica:0/task:0/device:CPU:0");
+    FunctionDefToBodyHelper(*fdef, tensorflow::AttrSlice(&ndef.attr()), lib_def, &fbody);
+    CopyGraph(*fbody->graph, optimize_graph.get());
+    std::string file_name_suffix = std::string(op_name) + ".pbtxt";
+    if (kDumpExecutionDetail || kDumpGraph) {
+      WriteTextProto(tensorflow::Env::Default(), "step_0_before_optimize_" + file_name_suffix,
+                     optimize_graph->ToGraphDefDebug());
+    }
+
+    tensorflow::OptimizeGraph(flr, &optimize_graph);
+
+    if (kDumpExecutionDetail || kDumpGraph) {
+      WriteTextProto(tensorflow::Env::Default(), "step_1_after_optimize_" + file_name_suffix,
+                     optimize_graph->ToGraphDefDebug());
+    }
+
+    std::vector<tensorflow::ResourceHandle> dependent_host_resources;
+    NPU_CTX_REQUIRES_OK(
+      s, TransResourceInput2GraphNode(context, optimize_graph.get(), num_inputs, inputs, dependent_host_resources));
+    if (kDumpExecutionDetail || kDumpGraph) {
+      WriteTextProto(tensorflow::Env::Default(), "step_2_after_assemble_resource_node_" + file_name_suffix,
+                     optimize_graph->ToGraphDefDebug());
+    }
+
+    PruneFunction(*fdef, optimize_graph.get());
+
+    // TODO: The parser requires the marked attributes not to start with an underscore. Marking the graph directly
+    // would make it impossible to fall back to TF execution, so for now a copy of the graph is marked instead
+    std::unique_ptr<tensorflow::Graph> mark_shape_graph = std::make_unique<tensorflow::Graph>(lib_def);
+    CopyGraph(*optimize_graph, mark_shape_graph.get());
+    DLOG() << "NPU Start inferring shape for function node " << op_name;
+    MarkGraphNodeInOutDesc(context, mark_shape_graph.get(), num_inputs, inputs);
+    FixGraphArgRetvalIndex(mark_shape_graph.get());  // Arg nodes may be optimized away, so re-number their indexes and prune the inputs
+
+    if (kDumpExecutionDetail || kDumpGraph) {
+      tensorflow::GraphDef gdef;
+      mark_shape_graph->ToGraphDef(&gdef);
+      tensorflow::FunctionDefLibrary fdef_lib;
+      for (const auto &fn : lib_def->ListFunctionNames()) { *fdef_lib.add_function() = *lib_def->Find(fn); }
+      *gdef.mutable_library() = fdef_lib;
+      WriteTextProto(tensorflow::Env::Default(), "step_3_after_mark_shape_" + file_name_suffix, gdef);
+    }
+    // Because the extra attributes agreed with the parser are not anonymous (they do not start with an underscore),
+    // a separate copy of the graph is used here to carry the parser-required attributes
+    tensorflow::GraphDef function_graph_def;
+    mark_shape_graph->ToGraphDef(&function_graph_def);
+    uint64_t graph_id =
+      kCustomKernelEnabled ?
AddGeGraph(context, std::string("tf_function_") + op_name, function_graph_def, s) : 0; + if (TF_GetCode(s) != TF_OK) return; + + std::vector remain_indexes; + std::vector pruned_inputs; + for (auto node : optimize_graph->nodes()) { + if (node->IsArg()) { + auto index = node->attrs().Find("index")->i(); + remain_indexes.push_back(index); + pruned_inputs.push_back(inputs[index]); + } + } + FixGraphArgRetvalIndex(optimize_graph.get()); // 必须在保存完remain index后fix arg index + DLOG() << std::string("tf_function_") + op_name << " remained input index (0-" << num_inputs - 1 << ") -> " + << VecToString(remain_indexes); + auto lambda = [remain_indexes](int num_inputs, TFE_TensorHandle **inputs, std::vector &pruned) { + for (auto index : remain_indexes) { pruned.push_back(inputs[index]); } + }; + // 对于function节点,可以将resource的输入NPU兼容性作为缓存项目,校验输入是否被NPU支持,如果类型不支持,或者是CPU的Resouce类型,则不支持 + // 如果是单算子,则不能缓存,需要在每次dev->Run的时候,校验单算子资源输入的兼容性 + *spec = + CacheFuncSpec(op_name, op_reg_data, ndef, graph_id, std::move(optimize_graph), lambda, dependent_host_resources, + ValidateInput(op_name, pruned_inputs.size(), pruned_inputs.data()).error_message()); + return; + } else { + // 进行inferShape,输出可能是unknown shape,所以使用partial shape + TensorShapes input_shapes; + input_shapes.resize(num_inputs); + for (int i = 0; i < num_inputs; i++) { + NPU_CTX_REQUIRES_OK(s, npu::UnwrapHandle(inputs[i])->Shape(&input_shapes[i])); + } + TensorPartialShapes partial_shapes; + bool requested_input_value = false; + if (!data_types.empty()) { + DLOG() << "Infer shape for op " << op_name; + tensorflow::Status infer_status = + InferShape(context, op_reg_data, ndef, num_inputs, inputs, partial_shapes, requested_input_value); + // 如果inferShape失败,或者期望输出数量不对,则fallback回CPU,因为CPU的计算并不依赖inferShape + if (!infer_status.ok()) { + *spec = CacheOpSpec(op_name, op_reg_data, ndef, input_shapes, partial_shapes, infer_status.error_message()); + return; + } + } else { + DLOG() << "Skip infer shape for non-output op " << op_name; + } + const std::string reason = ValidateInput(op_name, num_inputs, inputs).error_message(); + if (requested_input_value) { + *spec = CacheOpSpec(op_name, op_reg_data, ndef, input_shapes, reason); + } else { + *spec = CacheOpSpec(op_name, op_reg_data, ndef, input_shapes, partial_shapes, reason); + } + return; + } +} + +void NpuDevice::FallbackCPU(TFE_Context *context, const char *op_name, const TFE_OpAttrs *attributes, int num_inputs, + TFE_TensorHandle **inputs, int *num_outputs, TFE_TensorHandle **outputs, + TF_Status *status) { + DLOG() << "Start fallback executing " << op_name << " by " << underlying_device; + TFE_Op *op(TFE_NewOp(context, op_name, status)); + if (TF_GetCode(status) != TF_OK) return; + TFE_OpAddAttrs(op, attributes); + TFE_OpSetDevice(op, underlying_device.c_str(), status); + std::vector copied_tensor_handles; //最后需要释放掉临时拷贝而来的输入cpu handle + for (int j = 0; j < num_inputs; ++j) { + TFE_TensorHandle *input = inputs[j]; + if (IsNpuTensorHandle(npu::UnwrapHandle(input))) { + input = CopyTensorD2H(context, input, status); // 创建完成计数为1 + copied_tensor_handles.emplace_back(input); + if (TF_GetCode(status) != TF_OK) return; + } + if (kDumpExecutionDetail) { + const tensorflow::Tensor *tensor = nullptr; + npu::UnwrapTensor(input, &tensor); + LOG(INFO) << " input " << j << " " << tensor->DebugString(); + } + TFE_OpAddInput(op, input, status); // add完成计数为2 + if (TF_GetCode(status) != TF_OK) return; + } + + std::vector op_outputs(*num_outputs); + TFE_Execute(op, op_outputs.data(), num_outputs, status); + TFE_DeleteOp(op); + for 
(auto handle : copied_tensor_handles) { TFE_DeleteTensorHandle(handle); }
+  if (TF_GetCode(status) != TF_OK) return;
+  for (int i = 0; i < *num_outputs; ++i) { outputs[i] = op_outputs[i]; }
+
+  NpuFallbackHookFunc *hook = nullptr;
+  if (CustomKernelRegistry::Instance().GetFallbackHookFunc(op_name, &hook)) {
+    (*hook)(context, this, op_name, attributes, num_inputs, inputs, *num_outputs, outputs, status);
+    if (TF_GetCode(status) != TF_OK) return;
+  }
+}
+
+void NpuDevice::Execute(const TFE_Op *op, int *num_outputs, TFE_TensorHandle **outputs, TF_Status *s) {
+  auto context = TFE_OpGetContext(op, s);
+  if (TF_GetCode(s) != TF_OK) { return; }
+  auto num_inputs = TFE_OpGetFlatInputCount(op, s);
+  if (TF_GetCode(s) != TF_OK) { return; }
+  std::vector<TFE_TensorHandle *> inputs;
+  for (int i = 0; i < num_inputs; i++) {
+    inputs.push_back(TFE_OpGetFlatInput(op, i, s));
+    if (TF_GetCode(s) != TF_OK) { return; }
+  }
+  auto op_name = TFE_OpGetName(op, s);
+  if (TF_GetCode(s) != TF_OK) { return; }
+  auto attributes = TFE_OpGetAttrs(op);
+  DLOG() << "NPU Start executing " << op_name;
+  // If an op has inputs coming from more than one device, an error must be reported directly
+  bool cpu_resource = false;
+  NPU_CTX_REQUIRES_OK(s, ValidateResourcePlacement(op_name, num_inputs, inputs.data(), cpu_resource));
+  // If the op has a resource input that lives on the CPU, it must fall back to the CPU
+  if (cpu_resource) {
+    DLOG() << "NPU Executing " << op_name << " fallback[input resource from cpu]";
+    FallbackCPU(context, op_name, attributes, inputs.size(), inputs.data(), num_outputs, outputs, s);
+    return;
+  }
+  std::shared_ptr<const npu::TaskSpec> spec;
+  GetOrCreateSpec(context, op_name, attributes, inputs.size(), inputs.data(), &spec, s);
+  if (TF_GetCode(s) != TF_OK) { return; }
+  DLOG() << "NPU Executing " << op_name << " found cached spec " << spec->DebugString();
+  if (spec->ShouldFallback()) {
+    DLOG() << "NPU Executing " << op_name << " fallback[" << spec->FallbackReason() << "]";
+    FallbackCPU(context, op_name, attributes, inputs.size(), inputs.data(), num_outputs, outputs, s);
+    if (TF_GetCode(s) != TF_OK) {
+      LOG(ERROR) << "NPU Executing " << op_name << " fallback failed";
+      std::stringstream ss;
+      ss << spec->DebugString() << std::endl;
+      for (int i = 0; i < num_inputs; i++) {
+        tensorflow::Status status;
+        const tensorflow::Tensor *tensor = nullptr;
+        npu::UnwrapHandle(inputs[i])->DeviceName(&status);
+        npu::UnwrapTensor(inputs[i], &tensor);
+        ss << "input " << i << " " << tensorflow::DataTypeString(tensor->dtype()) << " device "
+           << npu::UnwrapHandle(inputs[i])->DeviceName(&status) << std::endl;
+      }
+      LOG(ERROR) << ss.str();
+    }
+  } else {
+    DLOG() << "NPU Executing " << op_name << " dispatched to npu executor";
+    Run(context, spec, inputs.size(), inputs.data(), num_outputs, outputs, s);
+  }
+}
+
+void NpuDevice::Run(TFE_Context *context, std::shared_ptr<const npu::TaskSpec> spec, int num_inputs,
+                    TFE_TensorHandle **inputs, int *num_outputs, TFE_TensorHandle **outputs, TF_Status *status) {
+  if (spec->IsFunctionOp()) {
+    DLOG() << "NPU Executor start executing function op " << spec->Op();
+    RunGraph(context, reinterpret_cast<const npu::FuncSpec *>(spec.get()), num_inputs, inputs, num_outputs, outputs,
+             status);
+  } else {
+    DLOG() << "NPU Executor start executing normal op " << spec->Op();
+    RunOp(context, reinterpret_cast<const npu::OpSpec *>(spec.get()), num_inputs, inputs, num_outputs, outputs, status);
+  }
+}
+
+void NpuDevice::RunOp(TFE_Context *context, const npu::OpSpec *spec, int num_inputs, TFE_TensorHandle **inputs,
+                      int *num_outputs, TFE_TensorHandle **outputs, TF_Status *status) {
+  TensorShapes output_shapes;
+  tensorflow::NodeDef parser_ndef = spec->ParserNodeDef();
+  if
(spec->ShouldInferShape()) { + DLOG() << "NPU Executing op " << spec->Op() << " need re-infer shape"; + TensorPartialShapes partial_shapes; + bool unused = false; + bool should_fallback = + !InferShape(context, spec->OpRegistrationData(), spec->NodeDef(), num_inputs, inputs, partial_shapes, unused) + .ok(); + if (!should_fallback) { + output_shapes.resize(partial_shapes.size()); + for (size_t i = 0; i < partial_shapes.size(); i++) { + DLOG() << "NPU Executing op " << spec->Op() << " re-infer shape output " << i + << partial_shapes[i].DebugString(); + if (!partial_shapes[i].AsTensorShape(&output_shapes[i])) { + should_fallback = true; + break; + } + } + } + if (should_fallback) { + DLOG() << "NPU Executing op " << spec->Op() << " fallback cpu after re-infer shape"; + tensorflow::AttrBuilder attr_builder; + attr_builder.Reset(spec->Op().c_str()); + attr_builder.BuildNodeDef(); + auto attrs = spec->NodeDef().attr(); + for (auto &attr : attrs) { attr_builder.Set(attr.first, attr.second); } + FallbackCPU(context, spec->Op().c_str(), tensorflow::wrap(&attr_builder), num_inputs, inputs, num_outputs, + outputs, status); + return; + } + AssembleOutputDesc(output_shapes, spec->OutputTypes(), &parser_ndef); + } else { + output_shapes = spec->OutputShapes(); + } + + if (kCustomKernelEnabled) { + NpuCustomKernelFunc *custom_kernel = nullptr; + if (CustomKernelRegistry::Instance().GetCustomKernelFunc(spec->Op(), &custom_kernel)) { + (*custom_kernel)(context, this, spec, output_shapes, parser_ndef, num_inputs, inputs, *num_outputs, outputs, + status); + return; + } + } + + // 输入如果是CPU,此时要转换成NPU + std::vector npu_inputs(num_inputs); + std::vector copied_tensor_handles; + for (int i = 0; i < num_inputs; ++i) { + TFE_TensorHandle *input = inputs[i]; + // 到达这里的Resource,要么是CPU的镜像 要么是NPU + if (!IsNpuTensorHandle(npu::UnwrapHandle(input)) + && npu::UnwrapHandle(input)->DataType() != tensorflow::DT_RESOURCE) { + tensorflow::Status s; + auto src_name = npu::UnwrapHandle(input)->DeviceName(&s); + NPU_CTX_REQUIRES_OK(status, s); + DLOG() << "Copying " << spec->Op() << " input:" << i + << " type:" << tensorflow::DataTypeString(npu::UnwrapHandle(input)->DataType()) << " to NPU from " + << src_name << " for acl executing"; + // 这里需要根据算子选择输入格式了 + input = CopyTensorH2D(context, input, Format::FORMAT_ND, status); + copied_tensor_handles.emplace_back(input); + if (TF_GetCode(status) != TF_OK) return; + } + npu_inputs[i] = input; + } + const auto &output_types = spec->OutputTypes(); + for (size_t i = 0; i < output_types.size(); ++i) { + if (output_types[i] == tensorflow::DT_RESOURCE) { + outputs[i] = NewDeviceResourceHandle(context, output_shapes[i], status); + if (TF_GetCode(status) != TF_OK) { return; } + } else { + outputs[i] = NewDeviceTensorHandle(context, Format::FORMAT_ND, output_shapes[i], output_types[i], status); + if (TF_GetCode(status) != TF_OK) { return; } + } + } + /******************************************模拟NPU执行Start************************************/ + // TODO:下面换成真实的ACL调用即可,当前直接FallbackCPU + // npu_inputs 指向NPU内存的TFE_TensorHandle** + // outputs 指向NPU内存的TFE_TensorHandle** + // parser_ndef 打了输入输出描述的ndef,需要优化,后续直接存储ACL的结构体 + // copied_tensor_handles 存储临时申请的TFE_TensorHandle对象,除输入输出外,必须在最后显式释放 + // output_shapes 临时变量,算子的输出shape + // spec 待运算算子的说明信息,必定包含InputShapes(),InputTypes(),OutputTypes(),不一定包含OutputShapes()(因为有的算子inferShape依赖输入的值(如reshape),输出shape需要使用上面的output_shapes临时变量) + + /* + 从TFE_TensorHandle*获取NpuManagedBuffer: + const tensorflow::Tensor *npu_tensor = nullptr; + NPU_CTX_REQUIRES_OK(status, 
npu::UnwrapTensor(npu_inputs[i], &npu_tensor)); + npu::Unwrap(npu_tensor); // 返回值就是NpuManagedBuffer* + */ + std::vector acl_inputs(num_inputs); + for (int i = 0; i < num_inputs; ++i) { + const tensorflow::Tensor *npu_tensor = nullptr; + NPU_CTX_REQUIRES_OK(status, npu::UnwrapTensor(npu_inputs[i], &npu_tensor)); + tensorflow::Tensor cpu_tensor(npu_tensor->dtype(), npu_tensor->shape()); + if (npu_tensor->dtype() == tensorflow::DT_RESOURCE) { + for (int j = 0; j < npu_tensor->NumElements(); j++) { + cpu_tensor.flat()(j) = + const_cast(npu_tensor)->flat()(j); + } + } else { + NPU_CTX_REQUIRES_OK(status, npu::Unwrap(npu_tensor)->AssembleTo(&cpu_tensor)); + } + acl_inputs[i] = tensorflow::wrap(tensorflow::TensorHandle::CreateLocalHandle(cpu_tensor)); + copied_tensor_handles.push_back(acl_inputs[i]); + if (TF_GetCode(status) != TF_OK) return; + } + /**********调用CPU模拟NPU Start*************/ + std::vector acl_outputs(*num_outputs); + tensorflow::AttrBuilder attr_builder; + attr_builder.Reset(spec->Op().c_str()); + attr_builder.BuildNodeDef(); + auto attrs = spec->NodeDef().attr(); + for (auto &attr : attrs) { attr_builder.Set(attr.first, attr.second); } + + FallbackCPU(context, spec->Op().c_str(), tensorflow::wrap(&attr_builder), num_inputs, acl_inputs.data(), num_outputs, + acl_outputs.data(), status); + if (TF_GetCode(status) != TF_OK) return; + /**********调用CPU模拟NPU End*************/ + for (int i = 0; i < *num_outputs; ++i) { + const tensorflow::Tensor *acl_tensor = nullptr; + NPU_CTX_REQUIRES_OK(status, npu::UnwrapTensor(acl_outputs[i], &acl_tensor)); + const tensorflow::Tensor *npu_tensor = nullptr; + NPU_CTX_REQUIRES_OK(status, npu::UnwrapTensor(outputs[i], &npu_tensor)); + if (spec->OutputTypes()[i] == tensorflow::DT_RESOURCE) { + for (int j = 0; j < npu_tensor->NumElements(); j++) { + const_cast(npu_tensor)->flat()(j) = + acl_tensor->flat()(j); + } + } else { + NPU_CTX_REQUIRES_OK(status, npu::Unwrap(npu_tensor)->AssembleFrom(acl_tensor)); + } + TFE_DeleteTensorHandle(acl_outputs[i]); + if (TF_GetCode(status) != TF_OK) return; + } + /******************************************模拟NPU执行End************************************/ + DLOG() << "NPU Executing op " << spec->Op() << " succeed by npu excutor"; + for (auto handle : copied_tensor_handles) { TFE_DeleteTensorHandle(handle); } // 计数-2 +} + +void NpuDevice::RunGraph(TFE_Context *context, const npu::FuncSpec *spec, int tf_num_inputs, + TFE_TensorHandle **tf_inputs, int *num_outputs, TFE_TensorHandle **outputs, + TF_Status *status) { + std::vector pruned_inputs; + spec->PruneInputs(tf_num_inputs, tf_inputs, pruned_inputs); + int num_inputs = pruned_inputs.size(); + TFE_TensorHandle **inputs = pruned_inputs.data(); + // 注意,因为GE当前执行图的时候,输入输出内存都是Host的,所以这里和ACL执行相反,如果输入是NPU,则需要转回CPU,特别的,对于资源类,当前采取的策略是资源入图 + // 输入如果是NPU,此时要转换成CPU + std::vector npu_inputs(num_inputs); + std::vector copied_tensor_handles; + for (int i = 0; i < num_inputs; ++i) { + TFE_TensorHandle *input = inputs[i]; + // 到达这里的Resource,要么是CPU的镜像 要么是NPU + if (IsNpuTensorHandle(npu::UnwrapHandle(input)) + && npu::UnwrapHandle(input)->DataType() != tensorflow::DT_RESOURCE) { + tensorflow::Status tf_status; + auto src_name = npu::UnwrapHandle(input)->DeviceName(&tf_status); + NPU_CTX_REQUIRES_OK(status, tf_status); + DLOG() << "Copying " << spec->Op() << " input:" << i + << " type:" << tensorflow::DataTypeString(npu::UnwrapHandle(input)->DataType()) << " to " << src_name + << " from NPU for graph engine executing"; + // 这里需要根据算子选择输入格式了 + input = CopyTensorD2H(context, input, status); + 
copied_tensor_handles.emplace_back(input); + if (TF_GetCode(status) != TF_OK) return; + } + npu_inputs[i] = input; + } + + if (kCustomKernelEnabled) { + // TODO:这里根据小循环策略修改值 + int64_t iterations_per_loop = kGlobalLoopSize; + size_t num_dependent_resources = spec->DependentHostResources().size(); + for (const auto &resource : spec->DependentHostResources()) { + LOG(INFO) << "Start consume iterator resource " << resource.name() << " " << iterations_per_loop << " times"; + // 注意,这个callback不能引用捕获,防止中途因为消费某个资源失败而导致coredump + auto done = [resource, iterations_per_loop](const tensorflow::Status &s) { + LOG(INFO) << "Iterator resource " << resource.name() << " consume " << iterations_per_loop + << " times done with status " << s.ToString(); + }; + NPU_CTX_REQUIRES_OK(status, ConsumeIteratorAsync(resource, iterations_per_loop, done)); + } + LOG(INFO) << "Start run ge graph " << spec->GeGraphId() << " pin to cpu, loop size " << iterations_per_loop; + npu::Timer timer("Graph engine run ", iterations_per_loop, " times for graph ", spec->GeGraphId()); + timer.Start(); + RunGeGraphPin2Cpu(context, spec->GeGraphId(), num_inputs, inputs, spec->OutputTypes(), *num_outputs, outputs, + status); + timer.Stop(); + return; + } + /******************************************模拟NPU执行Start************************************/ + // TODO:下面换成真实的GE调用即可,当前直接FallbackCPU + // inputs 指向CPU内存的TFE_TensorHandle** + // copied_tensor_handles 存储临时申请的TFE_TensorHandle对象,除输入输出外,必须在最后显式释放 + // output_shapes 临时变量,算子的输出shape + // spec 待运算算子的说明信息,必定包含InputShapes(),InputTypes(),OutputTypes(),Graph(),GeGraphId(),不包含OutputShapes() + + std::vector acl_inputs(num_inputs); + for (int i = 0; i < num_inputs; ++i) { + if (IsNpuTensorHandle(npu::UnwrapHandle(npu_inputs[i])) + && npu::UnwrapHandle(npu_inputs[i])->DataType() == tensorflow::DT_RESOURCE) { + const tensorflow::Tensor *npu_tensor = nullptr; + NPU_CTX_REQUIRES_OK(status, npu::UnwrapTensor(npu_inputs[i], &npu_tensor)); + tensorflow::Tensor cpu_tensor(npu_tensor->dtype(), npu_tensor->shape()); + for (int j = 0; j < npu_tensor->NumElements(); j++) { + cpu_tensor.flat()(j) = + const_cast(npu_tensor)->flat()(j); + } + acl_inputs[i] = tensorflow::wrap(tensorflow::TensorHandle::CreateLocalHandle(cpu_tensor)); + copied_tensor_handles.push_back(acl_inputs[i]); + } else { + acl_inputs[i] = npu_inputs[i]; + } + } + /**********调用CPU模拟NPU Start*************/ + std::vector acl_outputs(*num_outputs); + tensorflow::FunctionDef optimized_fdef; + tensorflow::FunctionLibraryDefinition *lib_def = npu::UnwrapCtx(context)->FuncLibDef(); + auto fdef = lib_def->Find(spec->Op()); + auto lookup = [&fdef](const tensorflow::Node *node) -> absl::optional { + for (const auto &control_ret : fdef->control_ret()) { + if (control_ret.second == node->name()) { return absl::make_optional(node->name()); } + } + return absl::nullopt; + }; + std::string acl_op_name = std::string(spec->Op()) + "_npu_optimized"; + tensorflow::GraphToFunctionDef(*spec->Graph(), acl_op_name, lookup, &optimized_fdef); + lib_def->RemoveFunction(acl_op_name); + lib_def->AddFunctionDef(optimized_fdef); + + tensorflow::AttrBuilder attr_builder; + attr_builder.Reset(spec->Op().c_str()); + attr_builder.BuildNodeDef(); + auto attrs = spec->NodeDef().attr(); + for (auto &attr : attrs) { attr_builder.Set(attr.first, attr.second); } + + FallbackCPU(context, acl_op_name.c_str(), tensorflow::wrap(&attr_builder), num_inputs, acl_inputs.data(), num_outputs, + acl_outputs.data(), status); + if (TF_GetCode(status) != TF_OK) return; + /**********调用CPU模拟NPU 
End*************/ + for (int i = 0; i < *num_outputs; ++i) { + const tensorflow::Tensor *acl_tensor = nullptr; + NPU_CTX_REQUIRES_OK(status, npu::UnwrapTensor(acl_outputs[i], &acl_tensor)); + /**********回调Start*********/ + if (acl_tensor->dtype() == tensorflow::DT_RESOURCE) { + outputs[i] = NewDeviceResourceHandle(context, acl_tensor->shape(), status); + if (TF_GetCode(status) != TF_OK) { return; } + } else { + outputs[i] = NewDeviceTensorHandle(context, Format::FORMAT_ND, acl_tensor->shape(), acl_tensor->dtype(), status); + if (TF_GetCode(status) != TF_OK) { return; } + } + /**********回调End*********/ + const tensorflow::Tensor *npu_tensor = nullptr; + NPU_CTX_REQUIRES_OK(status, npu::UnwrapTensor(outputs[i], &npu_tensor)); + if (acl_tensor->dtype() == tensorflow::DT_RESOURCE) { + for (int j = 0; j < npu_tensor->NumElements(); j++) { + const_cast(npu_tensor)->flat()(j) = + acl_tensor->flat()(j); + } + } else { + NPU_CTX_REQUIRES_OK(status, npu::Unwrap(npu_tensor)->AssembleFrom(acl_tensor)); + } + TFE_DeleteTensorHandle(acl_outputs[i]); + if (TF_GetCode(status) != TF_OK) return; + } + /******************************************模拟NPU执行End************************************/ + DLOG() << "NPU Executing function op " << spec->Op() << " succeed by npu executor"; + for (auto handle : copied_tensor_handles) { TFE_DeleteTensorHandle(handle); } // 计数-2 +} + +void NpuDevice::RunGeGraphAsync(TFE_Context *context, uint64_t graph_id, int num_inputs, TFE_TensorHandle **inputs, + bool pin_to_npu, const TensorDataTypes &output_types, int num_outputs, + TFE_TensorHandle **outputs, DoneCallback done, TF_Status *status) { + std::vector ge_inputs; + + DLOG() << "Ge graph " << graph_id << " input info"; + for (int i = 0; i < num_inputs; i++) { + const tensorflow::Tensor *tensor = nullptr; + npu::UnwrapTensor(inputs[i], &tensor); + + const static std::shared_ptr parser = + domi::ModelParserFactory::Instance()->CreateModelParser(domi::FrameworkType::TENSORFLOW); + if (parser == nullptr) { + status->status = tensorflow::errors::Internal("NPU Create new tensorflow model parser failed"); + return; + } + ge::DataType ge_type = parser->ConvertToGeDataType(static_cast(tensor->dtype())); + NPU_CTX_REQUIRES(status, ge_type != ge::DT_UNDEFINED, + tensorflow::errors::InvalidArgument("Failed map tensorflow data type ", + tensorflow::DataTypeString(tensor->dtype()), + " to ge data type")); + ge::InputTensorInfo input; + input.data_type = static_cast(ge_type); + for (auto dim_size : tensor->shape().dim_sizes()) { input.dims.emplace_back(dim_size); } + input.data = const_cast(tensor->tensor_data().data()); + input.length = tensor->TotalBytes(); + ge_inputs.emplace_back(input); + DLOG() << " input " << i << " ge enum " << input.data_type << " tf type " + << tensorflow::DataTypeString(tensor->dtype()) << VecToString(input.dims); + } + auto ge_callback = [&, graph_id](ge::Status s, std::vector &ge_outputs) { + if (s == ge::END_OF_SEQUENCE) { + done(tensorflow::errors::OutOfRange("Graph engine process graph ", graph_id, " reach end of sequence")); + return; + } else if (s != ge::SUCCESS) { + std::string err_msg = ge::StatusFactory::Instance()->GetErrDesc(s); + if (err_msg.empty()) { err_msg = " code:" + std::to_string(s); } + done(tensorflow::errors::Internal("Graph engine process graph failed: ", err_msg)); + return; + } else if (ge_outputs.size() != num_outputs) { + done(tensorflow::errors::Internal("Graph engine process graph succeed but output num ", ge_outputs.size(), + " mismatch with expected ", num_outputs)); + return; + 
} + + DLOG() << "Ge graph " << graph_id << " output info"; + for (size_t i = 0; i < ge_outputs.size(); i++) { + auto &ge_tensor = ge_outputs[i]; + std::vector dims; + for (auto dim_size : ge_tensor.dims) { dims.push_back(dim_size); } + tensorflow::TensorShape shape; + tensorflow::Status tf_status = tensorflow::TensorShapeUtils::MakeShape(dims.data(), dims.size(), &shape); + if (!tf_status.ok()) { + done(tensorflow::errors::Internal("Graph engine process graph succeed but output ", i, " dims invalid ", + VecToString(ge_tensor.dims), " ", tf_status.error_message())); + return; + } + DLOG() << " output " << i << " ge type enum " << ge_tensor.data_type << " tf type " + << tensorflow::DataTypeString(output_types[i]) << shape.DebugString(); + + const static int64_t kTensorAlignBytes = 64; + if (reinterpret_cast(ge_tensor.data.get()) % kTensorAlignBytes == 0) { + DLOG() << "Zero copy ge tensor " << reinterpret_cast(ge_tensor.data.get()) << " as aligned with " + << kTensorAlignBytes << " bytes"; + tensorflow::Allocator *allocator = NpuHostFixedAllocator::Create(std::move(ge_tensor.data)); + tensorflow::Tensor cpu_tensor(allocator, output_types[i], shape); + if (ge_tensor.length != cpu_tensor.TotalBytes()) { + done(tensorflow::errors::Internal("Graph engine process graph succeed but output ", i, " total bytes ", + ge_tensor.length, " mismatch with expected ", cpu_tensor.TotalBytes())); + return; + } + outputs[i] = tensorflow::wrap(tensorflow::TensorHandle::CreateLocalHandle(cpu_tensor)); + } else { + DLOG() << "Skip zero copy as ge tensor " << reinterpret_cast(ge_tensor.data.get()) + << " not aligned with " << kTensorAlignBytes << " bytes"; + tensorflow::Tensor cpu_tensor(output_types[i], shape); + if (ge_tensor.length != cpu_tensor.TotalBytes()) { + done(tensorflow::errors::Internal("Graph engine process graph succeed but output ", i, " total bytes ", + ge_tensor.length, " mismatch with expected ", cpu_tensor.TotalBytes())); + return; + } + memcpy(const_cast(cpu_tensor.tensor_data().data()), ge_tensor.data.get(), ge_tensor.length); + outputs[i] = tensorflow::wrap(tensorflow::TensorHandle::CreateLocalHandle(cpu_tensor)); + } + + if (pin_to_npu) { + TFE_TensorHandle *handle = outputs[i]; + outputs[i] = CopyTensorH2D(context, handle, status); + TFE_DeleteTensorHandle(handle); + if (TF_GetCode(status) != TF_OK) { + done(tensorflow::Status(status->status.code(), + std::string("Graph engine process graph succeed but copy output ") + std::to_string(i) + + " to npu failed " + status->status.error_message())); + return; + } + } + } + done(tensorflow::Status::OK()); + }; + NPU_CTX_REQUIRES_GE_OK(status, "NPU Schedule graph to graph engine", + ge_session_->RunGraphAsync(graph_id, ge_inputs, ge_callback)); +} + +uint64_t NpuDevice::AddGeGraph(TFE_Context *context, const std::string &name, const tensorflow::GraphDef &def, + TF_Status *status) { + uint64_t graph_id = NextUUID(); + auto ge_compute_graph = std::make_shared(name); + std::shared_ptr parser = + domi::ModelParserFactory::Instance()->CreateModelParser(domi::FrameworkType::TENSORFLOW); + if (parser == nullptr) { + status->status = tensorflow::errors::Internal("NPU Create new tensorflow model parser failed"); + return graph_id; + } + + auto request_subgraph = [this, name, context](const google::protobuf::Message *root_proto, + const std::string &fn) -> std::unique_ptr { + DLOG() << "Tensorflow model parser requesting subgraph " << fn << " for ge graph " << name; + tensorflow::FunctionLibraryDefinition *lib_def = npu::UnwrapCtx(context)->FuncLibDef(); + 
const tensorflow::FunctionDef *fdef = lib_def->Find(fn); + if (fdef == nullptr) { return nullptr; } + std::unique_ptr fbody; + auto status = FunctionDefToBodyHelper(*fdef, tensorflow::AttrSlice{}, lib_def, &fbody); + if (!status.ok()) { + LOG(ERROR) << "Failed trans function body to graph"; + return nullptr; + } + + tensorflow::ProcessFunctionLibraryRuntime *pflr = npu::UnwrapCtx(context)->pflr(); + tensorflow::FunctionLibraryRuntime *flr = pflr->GetFLR("/job:localhost/replica:0/task:0/device:CPU:0"); + + std::unique_ptr graph = std::make_unique(lib_def); + CopyGraph(*fbody->graph, graph.get()); + tensorflow::OptimizeGraph(flr, &graph); + + PruneFunction(*fdef, graph.get()); + + MarkGraphNodeInOutDesc(context, graph.get(), 0, nullptr); + std::unique_ptr subgraph; + subgraph.reset(new (std::nothrow) tensorflow::GraphDef()); + if (subgraph != nullptr) { graph->ToGraphDef(reinterpret_cast(subgraph.get())); } + if (kDumpExecutionDetail || kDumpGraph) { + WriteTextProto(tensorflow::Env::Default(), name + "_subgraph_" + fn + ".pbtxt", *subgraph); + } + return subgraph; + }; + + NPU_CTX_REQUIRES_GE_OK_RETURN(status, "NPU Parse tensorflow model", + parser->ParseProtoWithSubgraph(&def, request_subgraph, ge_compute_graph), graph_id); + + ge::Graph ge_graph = ge::GraphUtils::CreateGraphFromComputeGraph(ge_compute_graph); + NPU_CTX_REQUIRES_GE_OK_RETURN(status, "Graph engine Add graph", GeSession()->AddGraph(graph_id, ge_graph), graph_id); + return graph_id; +} + +void NpuDevice::RemoveGeGraph(TFE_Context *context, uint64_t graph_id, TF_Status *status) { + NPU_CTX_REQUIRES_GE_OK(status, "Graph engine Remove graph", GeSession()->RemoveGraph(graph_id)); +} + +void NpuDevice::RunGeGraph(TFE_Context *context, uint64_t graph_id, int num_inputs, TFE_TensorHandle **inputs, + bool pin_to_npu, const TensorDataTypes &output_types, int num_outputs, + TFE_TensorHandle **outputs, TF_Status *status) { + tensorflow::Notification notification; + auto done = [status, ¬ification](tensorflow::Status s) { + status->status = std::move(s); + notification.Notify(); + }; + RunGeGraphAsync(context, graph_id, num_inputs, inputs, pin_to_npu, output_types, num_outputs, outputs, done, status); + notification.WaitForNotification(); +} + +void NpuDevice::RunGeGraphPin2CpuAsync(TFE_Context *context, uint64_t graph_id, int num_inputs, + TFE_TensorHandle **inputs, const TensorDataTypes &output_types, int num_outputs, + TFE_TensorHandle **outputs, DoneCallback done, TF_Status *status) { + RunGeGraphAsync(context, graph_id, num_inputs, inputs, false, output_types, num_outputs, outputs, std::move(done), + status); +} + +void NpuDevice::RunGeGraphPin2NpuAsync(TFE_Context *context, uint64_t graph_id, int num_inputs, + TFE_TensorHandle **inputs, const TensorDataTypes &output_types, int num_outputs, + TFE_TensorHandle **outputs, DoneCallback done, TF_Status *status) { + RunGeGraphAsync(context, graph_id, num_inputs, inputs, true, output_types, num_outputs, outputs, std::move(done), + status); +} + +void NpuDevice::RunGeGraphPin2Cpu(TFE_Context *context, uint64_t graph_id, int num_inputs, TFE_TensorHandle **inputs, + const TensorDataTypes &output_types, int num_outputs, TFE_TensorHandle **outputs, + TF_Status *status) { + RunGeGraph(context, graph_id, num_inputs, inputs, false, output_types, num_outputs, outputs, status); +} + +void NpuDevice::RunGeGraphPin2Npu(TFE_Context *context, uint64_t graph_id, int num_inputs, TFE_TensorHandle **inputs, + const TensorDataTypes &output_types, int num_outputs, TFE_TensorHandle **outputs, + TF_Status 
*status) { + RunGeGraph(context, graph_id, num_inputs, inputs, true, output_types, num_outputs, outputs, status); +} + +void NpuDevice::RunGeGraphAnonymous(TFE_Context *context, const std::string &name, const tensorflow::GraphDef &gdef, + int num_inputs, TFE_TensorHandle **inputs, bool pin_to_npu, int num_outputs, + TFE_TensorHandle **outputs, TF_Status *status) { + uint64_t graph_id = AddGeGraph(context, name, gdef, status); + if (TF_GetCode(status) != TF_OK) return; + + std::map indexed_types; + + for (const auto &node : gdef.node()) { + if (node.op() == "_Retval") { + tensorflow::DataType type; + tensorflow::GetNodeAttr(node, "T", &type); + int index; + tensorflow::GetNodeAttr(node, "index", &index); + indexed_types[index] = type; + } + } + TensorDataTypes types; + for (auto indexed_type : indexed_types) { types.emplace_back(indexed_type.second); } + + RunGeGraph(context, graph_id, num_inputs, inputs, pin_to_npu, types, num_outputs, outputs, status); + if (TF_GetCode(status) != TF_OK) return; + + RemoveGeGraph(context, graph_id, status); + if (TF_GetCode(status) != TF_OK) return; +} + +void NpuDevice::RunGeGraphPin2CpuAnonymous(TFE_Context *context, const std::string &name, + const tensorflow::GraphDef &gdef, int num_inputs, TFE_TensorHandle **inputs, + int num_outputs, TFE_TensorHandle **outputs, TF_Status *status) { + RunGeGraphAnonymous(context, name, gdef, num_inputs, inputs, false, num_outputs, outputs, status); +} + +void NpuDevice::RunGeGraphPin2NpuAnonymous(TFE_Context *context, const std::string &name, + const tensorflow::GraphDef &gdef, int num_inputs, TFE_TensorHandle **inputs, + int num_outputs, TFE_TensorHandle **outputs, TF_Status *status) { + RunGeGraphAnonymous(context, name, gdef, num_inputs, inputs, true, num_outputs, outputs, status); +} + +void NpuDevice::GetCachedTaskSpec(const tensorflow::NodeDef &ndef, std::shared_ptr *spec, + bool &request_shape) { + *spec = nullptr; + const auto &op = ndef.op(); + if (cached_func_specs_.find(op) == cached_func_specs_.end()) { + HashKey attr_hash = Hash(ndef); + request_shape = cached_op_specs_.count(op) && cached_op_specs_[op].count(attr_hash); + return; + } + *spec = cached_func_specs_[op]; +} + +void NpuDevice::GetCachedTaskSpec(const tensorflow::NodeDef &ndef, const TensorShapes &shapes, + std::shared_ptr *spec) { + *spec = nullptr; + bool request_shape = false; + GetCachedTaskSpec(ndef, spec, request_shape); + if (*spec != nullptr) { return; } + if (!request_shape) { return; } + HashKey attr_hash = Hash(ndef); + HashKey shape_hash = Hash(shapes); + const auto &op = ndef.op(); + if (cached_op_specs_.count(op) && cached_op_specs_[op].count(attr_hash) + && cached_op_specs_[op][attr_hash].count(shape_hash)) { + *spec = cached_op_specs_[op][attr_hash][shape_hash]; + } +} + +std::shared_ptr +NpuDevice::CacheFuncSpec(const char *op, const tensorflow::OpRegistrationData *op_spec, const tensorflow::NodeDef &ndef, + uint64_t ge_graph_id, std::unique_ptr graph, + const npu::FuncSpec::PruneInputsFunc &prune_func, + const std::vector &dependent_host_resources, + const std::string &reason) { + auto spec = std::make_shared(op_spec, ndef, ge_graph_id, std::move(graph), prune_func, + dependent_host_resources, reason); + cached_func_specs_[op] = spec; + DLOG() << "Cache function op spec " << spec->DebugString(); + return spec; +} + +std::shared_ptr +NpuDevice::CacheOpSpec(const char *op, const tensorflow::OpRegistrationData *op_spec, const tensorflow::NodeDef &ndef, + const TensorShapes &input_shapes, const TensorPartialShapes &output_shapes, 
+                       const std::string &reason) {
+  auto spec = std::make_shared<npu::OpSpec>(op_spec, ndef, input_shapes, output_shapes, reason);
+  cached_op_specs_[op][Hash(ndef)][Hash(input_shapes)] = spec;
+  DLOG() << "Cache op spec " << spec->DebugString();
+  return spec;
+}
+
+std::shared_ptr<const npu::OpSpec>
+NpuDevice::CacheOpSpec(const char *op, const tensorflow::OpRegistrationData *op_spec, const tensorflow::NodeDef &ndef,
+                       const TensorShapes &input_shapes, const std::string &reason) {
+  auto spec = std::make_shared<npu::OpSpec>(op_spec, ndef, input_shapes, reason);
+  cached_op_specs_[op][Hash(ndef)][Hash(input_shapes)] = spec;
+  DLOG() << "Cache op spec " << spec->DebugString();
+  return spec;
+}
+
+bool NpuDevice::Supported(const std::string &op) {
+  const static std::unordered_set<std::string> kUnsupportedOps = {};
+  return kUnsupportedOps.count(op) == 0;
+}
+
+bool NpuDevice::SupportedResourceGenerator(const std::string &op) {
+  const static std::unordered_set<std::string> kSupportedOps = {"VarHandleOp"};
+  return kSupportedOps.count(op) != 0;
+}
+
+void NpuDevice::RecordIteratorMirror(const tensorflow::ResourceHandle &src, const TensorPartialShapes &shapes,
+                                     const TensorDataTypes &types) {
+  iterator_mirrors_.emplace(src, std::make_pair(shapes, types));
+}
+
+bool NpuDevice::MirroredIterator(const tensorflow::ResourceHandle &src) {
+  return iterator_mirrors_.find(src) != iterator_mirrors_.end();
+}
+
+bool NpuDevice::Mirrored(const tensorflow::ResourceHandle &src) {
+  // TODO: Other kinds of resources may need to be mirrored later; callers that check resource compatibility
+  // must always go through this interface
+  return iterator_mirrors_.find(src) != iterator_mirrors_.end();
+}
+
+tensorflow::Status NpuDevice::GetMirroredIteratorShapesAndTypes(const tensorflow::ResourceHandle &src,
+                                                                TensorPartialShapes &shapes, TensorDataTypes &types) {
+  auto iter = iterator_mirrors_.find(src);
+  if (iter == iterator_mirrors_.end()) {
+    return tensorflow::errors::Internal("Resource ", src.DebugString(), " has not been mirrored");
+  }
+  shapes.assign(iter->second.first.begin(), iter->second.first.end());
+  types.assign(iter->second.second.begin(), iter->second.second.end());
+  return tensorflow::Status::OK();
+}
diff --git a/tf_adapter_2.x/npu_device/core/npu_device.h b/tf_adapter_2.x/npu_device/core/npu_device.h
new file mode 100644
index 0000000000000000000000000000000000000000..012c2d76f174455b9d45e354609a7cc2759b5b7c
--- /dev/null
+++ b/tf_adapter_2.x/npu_device/core/npu_device.h
@@ -0,0 +1,227 @@
+/**
+* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved.
+* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#ifndef TENSORFLOW_NPU_DEVICE_H +#define TENSORFLOW_NPU_DEVICE_H + +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/tf_status.h" + +#include "framework/omg/parser/model_parser.h" +#include "framework/omg/parser/parser_factory.h" +#include "ge/ge_api.h" + +#include "npu_cache_spec.h" +#include "npu_dp.h" +#include "npu_types.h" +#include "npu_unwrap.h" +#include "npu_utils.h" + +class NpuDevice { + using HashKey = uint64_t; + + using ShapeTasks = std::map>; + using AttrTasks = std::map; + using CachedOpSpecs = std::map; + using CachedFuncSpecs = std::map>; + using DoneCallback = std::function; + + public: + static std::string CreateDevice(const char *name, int device_index, + const std::map &session_options, NpuDevice **device); + + static void DeleteDevice(void *device); + + void ReleaseResource(); + + tensorflow::Status ValidateResourcePlacement(const char *op_name, int num_inputs, TFE_TensorHandle **inputs, + bool &cpu_resource); + + tensorflow::Status ValidateInput(const char *op_name, int num_inputs, TFE_TensorHandle **inputs); + + tensorflow::Status InferShape(TFE_Context *context, const tensorflow::OpRegistrationData *op_reg_data, + const tensorflow::NodeDef &ndef, int num_inputs, TFE_TensorHandle **inputs, + TensorPartialShapes &shapes, bool &requested_input_value); + + tensorflow::Status ValidateOutput(const char *op_name, const TensorDataTypes &data_types); + + void PruneFunction(const tensorflow::FunctionDef &fdef, tensorflow::Graph *g, bool keep_signature = false); + + void FixGraphArgRetvalIndex(tensorflow::Graph *graph); + + tensorflow::Status TransResourceInput2GraphNode(TFE_Context *context, tensorflow::Graph *graph, int num_inputs, + TFE_TensorHandle **inputs, + std::vector &dependent_host_resources); + + tensorflow::Status MarkGraphNodeInOutDesc(TFE_Context *context, tensorflow::Graph *graph, int num_inputs, + TFE_TensorHandle **inputs); + + TFE_TensorHandle *NewDeviceTensorHandle(TFE_Context *context, ge::Format fmt, const tensorflow::TensorShape &shape, + tensorflow::DataType type, TF_Status *status); + + TFE_TensorHandle *NewDeviceResourceHandle(TFE_Context *context, const tensorflow::TensorShape &shape, + TF_Status *status); + + TFE_TensorHandle *CopyTensorD2H(TFE_Context *context, TFE_TensorHandle *tensor, TF_Status *status); + + TFE_TensorHandle *CopyTensorH2D(TFE_Context *context, TFE_TensorHandle *tensor, TF_Status *status); + + TFE_TensorHandle *CopyTensorH2D(TFE_Context *context, TFE_TensorHandle *tensor, ge::Format fmt, TF_Status *status); + + void GetOrCreateSpec(TFE_Context *context, const char *op_name, const TFE_OpAttrs *attributes, int num_inputs, + TFE_TensorHandle **inputs, std::shared_ptr *spec, TF_Status *s); + + void FallbackCPU(TFE_Context *context, const char *op_name, const TFE_OpAttrs *attributes, int num_inputs, + TFE_TensorHandle **inputs, int *num_outputs, TFE_TensorHandle **outputs, TF_Status *status); + + // NPU Device对外的顶层方法 + void Execute(const TFE_Op *op, int *num_outputs, TFE_TensorHandle **outputs, TF_Status *s); + + void Run(TFE_Context *context, std::shared_ptr spec, int num_inputs, TFE_TensorHandle **inputs, + int *num_outputs, TFE_TensorHandle **outputs, TF_Status *status); + + void RunOp(TFE_Context *context, const npu::OpSpec *spec, int num_inputs, TFE_TensorHandle **inputs, int *num_outputs, + TFE_TensorHandle **outputs, 
TF_Status *status); + + void RunGraph(TFE_Context *context, const npu::FuncSpec *spec, int num_inputs, TFE_TensorHandle **inputs, + int *num_outputs, TFE_TensorHandle **outputs, TF_Status *status); + + void RunGeGraphAnonymous(TFE_Context *context, const std::string &name, const tensorflow::GraphDef &gdef, + int num_inputs, TFE_TensorHandle **inputs, bool pin_to_npu, int num_outputs, + TFE_TensorHandle **outputs, TF_Status *status); + + void RunGeGraphPin2CpuAnonymous(TFE_Context *context, const std::string &name, const tensorflow::GraphDef &gdef, + int num_inputs, TFE_TensorHandle **inputs, int num_outputs, + TFE_TensorHandle **outputs, TF_Status *status); + + void RunGeGraphPin2NpuAnonymous(TFE_Context *context, const std::string &name, const tensorflow::GraphDef &gdef, + int num_inputs, TFE_TensorHandle **inputs, int num_outputs, + TFE_TensorHandle **outputs, TF_Status *status); + + uint64_t AddGeGraph(TFE_Context *context, const std::string &name, const tensorflow::GraphDef &def, + TF_Status *status); + + void RemoveGeGraph(TFE_Context *context, uint64_t graph_id, TF_Status *status); + + void RunGeGraph(TFE_Context *context, uint64_t graph_id, int num_inputs, TFE_TensorHandle **inputs, bool pin_to_npu, + const TensorDataTypes &output_types, int num_outputs, TFE_TensorHandle **outputs, TF_Status *status); + + void RunGeGraphPin2Cpu(TFE_Context *context, uint64_t graph_id, int num_inputs, TFE_TensorHandle **inputs, + const TensorDataTypes &output_types, int num_outputs, TFE_TensorHandle **outputs, + TF_Status *status); + + void RunGeGraphPin2Npu(TFE_Context *context, uint64_t graph_id, int num_inputs, TFE_TensorHandle **inputs, + const TensorDataTypes &output_types, int num_outputs, TFE_TensorHandle **outputs, + TF_Status *status); + + void RunGeGraphAsync(TFE_Context *context, uint64_t graph_id, int num_inputs, TFE_TensorHandle **inputs, + bool pin_to_npu, const TensorDataTypes &output_types, int num_outputs, + TFE_TensorHandle **outputs, DoneCallback done, TF_Status *status); + + void RunGeGraphPin2CpuAsync(TFE_Context *context, uint64_t graph_id, int num_inputs, TFE_TensorHandle **inputs, + const TensorDataTypes &output_types, int num_outputs, TFE_TensorHandle **outputs, + DoneCallback done, TF_Status *status); + + void RunGeGraphPin2NpuAsync(TFE_Context *context, uint64_t graph_id, int num_inputs, TFE_TensorHandle **inputs, + const TensorDataTypes &output_types, int num_outputs, TFE_TensorHandle **outputs, + DoneCallback done, TF_Status *status); + + void GetCachedTaskSpec(const tensorflow::NodeDef &ndef, std::shared_ptr *spec, + bool &request_shape); + + void GetCachedTaskSpec(const tensorflow::NodeDef &ndef, const TensorShapes &shapes, + std::shared_ptr *spec); + + std::shared_ptr + CacheFuncSpec(const char *op, const tensorflow::OpRegistrationData *op_spec, const tensorflow::NodeDef &ndef, + uint64_t ge_graph_id, std::unique_ptr graph, + const npu::FuncSpec::PruneInputsFunc &prune_func, + const std::vector &dependent_host_resources, const std::string &reason); + + std::shared_ptr CacheOpSpec(const char *op, const tensorflow::OpRegistrationData *op_spec, + const tensorflow::NodeDef &ndef, const TensorShapes &input_shapes, + const TensorPartialShapes &output_shapes, const std::string &reason); + + std::shared_ptr CacheOpSpec(const char *op, const tensorflow::OpRegistrationData *op_spec, + const tensorflow::NodeDef &ndef, const TensorShapes &input_shapes, + const std::string &reason); + + bool Supported(const std::string &op); + + bool SupportedResourceGenerator(const std::string 
&op); + + void RecordIteratorMirror(const tensorflow::ResourceHandle &src, const TensorPartialShapes &shapes, + const TensorDataTypes &types); + + bool MirroredIterator(const tensorflow::ResourceHandle &src); + + void CreateIteratorProvider(TFE_Context *context, const tensorflow::Tensor *tensor, std::vector device_ids, + TF_Status *status); + + tensorflow::Status ConsumeIteratorSync(const tensorflow::ResourceHandle &resource, int64_t nums); + + tensorflow::Status ConsumeIteratorAsync(const tensorflow::ResourceHandle &resource, int64_t nums, + const DoneCallback &done); + + bool Mirrored(const tensorflow::ResourceHandle &src); + + tensorflow::Status GetMirroredIteratorShapesAndTypes(const tensorflow::ResourceHandle &src, + TensorPartialShapes &shapes, TensorDataTypes &types); + + uint64_t NextUUID() { return uuid.fetch_add(1); } + + ge::Session *GeSession() { return ge_session_; } + + int device_id; + tensorflow::string device_name; + tensorflow::string underlying_device; + + private: + static HashKey Hash(const TensorDataTypes &types) { + if (types.empty()) { return 0; } + HashKey hash = tensorflow::Hash64(tensorflow::DataTypeString(types[0])); + for (size_t i = 1; i < types.size(); i++) { + hash = tensorflow::Hash64Combine(hash, tensorflow::Hash64(tensorflow::DataTypeString(types[i]))); + } + return hash; + } + static HashKey Hash(const TensorShapes &shapes) { + if (shapes.empty()) { return 0; } + HashKey hash = tensorflow::Hash64(shapes[0].DebugString()); + for (size_t i = 1; i < shapes.size(); i++) { + hash = tensorflow::Hash64Combine(hash, tensorflow::Hash64(shapes[i].DebugString())); + } + return hash; + } + static HashKey Hash(const TFE_OpAttrs *attributes) { + tensorflow::AttrValueMap attrs; + tensorflow::unwrap(attributes)->FillAttrValueMapWithoutDefaults(&attrs); + if (attrs.empty()) { return 0; } + auto iter = attrs.begin(); + HashKey hash = tensorflow::Hash64(iter->second.DebugString()); + iter++; + while (iter != attrs.end()) { + hash = tensorflow::Hash64Combine(hash, tensorflow::Hash64(iter->second.DebugString())); + iter++; + } + return hash; + } + + static HashKey Hash(const tensorflow::NodeDef &ndef) { return tensorflow::Hash64(ndef.DebugString()); } + + ge::Session *ge_session_; + std::atomic uuid{0}; + CachedOpSpecs cached_op_specs_; + CachedFuncSpecs cached_func_specs_; + std::map, ResourceCompare> + iterator_mirrors_; + std::map, ResourceCompare> iterator_providers_; +}; + +#endif // TENSORFLOW_C_EAGER_CUSTOM_DEVICE_TESTUTIL_H_ diff --git a/tf_adapter_2.x/npu_device/core/npu_device_register.cpp b/tf_adapter_2.x/npu_device/core/npu_device_register.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c7613a8975cf40fa7867f1f7bdb572f152bc6f9c --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_device_register.cpp @@ -0,0 +1,82 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. 
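The private Hash helpers above fold a per-element hash of each dtype name, shape string, or attribute value into a single key. A standalone sketch of the same folding idea follows, using std::hash and a generic combine step; HashCombine and HashDescriptors are illustrative names, and the mixing constant is the common boost-style one rather than whatever tensorflow::Hash64Combine uses internally.

```cpp
#include <cstdint>
#include <functional>
#include <iostream>
#include <string>
#include <vector>

// Stand-in for a 64-bit hash combiner: fold the new value into the seed.
uint64_t HashCombine(uint64_t seed, uint64_t value) {
  return seed ^ (value + 0x9e3779b97f4a7c15ULL + (seed << 6) + (seed >> 2));
}

// Hash a list of textual descriptors (e.g. dtype names or shape strings)
// into a single key, returning 0 for an empty list like the code above.
uint64_t HashDescriptors(const std::vector<std::string> &descs) {
  if (descs.empty()) return 0;
  std::hash<std::string> h;
  uint64_t hash = h(descs[0]);
  for (size_t i = 1; i < descs.size(); ++i) {
    hash = HashCombine(hash, h(descs[i]));
  }
  return hash;
}

int main() {
  std::cout << HashDescriptors({"DT_FLOAT", "DT_INT32"}) << "\n";
  std::cout << HashDescriptors({"[2,3]", "[3,4]"}) << "\n";
}
```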
+* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#include "tensorflow/core/platform/logging.h" + +#include "npu_device.h" +#include "npu_logger.h" +#include "npu_micros.h" +#include "npu_unwrap.h" +#include "npu_utils.h" + +namespace { + +TFE_TensorHandle *CopyTensorToNpuDevice(TFE_Context *context, TFE_TensorHandle *tensor, TF_Status *status, + void *device_info) { + auto *dev = reinterpret_cast(device_info); + tensorflow::Status tf_status; + LOG(INFO) << "[CopyTensorToNpuDevice] Copy tensor from " << tensorflow::unwrap(tensor)->DeviceName(&tf_status) + << " to " << dev->device_name; + TFE_TensorHandle *npu_handle = dev->CopyTensorH2D(context, tensor, status); + if (TF_GetCode(status) != TF_OK) return nullptr; + return npu_handle; +} + +TFE_TensorHandle *CopyTensorFromNpuDevice(TFE_Context *context, TFE_TensorHandle *tensor, + const char *target_device_name, TF_Status *status, void *device_info) { + auto *dev = reinterpret_cast(device_info); + DLOG() << "[CopyTensorFromNpuDevice] Copy tensor from " << dev->device_name << " to " << target_device_name; + // 输入的TensorHandle是NPU的,应当先进行NPU->CPU的传输,再调用TFE_TensorHandleCopyToDevice防止可能的NPU->GPU传输 + // 一旦Copy动作发生,需要进行stream同步。如果是NPU->NPU的拷贝(理论上不应该发生),可以不同步。 + TFE_TensorHandle *local_tensor = dev->CopyTensorD2H(context, tensor, status); + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_TensorHandle *target_tensor = TFE_TensorHandleCopyToDevice(local_tensor, context, target_device_name, status); + if (TF_GetCode(status) != TF_OK) return nullptr; + + TFE_DeleteTensorHandle(local_tensor); + return target_tensor; +} + +void NpuDeviceExecute(const TFE_Op *op, int *num_outputs, TFE_TensorHandle **outputs, TF_Status *s, void *device_info) { + auto *dev = reinterpret_cast(device_info); + dev->Execute(op, num_outputs, outputs, s); +} + +void DeleteNpuDevice(void *device_info) { NpuDevice::DeleteDevice(device_info); } + +void RegisterNpuDevice(TFE_Context *context, const char *name, void *device_info, TF_Status *status) { + TFE_CustomDevice custom_device; + custom_device.copy_tensor_to_device = &CopyTensorToNpuDevice; + custom_device.copy_tensor_from_device = &CopyTensorFromNpuDevice; + custom_device.delete_device = &DeleteNpuDevice; + custom_device.execute = &NpuDeviceExecute; + TFE_RegisterCustomDevice(context, custom_device, name, device_info, status); +} + +std::vector devices_instances; +} // namespace + +std::string CreateDevice(TFE_Context *context, const char *name, int device_index, + const std::map &session_options) { + const static std::string kSucceed; + + NpuDevice *device = nullptr; + auto create_status = NpuDevice::CreateDevice(name, device_index, session_options, &device); + if (create_status != kSucceed) { return create_status; } + devices_instances.push_back(device); + + std::unique_ptr status(TF_NewStatus(), TF_DeleteStatus); + RegisterNpuDevice(context, name, device, status.get()); + if (TF_GetCode(status.get()) != TF_OK) { + return std::string("Register Npu device ") + name + " failed:" + TF_Message(status.get()); + } + LOG(INFO) << "Npu device instance " << name << " created"; + + return kSucceed; +} + +void ReleaseDeviceResource() { + for (auto device : devices_instances) { device->ReleaseResource(); } +} \ No newline at end of file diff --git a/tf_adapter_2.x/npu_device/core/npu_device_register.h b/tf_adapter_2.x/npu_device/core/npu_device_register.h new file mode 100644 index 0000000000000000000000000000000000000000..0b36beb1b2c31081f1d2f3ace6d082afca325ada --- /dev/null +++ 
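RegisterNpuDevice above fills a table of C callbacks (copy tensor in, copy tensor out, execute, delete) plus an opaque device_info pointer and hands it to the eager runtime, which then calls back with that pointer. The sketch below shows the same callback-table pattern in isolation; CustomDeviceCallbacks, RegisterCustomDevice, and ToyDevice are made-up stand-ins and not the TFE_CustomDevice API itself.

```cpp
#include <iostream>
#include <string>
#include <vector>

// A table of callbacks plus an opaque per-device state pointer that is
// passed back into every callback (the custom-device registration idea).
struct CustomDeviceCallbacks {
  void (*execute)(const char *op, void *device_info);
  void (*delete_device)(void *device_info);
};

struct Registration {
  std::string name;
  CustomDeviceCallbacks callbacks;
  void *device_info;
};

std::vector<Registration> g_registry;

void RegisterCustomDevice(const std::string &name, CustomDeviceCallbacks callbacks, void *device_info) {
  g_registry.push_back({name, callbacks, device_info});
}

// A toy device implementation behind the callbacks.
struct ToyDevice {
  std::string name;
};

void ToyExecute(const char *op, void *device_info) {
  auto *dev = static_cast<ToyDevice *>(device_info);
  std::cout << dev->name << " executes " << op << "\n";
}

void ToyDelete(void *device_info) { delete static_cast<ToyDevice *>(device_info); }

int main() {
  auto *dev = new ToyDevice{"/job:localhost/replica:0/task:0/device:TOY:0"};
  RegisterCustomDevice(dev->name, {&ToyExecute, &ToyDelete}, dev);
  for (auto &reg : g_registry) { reg.callbacks.execute("MatMul", reg.device_info); }
  for (auto &reg : g_registry) { reg.callbacks.delete_device(reg.device_info); }
}
```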
b/tf_adapter_2.x/npu_device/core/npu_device_register.h @@ -0,0 +1,18 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#ifndef TENSORFLOW_NPU_DEVICE_REGISTER_H_ +#define TENSORFLOW_NPU_DEVICE_REGISTER_H_ + +#include "tensorflow/c/eager/c_api.h" +#include +#include + +std::string CreateDevice(TFE_Context *context, const char *device_name, int device_index, + const std::map &session_options); + +void ReleaseDeviceResource(); + +#endif // TENSORFLOW_C_EAGER_NPU_DEVICE_TESTUTIL_H_ diff --git a/tf_adapter_2.x/npu_device/core/npu_dp.h b/tf_adapter_2.x/npu_device/core/npu_dp.h new file mode 100644 index 0000000000000000000000000000000000000000..a0a1d92a34defc8e33f77b27865058a84073e1d1 --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_dp.h @@ -0,0 +1,126 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#ifndef TENSORFLOW_NPU_DP_H +#define TENSORFLOW_NPU_DP_H + +#include "tensorflow/c/c_api.h" +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/platform/logging.h" + +#include "absl/algorithm/container.h" +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/tfe_context_internal.h" +#include "tensorflow/c/eager/tfe_op_internal.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" +#include "tensorflow/core/framework/graph_to_functiondef.h" + +#include "npu_types.h" + +class IteratorResourceProvider { + using ConsumeFunc = std::function; + using DestroyFunc = std::function; + using DoneCallback = std::function; + + public: + tensorflow::Status Consume(int64_t nums, const DoneCallback &done) { + { + if (stopped_) { return tensorflow::errors::Internal("Iterator resource provider ", name_, " has stopped"); } + std::unique_lock lk(mu_); + if (request_stop_) { return tensorflow::errors::Internal("Iterator resource provider ", name_, " is stopping"); } + requests_.emplace(nums, done); + } + cv_.notify_one(); + return tensorflow::Status::OK(); + } + tensorflow::Status Destroy() { + { + std::unique_lock lk(mu_); + request_stop_ = true; + } + cv_.notify_one(); + while (!stopped_) {} + return destroy_func_(); + } + + IteratorResourceProvider(std::string name, ConsumeFunc cf, DestroyFunc df) + : name_(std::move(name)), consume_func_(std::move(cf)), destroy_func_(std::move(df)), request_stop_(false), + stopped_(false) { + worker_.reset( + tensorflow::Env::Default()->StartThread(tensorflow::ThreadOptions{}, name_ + "_hdc_provider", [this]() { + while (true) { + std::unique_lock lk(mu_); + cv_.wait(lk, [this]() -> bool { return !requests_.empty() || request_stop_; }); + if (request_stop_) { + stopped_.store(true); + return; + } + auto task = requests_.front(); + requests_.pop(); + lk.unlock(); + int64_t nums = task.first; + auto done = task.second; + tensorflow::Status status = tensorflow::Status::OK(); + while (nums-- > 0 && status.ok()) { status = consume_func_(); } + done(status); + } + })); + } + ~IteratorResourceProvider() { + { + std::unique_lock lk(mu_); + stopped_ = true; + } + cv_.notify_one(); + } + static tensorflow::FunctionDef GetFunctionDef(std::string channel_name, 
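IteratorResourceProvider above queues "consume N items" requests and drains them on a dedicated worker thread guarded by a mutex and condition variable. Below is a simplified standalone sketch of that producer-consumer loop; ConsumeWorker is a hypothetical class that joins the worker on Stop instead of busy-waiting, and it omits the provider's destroy callback and error statuses.

```cpp
#include <chrono>
#include <condition_variable>
#include <cstdint>
#include <functional>
#include <iostream>
#include <mutex>
#include <queue>
#include <thread>
#include <utility>

class ConsumeWorker {
 public:
  ConsumeWorker() : worker_([this]() { Loop(); }) {}
  ~ConsumeWorker() { Stop(); }

  // Enqueue a request and wake the worker.
  void Consume(int64_t nums, std::function<void(int64_t)> done) {
    {
      std::unique_lock<std::mutex> lk(mu_);
      requests_.emplace(nums, std::move(done));
    }
    cv_.notify_one();
  }

  // Signal the worker to exit and wait for it.
  void Stop() {
    {
      std::unique_lock<std::mutex> lk(mu_);
      if (stop_) return;
      stop_ = true;
    }
    cv_.notify_one();
    worker_.join();
  }

 private:
  void Loop() {
    while (true) {
      std::unique_lock<std::mutex> lk(mu_);
      cv_.wait(lk, [this]() { return stop_ || !requests_.empty(); });
      if (stop_) return;  // pending requests are dropped on stop
      auto task = std::move(requests_.front());
      requests_.pop();
      lk.unlock();
      int64_t consumed = 0;
      for (int64_t i = 0; i < task.first; ++i) { ++consumed; }  // stand-in for consume_func_()
      task.second(consumed);
    }
  }

  std::mutex mu_;
  std::condition_variable cv_;
  std::queue<std::pair<int64_t, std::function<void(int64_t)>>> requests_;
  bool stop_{false};
  std::thread worker_;
};

int main() {
  ConsumeWorker worker;
  worker.Consume(3, [](int64_t n) { std::cout << "consumed " << n << " batches\n"; });
  std::this_thread::sleep_for(std::chrono::milliseconds(50));
}
```

In the real provider the per-item work is the injected consume function and completion is reported through a Status; the sketch only keeps the queue-plus-worker shape.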
std::vector device_ids, + const TensorPartialShapes &shapes, const TensorDataTypes &types, + TF_Status *status) { + tensorflow::FunctionDef fdef; + std::unique_ptr graph = std::make_unique(tensorflow::OpRegistry::Global()); + + tensorflow::Node *arg_iterator = nullptr; + tensorflow::Node *iterator_h2d = nullptr; + + NPU_CTX_REQUIRES_OK_RETURN(status, + tensorflow::NodeBuilder("arg_iterator", "_Arg") + .Attr("index", 0) + .Attr("T", tensorflow::DT_RESOURCE) + .Finalize(graph.get(), &arg_iterator), + fdef); + + NPU_CTX_REQUIRES_OK_RETURN(status, + tensorflow::NodeBuilder("iterator_h2d", "IteratorH2D") + .Input(arg_iterator, 0) + .Attr("device_ids", device_ids) + .Attr("channel_name", channel_name) + .Finalize(graph.get(), &iterator_h2d), + fdef); + + NPU_CTX_REQUIRES_OK_RETURN(status, tensorflow::GraphToFunctionDef(*graph, "dp_provider_" + channel_name, &fdef), + fdef); + return fdef; + } + + private: + std::string name_; + ConsumeFunc consume_func_; + DestroyFunc destroy_func_; + bool request_stop_; + std::atomic_bool stopped_{false}; + std::mutex mu_; + std::condition_variable cv_; + std::queue> requests_; + std::unique_ptr worker_; +}; + +#endif //TENSORFLOW_NPU_DP_H diff --git a/tf_adapter_2.x/npu_device/core/npu_env.h b/tf_adapter_2.x/npu_device/core/npu_env.h new file mode 100644 index 0000000000000000000000000000000000000000..af976b88670ff4c55289793221f2212358f82e38 --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_env.h @@ -0,0 +1,47 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#ifndef TENSORFLOW_NPU_ENV_H +#define TENSORFLOW_NPU_ENV_H + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/util/env_var.h" + +const static bool kDumpExecutionDetail = []() -> bool { + bool dump_execute_detail = false; + tensorflow::ReadBoolFromEnvVar("NPU_DEBUG", false, &dump_execute_detail); + return dump_execute_detail; +}(); + +const static bool kDumpGraph = []() -> bool { + bool dump_graph = false; + tensorflow::ReadBoolFromEnvVar("NPU_DUMP_GRAPH", false, &dump_graph); + return dump_graph; +}(); + +const static bool kCustomKernelEnabled = []() -> bool { + bool use_custom_kernel = true; + tensorflow::ReadBoolFromEnvVar("NPU_ENABLE_CUSTOM_KERNEL", true, &use_custom_kernel); + return use_custom_kernel; +}(); + +const static int64_t kGlobalLoopSize = []() -> int64_t { + tensorflow::int64 loop_size = 1; + tensorflow::ReadInt64FromEnvVar("NPU_LOOP_SIZE", 1, &loop_size); + return loop_size; +}(); + +const static bool kPerfEnabled = []() -> bool { + bool perf_enabled = false; + tensorflow::ReadBoolFromEnvVar("NPU_ENABLE_PERF", false, &perf_enabled); + return perf_enabled; +}(); + +#endif //TENSORFLOW_NPU_ENV_H diff --git a/tf_adapter_2.x/npu_device/core/npu_hdc.cpp b/tf_adapter_2.x/npu_device/core/npu_hdc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..da8203ce95367d1a6225320fc7a18d834eb89702 --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_hdc.cpp @@ -0,0 +1,268 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. 
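npu_env.h reads its feature switches once from environment variables into load-time constants, so every later check is a plain bool test. A minimal standalone sketch of the same pattern using std::getenv instead of tensorflow::ReadBoolFromEnvVar; ReadBoolFlag is a hypothetical helper, and only the variable names NPU_DEBUG and NPU_DUMP_GRAPH are taken from the code above.

```cpp
#include <cstdlib>
#include <iostream>
#include <string>

// Parse a boolean flag from the environment once, falling back to a default
// when the variable is unset.
bool ReadBoolFlag(const char *name, bool default_value) {
  const char *raw = std::getenv(name);
  if (raw == nullptr) return default_value;
  std::string value(raw);
  return value == "1" || value == "true" || value == "TRUE";
}

// Evaluated once at static-initialization time, like the constants above.
static const bool kDumpExecutionDetail = ReadBoolFlag("NPU_DEBUG", false);
static const bool kDumpGraph = ReadBoolFlag("NPU_DUMP_GRAPH", false);

int main() {
  std::cout << "NPU_DEBUG=" << kDumpExecutionDetail << " NPU_DUMP_GRAPH=" << kDumpGraph << "\n";
}
```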
+* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#include "npu_hdc.h" +#include "npu_micros.h" + + +tensorflow::Status MappingTfDtypeToAcl(tensorflow::DataType tf_type, aclDataType &acl_type); + +tensorflow::Status MappingAclDtypeToTf(const aclDataType &acl_type, tensorflow::DataType &tf_type); + +tensorflow::Status AssembleAclTensor2Tensor(acltdtDataItem *item, std::vector &tensors, + bool call_by_channel_receive); + +tensorflow::Status AssembleAclDataset2Tensors(acltdtDataset *acl_dataset, std::vector &out_tensors, + bool call_by_channel_receive); + +tensorflow::Status AssembleTensors2AclDataset(acltdtTensorType acl_type, const std::vector &tensors, + acltdtDataset **acl_dataset); + +tensorflow::Status AssembleTensors2AclDataset(acltdtTensorType acl_type, const std::vector &tensors, + acltdtDataset *acl_dataset); + +tensorflow::Status DestroyAclDataset(acltdtDataset *acl_dataset, bool include_data_item = true); + +tensorflow::Status RecvTensorByAcl(acltdtChannelHandle *acl_handle, std::vector &tensors); + +tensorflow::Status SendTensorsByAcl(acltdtChannelHandle *acl_handle, acltdtTensorType acl_type, + const std::vector &tensors); + +tensorflow::Status MappingAclDtypeToTf(const aclDataType &acl_type, tensorflow::DataType &tf_type) { + const static std::map type_mapping = { + {ACL_FLOAT, tensorflow::DT_FLOAT}, {ACL_FLOAT16, tensorflow::DT_HALF}, {ACL_INT8, tensorflow::DT_INT8}, + {ACL_INT32, tensorflow::DT_INT32}, {ACL_UINT8, tensorflow::DT_UINT8}, {ACL_INT16, tensorflow::DT_INT16}, + {ACL_UINT16, tensorflow::DT_UINT16}, {ACL_UINT32, tensorflow::DT_UINT32}, {ACL_INT64, tensorflow::DT_INT64}, + {ACL_UINT64, tensorflow::DT_UINT64}, {ACL_DOUBLE, tensorflow::DT_DOUBLE}, {ACL_BOOL, tensorflow::DT_BOOL}, + {ACL_STRING, tensorflow::DT_STRING}}; + auto found = type_mapping.find(acl_type); + if (found == type_mapping.end()) { + return tensorflow::errors::Internal("Hdc channel receive unsupported data type", acl_type); + } + tf_type = found->second; + return tensorflow::Status::OK(); +} + +tensorflow::Status AssembleAclTensor2Tensor(acltdtDataItem *item, std::vector &tensors, + bool call_by_channel_receive) { + acltdtTensorType acl_type = acltdtGetTensorTypeFromItem(item); + if (acl_type == ACL_TENSOR_DATA_END_OF_SEQUENCE) { + LOG(INFO) << "Hdc channel received end-of-sequence for out-feed op."; + return tensorflow::Status::OK(); + } else if (acl_type == ACL_TENSOR_DATA_ABNORMAL) { + LOG(INFO) << "Hdc channel received abnormal for out-feed op."; + return tensorflow::Status::OK(); + } else if (acl_type == ACL_TENSOR_DATA_UNDEFINED) { + LOG(INFO) << "Hdc channel received undefined message type for out-feed op."; + return tensorflow::errors::Internal("Hdc channel received undefined message type for out-feed op."); + } + tensorflow::DataType tf_type; + TF_RETURN_IF_ERROR(MappingAclDtypeToTf(acltdtGetDataTypeFromItem(item), tf_type)); + size_t dim_num = acltdtGetDimNumFromItem(item); + size_t acl_data_len = acltdtGetDataSizeFromItem(item); + char *acl_data = reinterpret_cast(acltdtGetDataAddrFromItem(item)); + if (call_by_channel_receive) { acl_data = const_cast(reinterpret_cast(acl_data)->c_str()); } + if (tf_type == tensorflow::DT_STRING) { + if (dim_num != 0) { return tensorflow::errors::Internal("Hdc channel receive unsupported non-scalar string type"); } + tensorflow::Tensor tensor(tf_type, tensorflow::TensorShape({})); + tensor.scalar()() = std::string(acl_data, acl_data_len); + tensors.emplace_back(std::move(tensor)); + } else if 
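MappingAclDtypeToTf above resolves dtypes through a static lookup table and reports an error for anything it does not recognize instead of guessing a default. The sketch below repeats that table-plus-error pattern with made-up enums; AclType, TfType, and MapAclToTf are illustrative stand-ins, not the real ACL or TensorFlow enums.

```cpp
#include <iostream>
#include <map>

// Simplified stand-ins for the two dtype enums being bridged.
enum class AclType { FLOAT, FLOAT16, INT32, INT64, BOOL, UNKNOWN };
enum class TfType { DT_FLOAT, DT_HALF, DT_INT32, DT_INT64, DT_BOOL };

// One static table per direction; unsupported types surface as a failure.
bool MapAclToTf(AclType acl_type, TfType &tf_type) {
  static const std::map<AclType, TfType> kMapping = {
      {AclType::FLOAT, TfType::DT_FLOAT}, {AclType::FLOAT16, TfType::DT_HALF},
      {AclType::INT32, TfType::DT_INT32}, {AclType::INT64, TfType::DT_INT64},
      {AclType::BOOL, TfType::DT_BOOL}};
  auto found = kMapping.find(acl_type);
  if (found == kMapping.end()) return false;
  tf_type = found->second;
  return true;
}

int main() {
  TfType tf_type;
  std::cout << "mapped: " << MapAclToTf(AclType::INT32, tf_type) << "\n";    // 1
  std::cout << "mapped: " << MapAclToTf(AclType::UNKNOWN, tf_type) << "\n";  // 0
}
```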
(DataTypeCanUseMemcpy(tf_type)) { + std::vector dims; + dims.resize(dim_num); + if (acltdtGetDimsFromItem(item, dims.data(), dim_num) != ACL_ERROR_NONE) { + return tensorflow::errors::Internal("Failed get dim-size from hdc channel data"); + } + tensorflow::TensorShape tf_shape; + for (auto dim : dims) { tf_shape.AddDim(dim); } + tensorflow::Tensor tensor = tensorflow::Tensor(tf_type, tf_shape); + auto tensor_data = const_cast(tensor.tensor_data().data()); + auto tensor_size = tensor.tensor_data().size(); + if (tensor_size != acl_data_len) { + return tensorflow::errors::Internal("Hdc channel receive size mismatch tensor size acl:", acl_data_len, + " vs. tensorflow:", tensor_size); + } + memcpy(tensor_data, acl_data, tensor_size); + tensors.emplace_back(std::move(tensor)); + } else { + return tensorflow::errors::InvalidArgument("Hdc channel receive un-copyable tensorflow data type", + DataTypeString(tf_type)); + } + return tensorflow::Status::OK(); +} + +tensorflow::Status AssembleAclDataset2Tensors(acltdtDataset *acl_dataset, std::vector &out_tensors, + bool call_by_channel_receive) { + for (size_t i = 0; i < acltdtGetDatasetSize(acl_dataset); i++) { + auto acl_data = acltdtGetDataItem(acl_dataset, i); + if (acl_data == nullptr) { + return tensorflow::errors::Internal("Acl get tensor data from dataset failed when receive tensor data."); + } + TF_RETURN_IF_ERROR(AssembleAclTensor2Tensor(acl_data, out_tensors, call_by_channel_receive)); + } + return tensorflow::Status::OK(); +} + +tensorflow::Status DestroyAclDataset(acltdtDataset *acl_dataset, bool include_data_item) { + if (include_data_item) { + for (size_t i = 0; i < acltdtGetDatasetSize(acl_dataset); i++) { + if (acltdtDestroyDataItem(acltdtGetDataItem(acl_dataset, i)) != ACL_ERROR_NONE) { + return tensorflow::errors::Internal("Acl destroy tensor data failed."); + } + } + } + if (acltdtDestroyDataset(acl_dataset) != ACL_ERROR_NONE) { + return tensorflow::errors::Internal("Acl destroy tensor dataset failed."); + } + return tensorflow::Status::OK(); +} + +tensorflow::Status RecvTensorByAcl(acltdtChannelHandle *acl_handle, std::vector &tensors) { + auto acl_dataset = acltdtCreateDataset(); + if (acl_dataset == nullptr) { return tensorflow::errors::Internal("Failed create hdc channel."); } + auto acl_status = acltdtReceiveTensor(acl_handle, acl_dataset, -1 /* no timeout */); + + if (acl_status != ACL_ERROR_NONE) { + NPU_LOG_IF_ERROR(DestroyAclDataset(acl_dataset, false)); + return tensorflow::errors::Internal("Failed receive data from hdc channel, acl status:", acl_status); + } + + auto status = AssembleAclDataset2Tensors(acl_dataset, tensors, true /* call by channel receive */); + if (!status.ok()) { + NPU_LOG_IF_ERROR(DestroyAclDataset(acl_dataset, false)); + return status; + } + TF_RETURN_IF_ERROR(DestroyAclDataset(acl_dataset, false)); + return tensorflow::Status::OK(); +} + +tensorflow::Status MappingTfDtypeToAcl(const tensorflow::DataType tf_type, aclDataType &acl_type) { + const static std::map type_mapping = { + {tensorflow::DT_FLOAT, ACL_FLOAT}, {tensorflow::DT_HALF, ACL_FLOAT16}, {tensorflow::DT_INT8, ACL_INT8}, + {tensorflow::DT_INT32, ACL_INT32}, {tensorflow::DT_UINT8, ACL_UINT8}, {tensorflow::DT_INT16, ACL_INT16}, + {tensorflow::DT_UINT16, ACL_UINT16}, {tensorflow::DT_UINT32, ACL_UINT32}, {tensorflow::DT_INT64, ACL_INT64}, + {tensorflow::DT_UINT64, ACL_UINT64}, {tensorflow::DT_DOUBLE, ACL_DOUBLE}, {tensorflow::DT_BOOL, ACL_BOOL}, + {tensorflow::DT_STRING, ACL_STRING}}; + auto found = type_mapping.find(tf_type); + if (found == 
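The receive path above only memcpy's into the destination tensor after confirming that the byte count reported by the channel matches the size implied by the received dims and dtype. A standalone sketch of that guarded copy follows; CopyChecked is a hypothetical helper operating on a flat char buffer rather than a tensorflow::Tensor.

```cpp
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Copy a received flat buffer into a typed destination only when the byte
// counts agree, mirroring the size check before memcpy in the receive path.
bool CopyChecked(const std::vector<int64_t> &dims, size_t element_size,
                 const void *src, size_t src_len, std::vector<char> &dst) {
  size_t expected = element_size;
  for (int64_t d : dims) { expected *= static_cast<size_t>(d); }
  if (expected != src_len) {
    std::cerr << "size mismatch: expected " << expected << " got " << src_len << "\n";
    return false;
  }
  dst.resize(expected);
  std::memcpy(dst.data(), src, expected);
  return true;
}

int main() {
  float payload[6] = {0, 1, 2, 3, 4, 5};
  std::vector<char> dst;
  std::cout << CopyChecked({2, 3}, sizeof(float), payload, sizeof(payload), dst) << "\n";  // 1
  std::cout << CopyChecked({2, 4}, sizeof(float), payload, sizeof(payload), dst) << "\n";  // 0
}
```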
type_mapping.end()) { + return tensorflow::errors::Internal("Unsupported tensorflow data type ", DataTypeString(tf_type), " by acl."); + } + acl_type = found->second; + return tensorflow::Status::OK(); +} + +tensorflow::Status AssembleTensors2AclDataset(acltdtTensorType acl_type, const std::vector &tensors, + acltdtDataset *acl_dataset) { + if (TF_PREDICT_FALSE(acl_type != ACL_TENSOR_DATA_TENSOR)) { + acltdtDataItem *acl_data = acltdtCreateDataItem(acl_type, nullptr, 0, ACL_BOOL /* whatever */, nullptr, 0); + if (acl_data == nullptr) { + return tensorflow::errors::Internal("Acl create tensor item failed when send end-of-sequence."); + } + if (acltdtAddDataItem(acl_dataset, acl_data) != ACL_ERROR_NONE) { + if (acltdtDestroyDataItem(acl_data) != ACL_ERROR_NONE) { + LOG(ERROR) << "Acl destroy tensor data item failed when send data with type " + << (acl_type == ACL_TENSOR_DATA_END_OF_SEQUENCE ? "ACL_TENSOR_DATA_END_OF_SEQUENCE" + : "ACL_TENSOR_DATA_ABNORMAL"); + } + return tensorflow::errors::Internal("Acl add tensor data to dataset failed when send data with type ", acl_type); + } + return tensorflow::Status::OK(); + } + for (auto &tensor : tensors) { + aclDataType acl_data_type; + TF_RETURN_IF_ERROR(MappingTfDtypeToAcl(tensor.dtype(), acl_data_type)); + acltdtDataItem *acl_data = nullptr; + if (DataTypeCanUseMemcpy(tensor.dtype())) { + auto dims = tensor.shape().dim_sizes(); + acl_data = acltdtCreateDataItem( + ACL_TENSOR_DATA_TENSOR, (dims.empty() ? nullptr : reinterpret_cast(dims.data())), + dims.size(), acl_data_type, const_cast(tensor.tensor_data().data()), tensor.tensor_data().size()); + } else if (tensor.dtype() == tensorflow::DT_STRING) { + if (tensor.dims() != 0) { + return tensorflow::errors::Internal("Acl send got unexpected non-scalar string tensor with dim ", + tensor.dims()); + } + auto value = reinterpret_cast(const_cast(tensor.tensor_data().data())); + // for scalar type, *dims is nullptr and dim_num is 0 + acl_data = acltdtCreateDataItem(ACL_TENSOR_DATA_TENSOR, nullptr, 0, acl_data_type, + const_cast(value->c_str()), value->size()); + } else { + return tensorflow::errors::Internal("Acl send got unexpected data type ", DataTypeString(tensor.dtype())); + } + if (acl_data == nullptr) { + return tensorflow::errors::Internal("Acl create tensor item failed when send tensor data ", tensor.DebugString()); + } + if (acltdtAddDataItem(acl_dataset, acl_data) != ACL_ERROR_NONE) { + if (acltdtDestroyDataItem(acl_data) != ACL_ERROR_NONE) { + LOG(ERROR) << "Acl destroy tensor data item failed when send data with type ACL_TENSOR_DATA_TENSOR"; + } + return tensorflow::errors::Internal("Acl add tensor data to dataset failed when send tensor data."); + } + } + return tensorflow::Status::OK(); +} + +tensorflow::Status AssembleTensors2AclDataset(acltdtTensorType acl_type, const std::vector &tensors, + acltdtDataset **output_acl_dataset) { + auto acl_dataset = acltdtCreateDataset(); + if (acl_dataset == nullptr) { return tensorflow::errors::Internal("Acl create tensor dataset failed"); } + auto status = AssembleTensors2AclDataset(acl_type, tensors, acl_dataset); + if (!status.ok()) { + NPU_LOG_IF_ERROR(DestroyAclDataset(acl_dataset)); + return status; + } + *output_acl_dataset = acl_dataset; + return tensorflow::Status::OK(); +} + +tensorflow::Status SendTensorsByAcl(acltdtChannelHandle *acl_handle, acltdtTensorType acl_type, + const std::vector &tensors) { + acltdtDataset *acl_dataset = nullptr; + + TF_RETURN_IF_ERROR(AssembleTensors2AclDataset(acl_type, tensors, &acl_dataset)); + + auto 
acl_status = acltdtSendTensor(acl_handle, acl_dataset, -1 /*no timeout*/); + + TF_RETURN_IF_ERROR(DestroyAclDataset(acl_dataset)); + if (acl_status != ACL_ERROR_NONE) { + return tensorflow::errors::Internal("Acl send data failed, acl status:", acl_status); + } + + return tensorflow::Status::OK(); +} + +tensorflow::Status HdcChannel::Create(uint32_t device_id, const std::string& name, + std::shared_ptr *guarded_channel) { + auto channel = new (std::nothrow) HdcChannel(device_id, name); + NPU_REQUIRES(channel, + tensorflow::errors::Internal("Failed allocate memory for hdc channel ", name, " on device ", device_id)); + NPU_REQUIRES_OK(channel->Init()); + guarded_channel->reset(channel); + return tensorflow::Status::OK(); +} + +HdcChannel::~HdcChannel() { + if (acltdtDestroyChannel(handle_) != ACL_ERROR_NONE) { + LOG(ERROR) << "Failed close hdc channel " << name_; + } else { + LOG(INFO) << "Hdc channel " << name_ << " closed"; + } +} + +tensorflow::Status HdcChannel::SendTensors(const std::vector &tensors) { + return SendTensorsByAcl(handle_, ACL_TENSOR_DATA_TENSOR, tensors); +} + +tensorflow::Status HdcChannel::NotifyFinish() { return SendTensorsByAcl(handle_, ACL_TENSOR_DATA_END_OF_SEQUENCE, {}); } + +tensorflow::Status HdcChannel::NotifyAbnormal() { return SendTensorsByAcl(handle_, ACL_TENSOR_DATA_ABNORMAL, {}); } + +HdcChannel::HdcChannel(uint32_t device_id, std::string name) + : handle_(nullptr), device_id_(device_id), name_(std::move(name)) {} +tensorflow::Status HdcChannel::Init() { + handle_ = acltdtCreateChannel(device_id_, name_.c_str()); + if (handle_ == nullptr) { return tensorflow::errors::Internal("Failed create hdc channel by acl"); } + return tensorflow::Status::OK(); +} \ No newline at end of file diff --git a/tf_adapter_2.x/npu_device/core/npu_hdc.h b/tf_adapter_2.x/npu_device/core/npu_hdc.h new file mode 100644 index 0000000000000000000000000000000000000000..aad1e1418c8c40da1abb21128b5f6900a0355a52 --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_hdc.h @@ -0,0 +1,36 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#ifndef TENSORFLOW_NPU_HDC_H +#define TENSORFLOW_NPU_HDC_H + +#include + +#include "acl/acl_tdt.h" +#include "tensorflow/core/framework/tensor.h" + +#include "npu_micros.h" + +class HdcChannel { + public: + static tensorflow::Status Create(uint32_t device_id, const std::string& name, std::shared_ptr *guarded_channel); + + ~HdcChannel(); + + tensorflow::Status SendTensors(const std::vector &tensors); + + tensorflow::Status NotifyFinish(); + + tensorflow::Status NotifyAbnormal(); + + private: + HdcChannel(uint32_t device_id, std::string name); + tensorflow::Status Init(); + acltdtChannelHandle *handle_; + int32_t device_id_; + std::string name_; +}; + +#endif //TENSORFLOW_NPU_HDC_H diff --git a/tf_adapter_2.x/npu_device/core/npu_logger.cpp b/tf_adapter_2.x/npu_device/core/npu_logger.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5236e58dd000165006da4cd3089f8b0842abd6db --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_logger.cpp @@ -0,0 +1,133 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. 
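HdcChannel uses two-phase construction: a private constructor that cannot fail, an Init step that can, and a Create factory that only hands out a fully initialized object whose destructor closes the channel. A minimal standalone sketch of the same shape; Channel here is hypothetical and simulates Init instead of calling acltdtCreateChannel.

```cpp
#include <cstdint>
#include <iostream>
#include <memory>
#include <string>

class Channel {
 public:
  // Only a successfully initialized channel ever escapes this factory.
  static bool Create(uint32_t device_id, const std::string &name, std::shared_ptr<Channel> *out) {
    auto channel = std::shared_ptr<Channel>(new Channel(device_id, name));
    if (!channel->Init()) {
      std::cerr << "Failed to open channel " << name << "\n";
      return false;
    }
    *out = std::move(channel);
    return true;
  }
  ~Channel() { std::cout << "channel " << name_ << " closed\n"; }

 private:
  Channel(uint32_t device_id, std::string name) : device_id_(device_id), name_(std::move(name)) {}
  bool Init() { return !name_.empty(); }  // stand-in for the fallible open call

  uint32_t device_id_;
  std::string name_;
};

int main() {
  std::shared_ptr<Channel> channel;
  if (Channel::Create(0, "train_queue", &channel)) {
    std::cout << "channel ready\n";
  }
}  // the destructor closes the channel when the last shared_ptr goes away
```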
+* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#include "npu_logger.h" +#include "tensorflow/c/eager/c_api.h" + +#include +#include +#include +#include +#include + +#include "tensorflow/c/eager/abstract_tensor_handle.h" + +// clang-format off +#include "tensorflow/core/platform/platform.h" +// clang-format on + +#include "absl/algorithm/container.h" +#include "absl/memory/memory.h" +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/eager/tfe_context_internal.h" +#include "tensorflow/c/eager/tfe_op_internal.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" +#include "tensorflow/c/tf_tensor_internal.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/eager/context.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/device_filters.pb.h" +#include "tensorflow/core/protobuf/error_codes.pb.h" +#include "tensorflow/core/util/device_name_utils.h" +#include "tensorflow/core/common_runtime/copy_tensor.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/device_set.h" +#include "tensorflow/core/common_runtime/eager/attr_builder.h" +#include "tensorflow/core/common_runtime/eager/execute.h" +#include "tensorflow/core/common_runtime/eager/shape_inference.h" +#include "tensorflow/core/common_runtime/eager/tensor_handle.h" +#include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/common_runtime/rendezvous_mgr.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/rendezvous.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/lib/gtl/map_util.h" +#include "tensorflow/core/platform/blocking_counter.h" +#include "tensorflow/core/platform/casts.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/notification.h" +#include "tensorflow/core/platform/random.h" +#include "tensorflow/core/platform/refcount.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/profiler/lib/traceme.h" +#include "tensorflow/core/public/version.h" +#include "tensorflow/core/util/device_name_utils.h" +#include "tensorflow/core/util/env_var.h" +#include "tensorflow/core/graph/algorithm.h" +#include "tensorflow/core/framework/graph_to_functiondef.h" + +#include "npu_micros.h" +#include "npu_managed_buffer.h" +#include "npu_unwrap.h" +#include "npu_logger.h" +#include "npu_device.h" +#include "npu_utils.h" + +namespace npu { +class ProfManager { + public: + static void RecordOp(const std::string &op, std::string detail, bool is_stateful, bool is_unknown) { + Instance().RecordOpInner(op, detail, is_stateful, is_unknown); + } + + private: + static ProfManager &Instance() { + static 
ProfManager prof; + return prof; + } + void RecordOpInner(const std::string &op, std::string detail, bool is_stateful, bool is_unknown) { + std::lock_guard lk(mu_); + op_records_[op]++; + if (is_unknown) { unknown_shape_op_records_[op]++; } + if (is_stateful) { stateful_shape_op_records_[op]++; } + op_shape_records_[op].insert(detail); + } + ~ProfManager() { + std::lock_guard lk(mu_); + LOG(INFO) << "All nodes executed by acl"; + for (auto iter = op_records_.begin(); iter != op_records_.end(); iter++) { + LOG(INFO) << iter->first << ":" << iter->second; + } + + LOG(INFO) << "All stateful nodes executed by acl"; + for (auto iter = stateful_shape_op_records_.begin(); iter != stateful_shape_op_records_.end(); iter++) { + LOG(INFO) << iter->first << ":" << iter->second; + } + + LOG(INFO) << "All unknown shape nodes executed by acl"; + for (auto iter = unknown_shape_op_records_.begin(); iter != unknown_shape_op_records_.end(); iter++) { + LOG(INFO) << iter->first << ":" << iter->second; + } + + LOG(INFO) << "All nodes' shape and type detail executed by acl"; + for (auto iter = op_shape_records_.begin(); iter != op_shape_records_.end(); iter++) { + std::stringstream ss; + ss << std::endl << iter->first << ":"; + for (auto status : iter->second) { ss << std::endl << status; } + LOG(INFO) << ss.str(); + } + } + ProfManager() = default; + std::mutex mu_; + std::map op_records_ GUARDED_BY(mu_); + std::map unknown_shape_op_records_ GUARDED_BY(mu_); + std::map stateful_shape_op_records_ GUARDED_BY(mu_); + std::map> op_shape_records_ GUARDED_BY(mu_); +}; +} // namespace npu \ No newline at end of file diff --git a/tf_adapter_2.x/npu_device/core/npu_logger.h b/tf_adapter_2.x/npu_device/core/npu_logger.h new file mode 100644 index 0000000000000000000000000000000000000000..2ea5c5e5d6ee16d7fc91e161d6d4e05e93aaca31 --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_logger.h @@ -0,0 +1,55 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#ifndef TENSORFLOW_NPU_LOGGER_H +#define TENSORFLOW_NPU_LOGGER_H + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/util/env_var.h" + +#include "npu_env.h" + +#define DLOG() \ + if (kDumpExecutionDetail) LOG(INFO) + +namespace npu { +// TODO:日志适配层,需要对接slog,当前未使用,复用的tensorflow +class Logger : public std::basic_ostringstream { + public: + Logger(const char *f, int line) { *this << f << ":" << line << " "; } + ~Logger() override { std::cerr << str() << std::endl; } +}; + +class Timer : public std::basic_ostringstream { + public: + template + explicit Timer(Args... args) { + *this << tensorflow::strings::StrCat(args...) 
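ProfManager above is a Meyers singleton that accumulates per-op execution counters under a mutex and dumps them when the process exits. The sketch below keeps just that skeleton; OpCounter is an illustrative name and it records a single counter per op rather than the four tables kept above.

```cpp
#include <cstdint>
#include <iostream>
#include <map>
#include <mutex>
#include <string>

class OpCounter {
 public:
  static void Record(const std::string &op) { Instance().RecordInner(op); }

 private:
  // Function-local static: constructed on first use, destroyed at exit.
  static OpCounter &Instance() {
    static OpCounter counter;
    return counter;
  }
  void RecordInner(const std::string &op) {
    std::lock_guard<std::mutex> lk(mu_);
    counts_[op]++;
  }
  ~OpCounter() {  // dump accumulated counts when the singleton is destroyed
    std::lock_guard<std::mutex> lk(mu_);
    for (const auto &entry : counts_) {
      std::cout << entry.first << ": " << entry.second << "\n";
    }
  }
  OpCounter() = default;

  std::mutex mu_;
  std::map<std::string, uint64_t> counts_;
};

int main() {
  OpCounter::Record("MatMul");
  OpCounter::Record("MatMul");
  OpCounter::Record("Add");
}
```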
<< " cost "; + }; + void Start() { + if (TF_PREDICT_FALSE(kPerfEnabled)) { start_ = tensorflow::Env::Default()->NowMicros(); } + started_ = true; + } + void Stop() { + if (started_ && TF_PREDICT_FALSE(kPerfEnabled)) { + *this << (tensorflow::Env::Default()->NowMicros() - start_) / 1000 << " ms"; + LOG(INFO) << str(); + } + started_ = false; + } + + private: + uint64_t start_{0}; + bool started_{false}; +}; +} // namespace npu + +#endif //TENSORFLOW_NPU_DEVICE_ACL_BACKENDS_H diff --git a/tf_adapter_2.x/npu_device/core/npu_managed_buffer.cpp b/tf_adapter_2.x/npu_device/core/npu_managed_buffer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9a005607846a9102b1368ee993074c108fdc6c31 --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_managed_buffer.cpp @@ -0,0 +1,314 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#include "npu_managed_buffer.h" +#include "npu_logger.h" +#include "npu_micros.h" +#include "npu_utils.h" + +#include "acl/acl_op_compiler.h" +#include "acl/acl_rt.h" + +#include "tensorflow/core/common_runtime/dma_helper.h" + +namespace { +class NpuMemory { + public: + static tensorflow::Status Malloc(size_t size, void **memory) { + if (size == 0) { + *memory = nullptr; + return tensorflow::Status::OK(); + } + NPU_REQUIRES_ACL_OK("Malloc npu memory failed for size " + std::to_string(size), + aclrtMalloc(memory, size, ACL_MEM_MALLOC_HUGE_FIRST)); + return tensorflow::Status::OK(); + } + static void Free(void *memory, size_t size, void *arg) { aclrtFree(memory); } +}; + +class RtsStreamGuard { + public: + explicit RtsStreamGuard(aclrtStream stream) : stream_(stream) {} + ~RtsStreamGuard() { + if (stream_ != nullptr) { + aclrtDestroyStream(stream_); + stream_ = nullptr; + } + } + + private: + aclrtStream stream_; +}; + +tensorflow::Status CreateAclTensorDesc(ge::DataType dtype, ge::Format format, const std::vector &shape, + std::shared_ptr *desc) { + aclDataType acl_dtype; + aclFormat acl_format; + NPU_REQUIRES_OK(MapGeType2Acl(dtype, &acl_dtype)); + NPU_REQUIRES_OK(MapGeFormat2Acl(format, &acl_format)); + aclTensorDesc *acl_desc = aclCreateTensorDesc(acl_dtype, shape.size(), shape.data(), acl_format); + NPU_REQUIRES(acl_desc != nullptr, tensorflow::errors::Internal("Failed create acl tensor desc")); + desc->reset(acl_desc, [](aclTensorDesc *desc) { aclDestroyTensorDesc(desc); }); + return tensorflow::Status::OK(); +} + +tensorflow::Status CreateAclDataBuffer(void *data, size_t size, std::shared_ptr *buf) { + aclDataBuffer *acl_buf = aclCreateDataBuffer(data, size); + NPU_REQUIRES(acl_buf != nullptr, tensorflow::errors::Internal("Failed create acl data buffer")); + buf->reset(acl_buf, [](aclDataBuffer *buf) { aclDestroyDataBuffer(buf); }); + return tensorflow::Status::OK(); +} + +tensorflow::Status CreateTransFormatAttr(ge::Format src, ge::Format dst, std::shared_ptr *attr) { + aclopAttr *acl_attr = aclopCreateAttr(); + NPU_REQUIRES(acl_attr != nullptr, tensorflow::errors::Internal("Failed create acl op attr")); + attr->reset(acl_attr, [](aclopAttr *attr) { aclopDestroyAttr(attr); }); + + NPU_REQUIRES_ACL_OK("Acl set op attr src_format failed", + aclopSetAttrString(acl_attr, "src_format", GetFormatName(src))); + + NPU_REQUIRES_ACL_OK("Acl set op attr dst_format failed", + aclopSetAttrString(acl_attr, "dst_format", GetFormatName(dst))); + return tensorflow::Status::OK(); +} + +tensorflow::Status CreateCastDtypeAttr(ge::DataType src, 
ge::DataType dst, std::shared_ptr *attr) { + aclopAttr *acl_attr = aclopCreateAttr(); + NPU_REQUIRES(acl_attr != nullptr, tensorflow::errors::Internal("")); + attr->reset(acl_attr, [](aclopAttr *attr) { aclopDestroyAttr(attr); }); + + NPU_REQUIRES_ACL_OK("Acl set op attr dst_type failed", + aclopSetAttrInt(acl_attr, "dst_type", static_cast(dst))); + return tensorflow::Status::OK(); +} + +tensorflow::Status ScheduleCastDtypeTask(aclrtStream stream, ge::Format format, const std::vector &shape, + ge::DataType src_dt, ge::DataType dst_dt, void *src_data, void *dst_data, + size_t src_len, size_t dst_len) { + // TODO: 在一些cube格式的极端场景下,data type转换后,shape也会跟着转,这里暂时没有考虑这种场景 + std::shared_ptr input_desc; + NPU_REQUIRES_OK(CreateAclTensorDesc(src_dt, format, shape, &input_desc)); + aclTensorDesc *input_descs[] = {input_desc.get()}; + + std::shared_ptr input_data; + NPU_REQUIRES_OK(CreateAclDataBuffer(src_data, src_len, &input_data)); + aclDataBuffer *input_dbs[] = {input_data.get()}; + + std::shared_ptr output_desc; + NPU_REQUIRES_OK(CreateAclTensorDesc(dst_dt, format, shape, &output_desc)); + aclTensorDesc *output_ds[] = {output_desc.get()}; + + std::shared_ptr output_data; + NPU_REQUIRES_OK(CreateAclDataBuffer(dst_data, dst_len, &output_data)); + aclDataBuffer *output_dbs[] = {output_data.get()}; + + std::shared_ptr attr; + NPU_REQUIRES_OK(CreateCastDtypeAttr(src_dt, dst_dt, &attr)); + NPU_REQUIRES_ACL_OK("Acl compile and execute \'Cast\' op failed", + aclopCompileAndExecute("Cast", 1, input_descs, input_dbs, 1, output_ds, output_dbs, attr.get(), + ACL_ENGINE_AICORE, ACL_COMPILE_SYS, nullptr, stream)); + return tensorflow::Status::OK(); +} + +tensorflow::Status ScheduleTransFormatTask(aclrtStream stream, ge::DataType src_dt, ge::Format src_format, + const std::vector &src_shape, ge::Format dst_format, + const std::vector &dst_shape, void *src_data, void *dst_data, + size_t src_len, size_t dst_len) { + std::shared_ptr input_desc; + NPU_REQUIRES_OK(CreateAclTensorDesc(src_dt, src_format, src_shape, &input_desc)); + aclTensorDesc *input_descs[] = {input_desc.get()}; + + std::shared_ptr input_data; + NPU_REQUIRES_OK(CreateAclDataBuffer(src_data, src_len, &input_data)); + aclDataBuffer *input_dbs[] = {input_data.get()}; + + std::shared_ptr output_desc; + NPU_REQUIRES_OK(CreateAclTensorDesc(src_dt, dst_format, dst_shape, &output_desc)); + aclTensorDesc *output_ds[] = {output_desc.get()}; + + std::shared_ptr output_data; + NPU_REQUIRES_OK(CreateAclDataBuffer(dst_data, dst_len, &output_data)); + aclDataBuffer *output_dbs[] = {output_data.get()}; + + std::shared_ptr attr; + NPU_REQUIRES_OK(CreateTransFormatAttr(src_format, dst_format, &attr)); + NPU_REQUIRES_ACL_OK("Acl compile and execute \'TransData\' op failed", + aclopCompileAndExecute("TransData", 1, input_descs, input_dbs, 1, output_ds, output_dbs, + attr.get(), ACL_ENGINE_AICORE, ACL_COMPILE_SYS, nullptr, stream)); + return tensorflow::Status::OK(); +} +} // namespace + +NpuManagedBuffer::~NpuManagedBuffer() { + if (deallocator_ && size_ > 0) { deallocator_(data_, size_, deallocator_arg_); } +} + +tensorflow::Status NpuManagedBuffer::Create(ge::Format fmt, const tensorflow::TensorShape &shape, + tensorflow::DataType dtype, NpuManagedBuffer **buf) { + std::vector dims; + for (auto dim_size : shape.dim_sizes()) { dims.push_back(dim_size); } + ge::DataType ge_type; + NPU_REQUIRES_OK(MapTfType2Ge(dtype, &ge_type)); + return Create(fmt, dims, ge_type, buf); +} + +tensorflow::Status NpuManagedBuffer::Create(ge::Format format, const std::vector &dims, 
ge::DataType data_type, + NpuManagedBuffer **buf) { + return Create(format, dims, data_type, format, dims, buf); +} + +tensorflow::Status NpuManagedBuffer::Create(ge::Format format, const std::vector &shape, + ge::DataType data_type, ge::Format origin_format, + const std::vector &origin_shape, NpuManagedBuffer **buf) { + size_t total_bytes; + int dtype_size = ge::GetSizeByDataType(data_type); + NPU_REQUIRES(dtype_size > 0, + tensorflow::errors::Internal("Data type size invalid ", dtype_size, " for ge type enum ", data_type)); + total_bytes = dtype_size; + for (auto dim_size : shape) { + if (dim_size == 0) { + total_bytes = 0; + break; + } + NPU_REQUIRES(dim_size >= 0, tensorflow::errors::InvalidArgument("Dim size invalid for shape ", VecToString(shape))); + NPU_REQUIRES(total_bytes <= total_bytes * dim_size, + tensorflow::errors::InvalidArgument("Total bytes overflow for shape ", VecToString(shape))); + total_bytes *= dim_size; + } + void *data = nullptr; + NPU_REQUIRES_OK(NpuMemory::Malloc(total_bytes, &data)); + auto status = + Create(format, shape, data_type, origin_format, origin_shape, data, total_bytes, nullptr, NpuMemory::Free, buf); + if (!status.ok()) { NpuMemory::Free(data, total_bytes, nullptr); } + return status; +} + +tensorflow::Status NpuManagedBuffer::Create(ge::Format format, const std::vector &shape, + ge::DataType data_type, ge::Format origin_format, + const std::vector &origin_shape, void *addr, size_t size, + void *arg, void (*deallocator)(void *, size_t, void *), + NpuManagedBuffer **buf) { + *buf = new (std::nothrow) NpuManagedBuffer(); + if (*buf == nullptr) { return tensorflow::errors::Internal("Failed malloc host npu buffer handle"); } + (*buf)->format_ = format; + (*buf)->shape_ = shape; + (*buf)->data_type_ = data_type; + (*buf)->origin_format_ = origin_format; + (*buf)->origin_data_type_ = data_type; + (*buf)->origin_shape_ = origin_shape; + + (*buf)->data_ = addr; + (*buf)->size_ = size; + (*buf)->deallocator_arg_ = arg; + (*buf)->deallocator_ = deallocator; + + return tensorflow::Status::OK(); +} + +void NpuManagedBuffer::Destroy(NpuManagedBuffer *buf) { delete buf; } + +tensorflow::Status NpuManagedBuffer::AssembleTo(const tensorflow::Tensor *tensor) { + NPU_REQUIRES(tensor != nullptr, + tensorflow::errors::InvalidArgument("Failed assemble npu buffer to cpu as dst cpu tensor is nullptr")); + DLOG() << "Npu buffer " << DebugString() << " assemble to " << tensor->DebugString(); + tensorflow::DataType dtype; + NPU_REQUIRES_OK(MapGeType2Tf(origin_data_type_, &dtype)); + NPU_REQUIRES(dtype == tensor->dtype(), + tensorflow::errors::InvalidArgument("Data type mismatch when assemble npu buffer to cpu, npu ", + tensorflow::DataTypeString(dtype), " vs. 
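NpuManagedBuffer::Create sizes the allocation by multiplying the dimension sizes into the element size, bailing out on zero or negative dims and on overflow. Below is a standalone sketch of a slightly stricter variant of that computation that checks the product before it can wrap; TotalBytes is a hypothetical helper, not the project's code.

```cpp
#include <cstdint>
#include <iostream>
#include <limits>
#include <vector>

// Compute the byte size of a dense tensor, rejecting negative dimensions and
// detecting multiplication overflow by dividing the max before multiplying.
bool TotalBytes(const std::vector<int64_t> &dims, size_t element_size, size_t &total) {
  total = element_size;
  for (int64_t dim : dims) {
    if (dim < 0) return false;
    if (dim == 0) { total = 0; return true; }  // empty tensor needs no memory
    if (total > std::numeric_limits<size_t>::max() / static_cast<size_t>(dim)) return false;
    total *= static_cast<size_t>(dim);
  }
  return true;
}

int main() {
  size_t total = 0;
  std::cout << TotalBytes({2, 3, 4}, 4, total) << " " << total << "\n";            // 1 96
  std::cout << TotalBytes({int64_t{1} << 40, int64_t{1} << 40}, 8, total) << "\n";  // 0: overflow
}
```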
cpu ", + tensorflow::DataTypeString(tensor->dtype()))); + if (size_ == 0) { return tensorflow::Status::OK(); } + if (SameRepresentation()) { + NPU_REQUIRES_OK(DToH(const_cast(tensor->tensor_data().data()), tensor->TotalBytes())); + } else { + NpuManagedBuffer *buf; + NPU_REQUIRES_OK(Create(origin_format_, origin_shape_, origin_data_type_, &buf)); + NpuManagedBuffer::Guarder guarder(buf); + NPU_REQUIRES_OK(TransRepresentationOnNpu(buf)); + buf->DToH(const_cast(tensor->tensor_data().data()), tensor->TotalBytes()); + } + return tensorflow::Status::OK(); +} + +tensorflow::Status NpuManagedBuffer::AssembleFrom(const tensorflow::Tensor *tensor) { + NPU_REQUIRES(tensor != nullptr, + tensorflow::errors::InvalidArgument("Failed assemble npu buffer from cpu as dst cpu tensor is nullptr")); + DLOG() << "Npu buffer " << DebugString() << " assemble from " << tensor->DebugString(); + tensorflow::DataType dtype; + NPU_REQUIRES_OK(MapGeType2Tf(origin_data_type_, &dtype)); + NPU_REQUIRES(dtype == tensor->dtype(), + tensorflow::errors::InvalidArgument("Data type mismatch when assemble npu buffer from cpu, npu ", + tensorflow::DataTypeString(dtype), " vs. cpu ", + tensorflow::DataTypeString(tensor->dtype()))); + if (size_ == 0) { return tensorflow::Status::OK(); } + if (SameRepresentation()) { + NPU_REQUIRES_OK(HToD(const_cast(tensor->tensor_data().data()), tensor->TotalBytes())); + } else { + NpuManagedBuffer *buf; + NPU_REQUIRES_OK(Create(origin_format_, origin_shape_, origin_data_type_, &buf)); + NpuManagedBuffer::Guarder guarder(buf); + NPU_REQUIRES_OK(buf->HToD(const_cast(tensor->tensor_data().data()), tensor->TotalBytes())); + NPU_REQUIRES_OK(buf->TransRepresentationOnNpu(this)); + } + return tensorflow::Status::OK(); +} + +tensorflow::Status NpuManagedBuffer::TransRepresentationOnNpu(NpuManagedBuffer *dst_buff) { + DLOG() << "Trans representation on npu, format " << GetFormatName(format_) << " to " + << GetFormatName(dst_buff->format_) << ", data type " << data_type_ << " to " << dst_buff->data_type_; + NPU_REQUIRES(format_ != dst_buff->format_ || data_type_ != dst_buff->data_type_, tensorflow::errors::Internal("")); + + aclrtStream rts = nullptr; + NPU_REQUIRES_ACL_OK("Acl create stream failed", aclrtCreateStream(&rts)); + RtsStreamGuard rts_guard(rts); + if (format_ == dst_buff->format_) { + NPU_REQUIRES_OK(ScheduleCastDtypeTask(rts, format_, shape_, data_type_, dst_buff->data_type_, data_, + dst_buff->data_, size_, dst_buff->size_)); + } else if (data_type_ == dst_buff->data_type_) { + NPU_REQUIRES_OK(ScheduleTransFormatTask(rts, data_type_, format_, shape_, dst_buff->format_, dst_buff->shape_, + data_, dst_buff->data_, size_, dst_buff->size_)); + } else { + NpuManagedBuffer *buf; + NPU_REQUIRES_OK(Create(format_, shape_, dst_buff->data_type_, &buf)); + NpuManagedBuffer::Guarder guarder(buf); + NPU_REQUIRES_OK(ScheduleCastDtypeTask(rts, format_, shape_, data_type_, dst_buff->data_type_, data_, buf->data_, + size_, buf->size_)); + NPU_REQUIRES_OK(ScheduleTransFormatTask(rts, buf->data_type_, buf->format_, buf->shape_, dst_buff->format_, + dst_buff->shape_, buf->data_, dst_buff->data_, buf->size_, + dst_buff->size_)); + } + NPU_REQUIRES_ACL_OK("Acl synchronize stream failed", aclrtSynchronizeStream(rts)); + return tensorflow::Status::OK(); +} + +tensorflow::Status NpuManagedBuffer::HToD(void *host_data, size_t size) { + NPU_REQUIRES( + size <= size_, + tensorflow::errors::Internal("Failed copy host buffer to npu as size mismatch npu ", size_, " vs. 
cpu ", size)); + NPU_REQUIRES_ACL_OK("Acl rt-memcpy host to device failed", + aclrtMemcpy(data_, size_, host_data, size, ACL_MEMCPY_HOST_TO_DEVICE)); + return tensorflow::Status::OK(); +} + +tensorflow::Status NpuManagedBuffer::DToH(void *host_data, size_t size) { + NPU_REQUIRES( + size >= size_, + tensorflow::errors::Internal("Failed copy npu buffer to host as size mismatch npu ", size_, " vs. cpu ", size)); + NPU_REQUIRES_ACL_OK("Acl rt-memcpy device to host failed", + aclrtMemcpy(host_data, size, data_, size_, ACL_MEMCPY_DEVICE_TO_HOST)); + return tensorflow::Status::OK(); +} + +std::string NpuManagedBuffer::DebugString() const { + std::stringstream ss; + tensorflow::DataType origin_type; + tensorflow::DataType storage_type; + (void) MapGeType2Tf(origin_data_type_, &origin_type); + (void) MapGeType2Tf(data_type_, &storage_type); + ss << "origin " << GetFormatName(origin_format_) << " " << tensorflow::DataTypeString(origin_type) + << VecToString(origin_shape_) << ", storage " << GetFormatName(origin_format_) << " " + << tensorflow::DataTypeString(storage_type) << VecToString(shape_); + return ss.str(); +} \ No newline at end of file diff --git a/tf_adapter_2.x/npu_device/core/npu_managed_buffer.h b/tf_adapter_2.x/npu_device/core/npu_managed_buffer.h new file mode 100644 index 0000000000000000000000000000000000000000..7b140b9a83d555e605313f8dbe8ee0ee3a5a68fd --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_managed_buffer.h @@ -0,0 +1,82 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#ifndef TENSORFLOW_NPU_MANAGED_BUFFER_H +#define TENSORFLOW_NPU_MANAGED_BUFFER_H + +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/status.h" + +#include "graph/types.h" + +class NpuManagedBuffer { + public: + static void Destroy(NpuManagedBuffer *buf); + + static tensorflow::Status Create(ge::Format fmt, const tensorflow::TensorShape &shape, tensorflow::DataType dtype, + NpuManagedBuffer **buf); + static tensorflow::Status Create(ge::Format format, const std::vector &shape, ge::DataType data_type, + NpuManagedBuffer **buf); + static tensorflow::Status Create(ge::Format format, const std::vector &shape, ge::DataType data_type, + ge::Format origin_format, const std::vector &origin_shape, + NpuManagedBuffer **buf); + static tensorflow::Status Create(ge::Format format, const std::vector &shape, ge::DataType data_type, + ge::Format origin_format, const std::vector &origin_shape, void *addr, + size_t size, void *arg, void (*deallocator)(void *, size_t, void *), + NpuManagedBuffer **buf); + + // 将输入的CPU Tensor的数据填充到当前buffer管理的NPU内存上,CPU Tensor的格式和type与buffer的成员origin_data_type_和origin_format_一致 + tensorflow::Status AssembleFrom(const tensorflow::Tensor *tensor); + + // 将当前buffer管理的NPU内存上的数据填充到输入的CPU Tensor的数据地址上,CPU Tensor的格式和type与buffer的成员origin_data_type_和origin_format_一致 + tensorflow::Status AssembleTo(const tensorflow::Tensor *tensor); + + bool SameRepresentation() { return origin_format_ == format_ && origin_data_type_ == data_type_; } + + std::string DebugString() const; + + class Guarder { + public: + explicit Guarder(NpuManagedBuffer *buf) : buf_(buf) {} + ~Guarder() { 
NpuManagedBuffer::Destroy(buf_); } + + private: + NpuManagedBuffer *buf_; + }; + + private: + NpuManagedBuffer() = default; + ~NpuManagedBuffer(); + tensorflow::Status TransRepresentationOnNpu(NpuManagedBuffer *dst_buff); // 在NPU上完成从存储到原始的格式和类型转换 + tensorflow::Status HToD(void *host_data, size_t size); // 将输入的Host内存搬运到管理的NPU内存上 + tensorflow::Status DToH(void *host_data, size_t max_len); // 将管理的NPU内存上的数据搬运到输入的Host内存上 + + ge::DataType origin_data_type_{}; // 原始数据类型,即对应的CPU Tensor的数据类型 + ge::Format origin_format_{}; // 原始内存排布,即对应的CPU Tensor的维度信息,一般都是ND,可能是NCHW或者NHWC + std::vector origin_shape_; // 原始维度信息,即对应的CPU Tensor的原始维度 + ge::DataType data_type_{}; // 在NPU上的存储数据类型 + ge::Format format_{}; // 在NPU上的存储格式 + std::vector shape_; // 对应NPU上的存储格式的维度值 + + size_t size_{}; // NPU上占用的内存大小 + void *data_{}; // NPU地址指针 + void (*deallocator_)(void *, size_t, void *){}; // NP内存的释放函数,内存可能会来自于内存池或者rtMalloc + void *deallocator_arg_{}; // 地址释放时传给释放函数的参数 +}; + +// NpuManagedBuffer是Host的对象,是CPU Tensor管理的对象,是NPU内存的Host句柄,应当在析构函数中释放NPU内存 +static void NpuManagedBufferDeallocator(void *data, size_t len, void *arg) { + NpuManagedBuffer::Destroy(reinterpret_cast(data)); +} + +#endif //TENSORFLOW_NPU_TENSOR_H \ No newline at end of file diff --git a/tf_adapter_2.x/npu_device/core/npu_micros.h b/tf_adapter_2.x/npu_device/core/npu_micros.h new file mode 100644 index 0000000000000000000000000000000000000000..71f9cb42ee40807404d937d579b6c111aad4dfc1 --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_micros.h @@ -0,0 +1,108 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#ifndef TENSORFLOW_NPU_MICROS_H +#define TENSORFLOW_NPU_MICROS_H + +#define NPU_CTX_REQUIRES_OK(CTX, ...) \ + do { \ + CTX->status = (__VA_ARGS__); \ + if (TF_PREDICT_FALSE(!CTX->status.ok())) { \ + LOG(ERROR) << CTX->status.ToString(); \ + return; \ + } \ + } while (0) + +#define NPU_CTX_REQUIRES(CTX, EXP, STATUS) \ + do { \ + if (!TF_PREDICT_TRUE(EXP)) { \ + CTX->status = STATUS; \ + LOG(ERROR) << CTX->status.ToString(); \ + return; \ + } \ + } while (0) + +#define NPU_CTX_REQUIRES_OK_RETURN(CTX, EXP, RET) \ + do { \ + CTX->status = (EXP); \ + if (TF_PREDICT_FALSE(!CTX->status.ok())) { \ + LOG(ERROR) << CTX->status.ToString(); \ + return RET; \ + } \ + } while (0) + +#define NPU_CTX_REQUIRES_RETURN(CTX, EXP, STATUS, RET) \ + do { \ + if (TF_PREDICT_FALSE(!(EXP))) { \ + CTX->status = (STATUS); \ + LOG(ERROR) << CTX->status.ToString(); \ + return RET; \ + } \ + } while (0) + +#define NPU_REQUIRES_OK(...) \ + do { \ + tensorflow::Status _status = (__VA_ARGS__); \ + if (TF_PREDICT_FALSE(!_status.ok())) { \ + LOG(ERROR) << _status.ToString(); \ + return _status; \ + } \ + } while (0) + +#define NPU_REQUIRES(EXP, STATUS) \ + do { \ + if (!TF_PREDICT_TRUE((EXP))) { \ + tensorflow::Status _status = (STATUS); \ + LOG(ERROR) << _status.ToString(); \ + return _status; \ + } \ + } while (0) + +#define NPU_CTX_REQUIRES_GE_OK(CTX, PREFIX, ...) 
\ + do { \ + ge::Status _status = (__VA_ARGS__); \ + if (TF_PREDICT_FALSE(_status != ge::SUCCESS)) { \ + std::string err_msg = ge::StatusFactory::Instance()->GetErrDesc(_status); \ + if (err_msg.empty()) { err_msg = " code:" + std::to_string(_status); } \ + CTX->status = tensorflow::errors::Internal(PREFIX, ":", err_msg); \ + LOG(ERROR) << CTX->status.ToString(); \ + return; \ + } \ + } while (0) + +#define NPU_CTX_REQUIRES_GE_OK_RETURN(CTX, PREFIX, EXP, RET) \ + do { \ + ge::Status _status = (EXP); \ + if (TF_PREDICT_FALSE(_status != ge::SUCCESS)) { \ + std::string err_msg = ge::StatusFactory::Instance()->GetErrDesc(_status); \ + if (err_msg.empty()) { err_msg = " code:" + std::to_string(_status); } \ + CTX->status = tensorflow::errors::Internal(PREFIX, ":", err_msg); \ + LOG(ERROR) << CTX->status.ToString(); \ + return RET; \ + } \ + } while (0) + +#define NPU_REQUIRES_ACL_OK(PREFIX, ...) \ + do { \ + auto _status = (__VA_ARGS__); \ + if (TF_PREDICT_FALSE(_status != ACL_ERROR_NONE)) { \ + return tensorflow::errors::Internal(PREFIX, ": code:", _status); \ + } \ + } while (0) + +#define NPU_LOG_IF_ERROR(...) \ + do { \ + const ::tensorflow::Status _status = (__VA_ARGS__); \ + if (TF_PREDICT_FALSE(!_status.ok())) LOG(ERROR) << _status.ToString(); \ + } while (0) + +#define HANDLE_ALL_FORMAT() \ + HANDLE_FORMAT(Nd) \ + HANDLE_FORMAT(Nchw) \ + HANDLE_FORMAT(Nc1hwc0) \ + HANDLE_FORMAT(Fz) \ + HANDLE_FORMAT(Hz) + +#endif //TENSORFLOW_NPU_MICROS_H diff --git a/tf_adapter_2.x/npu_device/core/npu_parser.h b/tf_adapter_2.x/npu_device/core/npu_parser.h new file mode 100644 index 0000000000000000000000000000000000000000..ae036e1667e751ab77ae4f99d545246d5a65c729 --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_parser.h @@ -0,0 +1,123 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. 
+* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#ifndef TENSORFLOW_NPU_PARSER_H +#define TENSORFLOW_NPU_PARSER_H + +#include + +#include "npu_types.h" +#include "npu_unwrap.h" +#include "npu_utils.h" + +#include "graph/types.h" + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/tf_status.h" + +namespace { +const std::string kInputDesc = "input_tensor_desc"; +const std::string kOutputDesc = "output_tensor_desc"; +const std::string kFormat = "serialize_format"; +const std::string kType = "serialize_datatype"; +const std::string kShape = "serialize_shape"; +const std::string kSubGraph = "SubGraph"; +} // namespace + +template +static tensorflow::AttrValue BuildDescAttr(T shapes, TensorDataTypes types) { + tensorflow::AttrValue desc_attr; + for (size_t i = 0; i < types.size(); i++) { + auto desc = desc_attr.mutable_list()->add_func(); + desc->set_name(std::to_string(i)); + + tensorflow::AttrValue shape_value; + for (int j = 0; j < shapes[i].dims(); j++) { shape_value.mutable_list()->add_i(shapes[i].dim_size(j)); } + desc->mutable_attr()->insert({kShape, shape_value}); + + tensorflow::AttrValue type_value; + type_value.set_i(static_cast(types[i])); + desc->mutable_attr()->insert({kType, type_value}); + + tensorflow::AttrValue format_value; + format_value.set_i(static_cast(ge::Format::FORMAT_NHWC)); + desc->mutable_attr()->insert({kFormat, format_value}); + } + return desc_attr; +} + +static void AssembleDesc(TensorPartialShapes shapes, TensorDataTypes types, const std::string &name, + tensorflow::NodeDef *ndef) { + tensorflow::AddNodeAttr(name, BuildDescAttr(std::move(shapes), std::move(types)), ndef); +} + +static void AssembleDesc(TensorShapes shapes, TensorDataTypes types, const std::string &name, + tensorflow::NodeDef *ndef) { + tensorflow::AddNodeAttr(name, BuildDescAttr(std::move(shapes), std::move(types)), ndef); +} + +static void AssembleInputDesc(TensorPartialShapes shapes, TensorDataTypes types, tensorflow::NodeDef *ndef) { + AssembleDesc(std::move(shapes), std::move(types), kInputDesc, ndef); +} + +static void AssembleOutputDesc(TensorPartialShapes shapes, TensorDataTypes types, tensorflow::NodeDef *ndef) { + AssembleDesc(std::move(shapes), std::move(types), kOutputDesc, ndef); +} + +static void AssembleInputDesc(TensorShapes shapes, TensorDataTypes types, tensorflow::NodeDef *ndef) { + AssembleDesc(std::move(shapes), std::move(types), kInputDesc, ndef); +} + +static void AssembleOutputDesc(TensorShapes shapes, TensorDataTypes types, tensorflow::NodeDef *ndef) { + AssembleDesc(std::move(shapes), std::move(types), kOutputDesc, ndef); +} + +static void AssembleInputDesc(TensorShapes shapes, TensorDataTypes types, tensorflow::Node *n) { + n->AddAttr(kInputDesc, BuildDescAttr(std::move(shapes), std::move(types))); +} + +static void AssembleOutputDesc(TensorShapes shapes, TensorDataTypes types, tensorflow::Node *n) { + n->AddAttr(kOutputDesc, BuildDescAttr(std::move(shapes), std::move(types))); +} + +static void AssembleInputDesc(TensorPartialShapes shapes, TensorDataTypes types, tensorflow::Node *n) { + n->AddAttr(kInputDesc, BuildDescAttr(std::move(shapes), std::move(types))); +} + +static void AssembleOutputDesc(TensorPartialShapes shapes, TensorDataTypes types, tensorflow::Node *n) { + n->AddAttr(kOutputDesc, BuildDescAttr(std::move(shapes), std::move(types))); +} + +static void AssembleOpDef(const tensorflow::OpRegistrationData *op_data, 
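+                          // Serializes the registered OpDef into the node's "op_def" attr so the
+                          // generated graph carries its own op definitions for the downstream parser.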
tensorflow::Node *n) { + std::string serialized_op_def; + op_data->op_def.SerializeToString(&serialized_op_def); + n->AddAttr("op_def", serialized_op_def); +} + +static void AssembleOpDef(tensorflow::Node *n) { + const tensorflow::OpRegistrationData *op_reg_data; + tensorflow::OpRegistry::Global()->LookUp(n->type_string(), &op_reg_data); + std::string serialized_op_def; + op_reg_data->op_def.SerializeToString(&serialized_op_def); + n->AddAttr("op_def", serialized_op_def); +} + +static void AssembleOpDef(const tensorflow::OpRegistrationData *op_data, tensorflow::NodeDef *ndef) { + std::string serialized_op_def; + op_data->op_def.SerializeToString(&serialized_op_def); + tensorflow::AddNodeAttr("op_def", serialized_op_def, ndef); +} + +static void AssembleOpDef(tensorflow::NodeDef *ndef) { + const tensorflow::OpRegistrationData *op_reg_data; + tensorflow::OpRegistry::Global()->LookUp(ndef->op(), &op_reg_data); + std::string serialized_op_def; + op_reg_data->op_def.SerializeToString(&serialized_op_def); + tensorflow::AddNodeAttr("op_def", serialized_op_def, ndef); +} + +#endif //TENSORFLOW_NPU_PARSER_H diff --git a/tf_adapter_2.x/npu_device/core/npu_types.h b/tf_adapter_2.x/npu_device/core/npu_types.h new file mode 100644 index 0000000000000000000000000000000000000000..1f13cb212b7c5fb243d1f5bbca2fd618a4b8500a --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_types.h @@ -0,0 +1,24 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#ifndef TENSORFLOW_NPU_TYPES_H +#define TENSORFLOW_NPU_TYPES_H + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/c_api_internal.h" + +using TensorPartialShapes = tensorflow::gtl::InlinedVector; +using TensorShapes = tensorflow::gtl::InlinedVector; +using TensorDataTypes = tensorflow::gtl::InlinedVector; + +using VecTensorPartialShapes = tensorflow::gtl::InlinedVector; +using VecTensorShapes = tensorflow::gtl::InlinedVector; +using VecTensorDataTypes = tensorflow::gtl::InlinedVector; + +const static tensorflow::TensorShape kScalarShape; + +#endif //TENSORFLOW_NPU_TYPES_H diff --git a/tf_adapter_2.x/npu_device/core/npu_unwrap.h b/tf_adapter_2.x/npu_device/core/npu_unwrap.h new file mode 100644 index 0000000000000000000000000000000000000000..41e7fed80f6916e15a809fb5def1c64e352d2151 --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_unwrap.h @@ -0,0 +1,87 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. 
+* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#ifndef TENSORFLOW_NPU_UNWRAP_H +#define TENSORFLOW_NPU_UNWRAP_H + +#include "absl/algorithm/container.h" +#include "absl/memory/memory.h" +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/c_api_internal.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/eager/tfe_context_internal.h" +#include "tensorflow/c/eager/tfe_op_internal.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" +#include "tensorflow/c/tf_tensor_internal.h" +#include "tensorflow/core/common_runtime/copy_tensor.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/device_set.h" +#include "tensorflow/core/common_runtime/eager/attr_builder.h" +#include "tensorflow/core/common_runtime/eager/context.h" +#include "tensorflow/core/common_runtime/eager/execute.h" +#include "tensorflow/core/common_runtime/eager/shape_inference.h" +#include "tensorflow/core/common_runtime/eager/tensor_handle.h" +#include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/common_runtime/rendezvous_mgr.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/rendezvous.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/lib/gtl/map_util.h" +#include "tensorflow/core/platform/blocking_counter.h" +#include "tensorflow/core/platform/casts.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/notification.h" +#include "tensorflow/core/platform/random.h" +#include "tensorflow/core/platform/refcount.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/profiler/lib/traceme.h" +#include "tensorflow/core/protobuf/device_filters.pb.h" +#include "tensorflow/core/protobuf/error_codes.pb.h" +#include "tensorflow/core/public/version.h" +#include "tensorflow/core/util/device_name_utils.h" +#include "tensorflow/core/util/env_var.h" + +#include "npu_managed_buffer.h" + +namespace npu { +template +static NpuManagedBuffer *Unwrap(const tensorflow::Tensor *tensor) { + return reinterpret_cast(const_cast(tensor->tensor_data().data())); +} + +static tensorflow::EagerContext *UnwrapCtx(TFE_Context *context) { + return tensorflow::ContextFromInterface(tensorflow::unwrap(context)); +} + +static tensorflow::TensorHandle *UnwrapHandle(TFE_TensorHandle *tensor_handle) { + return tensorflow::TensorHandleFromInterface(tensorflow::unwrap(tensor_handle)); +} + +static tensorflow::EagerOperation *UnwrapOp(TFE_Op *op) { + return reinterpret_cast(tensorflow::unwrap(op)); +} + +static tensorflow::Status UnwrapTensor(TFE_TensorHandle *tensor_handle, const tensorflow::Tensor **tensor) { + 
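+  // Resolve the eager C-API handle to the tensorflow::Tensor it wraps; returns a
+  // non-OK status when the handle has no locally materialized tensor.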
return UnwrapHandle(tensor_handle)->Tensor(tensor); +} + +} // namespace npu + +#endif //TENSORFLOW_NPU_UNWRAP_H diff --git a/tf_adapter_2.x/npu_device/core/npu_utils.cpp b/tf_adapter_2.x/npu_device/core/npu_utils.cpp new file mode 100644 index 0000000000000000000000000000000000000000..92ecc3150fd3eb0f9fd4783ca53825d11a260ec4 --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_utils.cpp @@ -0,0 +1,6 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#include "npu_utils.h" diff --git a/tf_adapter_2.x/npu_device/core/npu_utils.h b/tf_adapter_2.x/npu_device/core/npu_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..0a44d77577566c8c8f0dcfd157b1c83bb24dfbc1 --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_utils.h @@ -0,0 +1,222 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#ifndef TENSORFLOW_NPU_UTILS_H +#define TENSORFLOW_NPU_UTILS_H + +#include "tensorflow/c/eager/c_api.h" + +#include +#include +#include +#include +#include + +#include "tensorflow/c/eager/abstract_tensor_handle.h" + +// clang-format off +#include "tensorflow/core/platform/platform.h" +// clang-format on + +#include "absl/algorithm/container.h" +#include "absl/memory/memory.h" +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/eager/tfe_context_internal.h" +#include "tensorflow/c/eager/tfe_op_internal.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" +#include "tensorflow/c/tf_tensor_internal.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/eager/context.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/device_filters.pb.h" +#include "tensorflow/core/protobuf/error_codes.pb.h" +#include "tensorflow/core/util/device_name_utils.h" +#include "tensorflow/core/common_runtime/copy_tensor.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/device_set.h" +#include "tensorflow/core/common_runtime/eager/attr_builder.h" +#include "tensorflow/core/common_runtime/eager/execute.h" +#include "tensorflow/core/common_runtime/eager/shape_inference.h" +#include "tensorflow/core/common_runtime/eager/tensor_handle.h" +#include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/common_runtime/rendezvous_mgr.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/rendezvous.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/lib/gtl/map_util.h" +#include "tensorflow/core/platform/blocking_counter.h" +#include "tensorflow/core/platform/casts.h" +#include "tensorflow/core/platform/env.h" +#include 
"tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/notification.h" +#include "tensorflow/core/platform/random.h" +#include "tensorflow/core/platform/refcount.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/profiler/lib/traceme.h" +#include "tensorflow/core/public/version.h" +#include "tensorflow/core/util/device_name_utils.h" +#include "tensorflow/core/util/env_var.h" +#include "tensorflow/core/graph/algorithm.h" +#include "tensorflow/core/framework/graph_to_functiondef.h" + +#include "npu_env.h" +#include "npu_micros.h" +#include "npu_unwrap.h" + +#include "acl/acl_base.h" +#include "graph/types.h" + +static bool IsNpuTensorHandle(tensorflow::TensorHandle *handle) { + tensorflow::Status status; + tensorflow::DeviceNameUtils::ParsedName parsed_name; + return tensorflow::DeviceNameUtils::ParseFullName(handle->DeviceName(&status), &parsed_name) + && parsed_name.type == "NPU"; +} + +static bool IsCpuTensorHandle(tensorflow::TensorHandle *handle) { + tensorflow::Status status; + tensorflow::DeviceNameUtils::ParsedName parsed_name; + return tensorflow::DeviceNameUtils::ParseFullName(handle->DeviceName(&status), &parsed_name) + && parsed_name.type == "CPU"; +} + +static tensorflow::Status MapGeType2Tf(ge::DataType ge_type, tensorflow::DataType *tf_type) { + static std::map kGeType2Tf = { + {ge::DT_FLOAT, tensorflow::DT_FLOAT}, {ge::DT_DOUBLE, tensorflow::DT_DOUBLE}, + {ge::DT_INT32, tensorflow::DT_INT32}, {ge::DT_UINT8, tensorflow::DT_UINT8}, + {ge::DT_INT16, tensorflow::DT_INT16}, {ge::DT_INT8, tensorflow::DT_INT8}, + {ge::DT_STRING, tensorflow::DT_STRING}, {ge::DT_COMPLEX64, tensorflow::DT_COMPLEX64}, + {ge::DT_INT64, tensorflow::DT_INT64}, {ge::DT_BOOL, tensorflow::DT_BOOL}, + {ge::DT_QINT8, tensorflow::DT_QINT8}, {ge::DT_QUINT8, tensorflow::DT_QUINT8}, + {ge::DT_QINT32, tensorflow::DT_QINT32}, {ge::DT_QINT16, tensorflow::DT_QINT16}, + {ge::DT_QUINT16, tensorflow::DT_QUINT16}, {ge::DT_UINT16, tensorflow::DT_UINT16}, + {ge::DT_COMPLEX128, tensorflow::DT_COMPLEX128}, {ge::DT_RESOURCE, tensorflow::DT_RESOURCE}, + {ge::DT_VARIANT, tensorflow::DT_VARIANT}, {ge::DT_UINT32, tensorflow::DT_UINT32}, + {ge::DT_UINT64, tensorflow::DT_UINT64}, {ge::DT_STRING_REF, tensorflow::DT_STRING_REF}, + {ge::DT_FLOAT16, tensorflow::DT_HALF}, + }; + if (kGeType2Tf.find(ge_type) == kGeType2Tf.end()) { + return tensorflow::errors::InvalidArgument("Unsupport ge data type enmu value ", ge_type, " by tf"); + } + *tf_type = kGeType2Tf[ge_type]; + return tensorflow::Status::OK(); +} + +static tensorflow::Status MapTfType2Ge(tensorflow::DataType tf_type, ge::DataType *ge_type) { + static std::map kTfType2Ge = { + {tensorflow::DT_FLOAT, ge::DT_FLOAT}, {tensorflow::DT_DOUBLE, ge::DT_DOUBLE}, + {tensorflow::DT_INT32, ge::DT_INT32}, {tensorflow::DT_UINT8, ge::DT_UINT8}, + {tensorflow::DT_INT16, ge::DT_INT16}, {tensorflow::DT_INT8, ge::DT_INT8}, + {tensorflow::DT_STRING, ge::DT_STRING}, {tensorflow::DT_COMPLEX64, ge::DT_COMPLEX64}, + {tensorflow::DT_INT64, ge::DT_INT64}, {tensorflow::DT_BOOL, ge::DT_BOOL}, + {tensorflow::DT_QINT8, ge::DT_QINT8}, {tensorflow::DT_QUINT8, ge::DT_QUINT8}, + {tensorflow::DT_QINT32, ge::DT_QINT32}, {tensorflow::DT_QINT16, ge::DT_QINT16}, + {tensorflow::DT_QUINT16, ge::DT_QUINT16}, {tensorflow::DT_UINT16, ge::DT_UINT16}, + {tensorflow::DT_COMPLEX128, ge::DT_COMPLEX128}, {tensorflow::DT_RESOURCE, ge::DT_RESOURCE}, + {tensorflow::DT_VARIANT, ge::DT_VARIANT}, {tensorflow::DT_UINT32, 
ge::DT_UINT32}, + {tensorflow::DT_UINT64, ge::DT_UINT64}, {tensorflow::DT_STRING_REF, ge::DT_STRING_REF}, + {tensorflow::DT_HALF, ge::DT_FLOAT16}, + }; + if (kTfType2Ge.find(tf_type) == kTfType2Ge.end()) { + return tensorflow::errors::InvalidArgument("Unsupport tf data type enmu value ", ge_type, " by ge"); + } + *ge_type = kTfType2Ge[tf_type]; + return tensorflow::Status::OK(); +} + +static tensorflow::Status MapGeType2Acl(ge::DataType ge_type, aclDataType *acl_type) { + static std::map kGeType2Acl = { + {ge::DT_FLOAT, ACL_FLOAT}, {ge::DT_DOUBLE, ACL_DOUBLE}, {ge::DT_INT32, ACL_INT32}, + {ge::DT_UINT8, ACL_UINT8}, {ge::DT_INT16, ACL_INT16}, {ge::DT_INT8, ACL_INT8}, + {ge::DT_STRING, ACL_STRING}, {ge::DT_INT64, ACL_INT64}, {ge::DT_BOOL, ACL_BOOL}, + {ge::DT_UINT16, ACL_UINT16}, {ge::DT_UINT32, ACL_UINT32}, {ge::DT_UINT64, ACL_UINT64}, + {ge::DT_FLOAT16, ACL_FLOAT16}, + }; + if (kGeType2Acl.find(ge_type) == kGeType2Acl.end()) { + return tensorflow::errors::InvalidArgument("Unsupport ge data type enmu value ", ge_type, " by acl"); + } + *acl_type = kGeType2Acl[ge_type]; + return tensorflow::Status::OK(); +} + +static tensorflow::Status MapGeFormat2Acl(ge::Format ge_format, aclFormat *acl_format) { + static std::map kGeFormat2Acl = {{ge::Format::FORMAT_NCHW, ACL_FORMAT_NCHW}, + {ge::Format::FORMAT_NHWC, ACL_FORMAT_NHWC}, + {ge::Format::FORMAT_ND, ACL_FORMAT_ND}, + {ge::Format::FORMAT_NC1HWC0, ACL_FORMAT_NC1HWC0}, + {ge::Format::FORMAT_FRACTAL_Z, ACL_FORMAT_FRACTAL_Z}, + {ge::Format::FORMAT_NC1HWC0_C04, ACL_FORMAT_NC1HWC0_C04}, + {ge::Format::FORMAT_NDHWC, ACL_FORMAT_NDHWC}, + {ge::Format::FORMAT_FRACTAL_NZ, ACL_FORMAT_FRACTAL_NZ}, + {ge::Format::FORMAT_NCDHW, ACL_FORMAT_NCDHW}, + {ge::Format::FORMAT_NDC1HWC0, ACL_FORMAT_NDC1HWC0}, + {ge::Format::FORMAT_FRACTAL_Z_3D, ACL_FRACTAL_Z_3D}}; + if (kGeFormat2Acl.find(ge_format) == kGeFormat2Acl.end()) { + return tensorflow::errors::InvalidArgument("Unsupport ge format enmu value ", ge_format, " by acl"); + } + *acl_format = kGeFormat2Acl[ge_format]; + return tensorflow::Status::OK(); +} + +// specify the template in utils.cpp if need +template +std::string ToString(T v) { + return std::to_string(v); +} + +template +std::string VecToString(std::vector vec) { + if (vec.empty()) { return "[]"; } + std::string s = "["; + for (size_t i = 0; i < vec.size(); ++i) { + s += ToString(vec[i]); + if (i != vec.size() - 1) { s += ","; } + } + return s + "]"; +} + +// TODO:在GE处理中,变量名称作为唯一标识,对于shared_name是"_"开头的变量,由于tensorflow禁止变量名以"_"开头,所以无法直接将shared_name +// 作为Node的name,对于GE,则没有这个限制,因而,这个函数需要能够屏蔽这种差异。 +static std::string WrapResourceName(const std::string &name) { + if (kCustomKernelEnabled) { return name; } + return "cpu_" + name; +} + +static tensorflow::Status LoadGraphDefProto(const std::string &file, tensorflow::GraphDef *def) { + tensorflow::Status status = tensorflow::Env::Default()->FileExists(file); + if (!status.ok()) { return status; } + if (tensorflow::Env::Default()->IsDirectory(file).ok()) { + return tensorflow::errors::InvalidArgument(file, " is directory"); + } + if (tensorflow::str_util::EndsWith(file, ".pb")) { + ReadBinaryProto(tensorflow::Env::Default(), file, def); + } else if (tensorflow::str_util::EndsWith(file, ".pbtxt")) { + ReadTextProto(tensorflow::Env::Default(), file, def); + } else { + return tensorflow::errors::InvalidArgument(file, " must ends with .pb or .pbtxt"); + } + return tensorflow::Status::OK(); +} + +struct ResourceCompare { + bool operator()(const tensorflow::ResourceHandle &left, const tensorflow::ResourceHandle &right) 
const { + return left.name() < right.name() || left.container() < right.container() || left.device() < right.device(); + } +}; + +#endif //TENSORFLOW_NPU_UTILS_H diff --git a/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5106b4bcc31f4b2d40717812678c81b91c794b5d --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp @@ -0,0 +1,111 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#include + +#include "Python.h" +#include "pybind11/chrono.h" +#include "pybind11/complex.h" +#include "pybind11/functional.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_experimental.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/c_api_internal.h" +#include "tensorflow/c/eager/dlpack.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/c/tf_status_helper.h" +#include "tensorflow/compiler/jit/flags.h" +#include "tensorflow/python/eager/pywrap_tensor_conversion.h" +#include "tensorflow/python/eager/pywrap_tfe.h" +#include "tensorflow/python/lib/core/py_exception_registry.h" +#include "tensorflow/python/lib/core/pybind11_lib.h" +#include "tensorflow/python/lib/core/pybind11_status.h" +#include "tensorflow/python/lib/core/safe_ptr.h" +#include "tensorflow/python/util/util.h" + +#include "framework/omg/omg_inner_types.h" +#include "framework/omg/parser/parser_api.h" +#include "ge/ge_api.h" + +#include "npu_device_register.h" + +namespace py = pybind11; + +namespace { +TFE_Context *InputTFE_Context(const py::handle &ctx) { + return static_cast(PyCapsule_GetPointer(ctx.ptr(), nullptr)); +} +std::atomic_bool graph_engine_started{false}; +const std::string kTrain = "1"; +const std::string kOpen = "1"; +} // namespace + +PYBIND11_MODULE(_npu_device_backends, m) { + m.def("Open", + [](const py::handle &context, const char *device_name, int device_index, + std::map global_options, + std::map session_options) -> std::string { + pybind11::gil_scoped_release release; + if (!graph_engine_started.exchange(true)) { + // 只允许在train模式下工作 + global_options[ge::OPTION_GRAPH_RUN_MODE] = kTrain; + global_options[ge::OPTION_EXEC_DEVICE_ID] = std::to_string(device_index); + if (global_options.find(ge::PRECISION_MODE) == global_options.end()) { + global_options[ge::PRECISION_MODE] = "allow_mix_precision"; + } + LOG(INFO) << "Start graph engine with options:"; + for (const auto &option : global_options) { LOG(INFO) << " " << option.first << ":" << option.second; } + auto ge_status = ge::GEInitialize(global_options); + if (ge_status != ge::SUCCESS) { + return "Failed start graph engine:" + ge::StatusFactory::Instance()->GetErrDesc(ge_status); + } + LOG(INFO) << "Start graph engine succeed"; + ge_status = ge::ParserInitialize(global_options); + if (ge_status != ge::SUCCESS) { + return "Failed start tensorflow model parser:" + ge::StatusFactory::Instance()->GetErrDesc(ge_status); + } + LOG(INFO) << "Start tensorflow model parser succeed"; + } + + std::string full_name = tensorflow::strings::StrCat(device_name, ":", device_index); + tensorflow::DeviceNameUtils::ParsedName parsed_name; + if (!tensorflow::DeviceNameUtils::ParseFullName(full_name, &parsed_name)) { + return "Invalid npu device name " + full_name; + } + + LOG(INFO) << 
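+          // GE and the TF model parser are initialized at most once per process (guarded by
+          // graph_engine_started); each Open call below only registers one more NPU device
+          // instance with the current eager context.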
"Create device instance " << full_name << " with options:"; + for (const auto &option : session_options) { LOG(INFO) << " " << option.first << ":" << option.second; } + auto status = CreateDevice(InputTFE_Context(context), full_name.c_str(), device_index, session_options); + pybind11::gil_scoped_acquire acquire; + return status; + }); + + m.def("Close", []() { + pybind11::gil_scoped_release release; + ReleaseDeviceResource(); + if (graph_engine_started.exchange(false)) { + auto ge_status = ge::ParserFinalize(); + if (ge_status != ge::SUCCESS) { + LOG(ERROR) << "Failed stop tensorflow model parser:" << ge::StatusFactory::Instance()->GetErrDesc(ge_status); + } else { + LOG(INFO) << "Stop tensorflow model parser succeed"; + } + ge_status = ge::GEFinalize(); + if (ge_status != ge::SUCCESS) { + LOG(ERROR) << "Failed stop graph engine:" << ge::StatusFactory::Instance()->GetErrDesc(ge_status); + } else { + LOG(INFO) << "Stop graph engine succeed"; + } + } + pybind11::gil_scoped_acquire acquire; + }); + + m.def("StupidRepeat", [](const char *device_name, int times) { + for (int i = 0; i < times; i++) { LOG(INFO) << device_name; } + }); +}; diff --git a/tf_adapter_2.x/npu_device/kernels/anonymous_iterator.cpp b/tf_adapter_2.x/npu_device/kernels/anonymous_iterator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..26e86ca4baf9b4a081dbb5c44be33128cd061c87 --- /dev/null +++ b/tf_adapter_2.x/npu_device/kernels/anonymous_iterator.cpp @@ -0,0 +1,50 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#include +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/platform/logging.h" + +#include "absl/algorithm/container.h" +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/tfe_context_internal.h" +#include "tensorflow/c/eager/tfe_op_internal.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" + +#include "npu_custom_kernel.h" + +static auto kernel = [](TFE_Context *context, NpuDevice *dev, const char *op_name, const TFE_OpAttrs *attributes, + int num_inputs, TFE_TensorHandle **inputs, int num_outputs, TFE_TensorHandle **outputs, + TF_Status *status) { + for (int i = 0; i < num_outputs; ++i) { + TFE_TensorHandle *retval = outputs[i]; + if (npu::UnwrapHandle(retval)->DataType() == tensorflow::DT_RESOURCE) { + const tensorflow::Tensor *tensor; + NPU_CTX_REQUIRES_OK(status, npu::UnwrapTensor(retval, &tensor)); + std::vector vec_shapes; + TensorPartialShapes shapes; + TensorDataTypes types; + tensorflow::NodeDef ndef; + tensorflow::unwrap(attributes)->FillAttrValueMap(ndef.mutable_attr()); + NPU_CTX_REQUIRES_OK(status, tensorflow::GetNodeAttr(ndef, "output_shapes", &vec_shapes)); + NPU_CTX_REQUIRES_OK(status, tensorflow::GetNodeAttr(ndef, "output_types", &types)); + for (const auto &shape : vec_shapes) { shapes.push_back(shape); } + auto resource = tensor->scalar()(); + DLOG() << "Record mirrored host resource " << resource.DebugString(); + dev->RecordIteratorMirror(resource, shapes, types); + } + } +}; + +NPU_REGISTER_FALLBACK_HOOK("AnonymousIteratorV2", kernel); +NPU_REGISTER_FALLBACK_HOOK("AnonymousIterator", kernel); +NPU_REGISTER_FALLBACK_HOOK("AnonymousMultiDeviceIterator", 
kernel); \ No newline at end of file diff --git a/tf_adapter_2.x/npu_device/kernels/iterator_h2d.cpp b/tf_adapter_2.x/npu_device/kernels/iterator_h2d.cpp new file mode 100644 index 0000000000000000000000000000000000000000..87a5b875c9e477d37ab54a7ac6600cbae4a16fca --- /dev/null +++ b/tf_adapter_2.x/npu_device/kernels/iterator_h2d.cpp @@ -0,0 +1,70 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#include "absl/algorithm/container.h" +#include "absl/memory/memory.h" +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/c_api_internal.h" +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/kernels/data/iterator_ops.h" +#include "tensorflow/core/util/env_var.h" + +#include "npu_hdc.h" + +using namespace tensorflow; + +class IteratorH2D : public OpKernel { + public: + explicit IteratorH2D(OpKernelConstruction *ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("channel_name", &channel_name_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("device_ids", &device_ids_)); + } + + void Compute(OpKernelContext *ctx) override { + if (!initialized_.exchange(true)) { + std::stringstream ss; + for (auto device_id : device_ids_) { ss << device_id << " "; } + channels_.resize(device_ids_.size()); + for (size_t i = 0; i < device_ids_.size(); i++) { + OP_REQUIRES_OK(ctx, HdcChannel::Create(device_ids_[i], channel_name_, &channels_[i])); + } + LOG(INFO) << "Hdc channel for iterator resource " << channel_name_ << " to device [" + << ss.str().substr(0, ss.str().size() - 1) << "] created"; + } + + data::IteratorResource *iterator; + OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator)); + core::ScopedUnref unref_iterator(iterator); + std::vector components; + bool end_of_sequence = false; + + Status status = iterator->GetNext(ctx, &components, &end_of_sequence); + + if (!status.ok()) { + for (auto channel : channels_) { OP_REQUIRES_OK(ctx, channel->NotifyAbnormal()); } + ctx->SetStatus(status); + return; + } else if (end_of_sequence) { + for (auto channel : channels_) { OP_REQUIRES_OK(ctx, channel->NotifyFinish()); } + ctx->SetStatus(errors::OutOfRange("Iterator resource ", channel_name_, " reach end of sequence")); + return; + } + + for (auto channel : channels_) { OP_REQUIRES_OK(ctx, channel->SendTensors(components)); } + } + + private: + std::string channel_name_; + std::vector device_ids_; + std::vector> channels_; + std::atomic_bool initialized_{false}; +}; + +REGISTER_KERNEL_BUILDER(Name("IteratorH2D").Device(DEVICE_CPU).Priority(3), IteratorH2D); diff --git a/tf_adapter_2.x/npu_device/kernels/make_iterator.cpp b/tf_adapter_2.x/npu_device/kernels/make_iterator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8eab3de312a1d69a002d81bdd6ee853b8a276eed --- /dev/null +++ b/tf_adapter_2.x/npu_device/kernels/make_iterator.cpp @@ -0,0 +1,119 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. 
+* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#include +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/platform/logging.h" + +#include "absl/algorithm/container.h" +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/tfe_context_internal.h" +#include "tensorflow/c/eager/tfe_op_internal.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" + +#include "npu_custom_kernel.h" +#include "npu_utils.h" + +class MakeIteratorGraphBuilder { + public: + static tensorflow::GraphDef GetGraph(std::string container_name, std::string shared_name, TensorPartialShapes shapes, + TensorDataTypes types, TF_Status *status) { + tensorflow::GraphDef gdef; + + tensorflow::Graph graph(tensorflow::OpRegistry::Global()); + tensorflow::Node *device_queue; + tensorflow::Node *make_iterator; + tensorflow::Node *iterator_v2; + NPU_CTX_REQUIRES_OK_RETURN(status, + tensorflow::NodeBuilder("DeviceQueue_" + shared_name, "DeviceQueueDataset") + .Attr("channel_name", shared_name) + .Attr("output_types", types) + .Attr("output_shapes", shapes) + .Attr("_iterator_name", shared_name) + .Finalize(&graph, &device_queue), + gdef); + NPU_CTX_REQUIRES_OK_RETURN(status, + tensorflow::NodeBuilder(WrapResourceName(shared_name), "IteratorV2") + .Attr("container", container_name) + .Attr("shared_name", shared_name) + .Attr("output_types", types) + .Attr("output_shapes", shapes) + .Finalize(&graph, &iterator_v2), + gdef); + NPU_CTX_REQUIRES_OK_RETURN(status, + tensorflow::NodeBuilder("InitMakeIterator_" + shared_name, "MakeIterator") + .Attr("_kernel", "dp") + .Attr("_iterator_name", shared_name) + .Input(device_queue, 0) + .Input(iterator_v2, 0) + .Finalize(&graph, &make_iterator), + gdef); + + // TODO:Tensorflow model parser bug,如果名字不是dpop开头的,则会被remove掉 + std::string func_name = "dpop_init_func_" + shared_name; + tensorflow::FunctionDefLibrary fdef_lib; + tensorflow::FunctionDef *fdef = fdef_lib.add_function(); + tensorflow::GraphToFunctionDef(graph, func_name, fdef); + + tensorflow::Graph dpop_graph(tensorflow::OpRegistry::Global()); + + tensorflow::AttrValue function_attr; + function_attr.mutable_func()->set_name(func_name); + + tensorflow::Node *dpop_node; + NPU_CTX_REQUIRES_OK_RETURN(status, + tensorflow::NodeBuilder(func_name, "DPOP") + .Input(std::vector{}) + .Attr("Tin", tensorflow::DataTypeVector{}) + .Attr("Tout", tensorflow::DataTypeVector{}) + .Attr("function", function_attr) + .Finalize(&dpop_graph, &dpop_node), + gdef); + AssembleOpDef(dpop_node); + dpop_node->AddAttr("func_def", fdef_lib.SerializeAsString()); + tensorflow::FixupSourceAndSinkEdges(&dpop_graph); + dpop_graph.ToGraphDef(&gdef); + return gdef; + } +}; + +static auto kernel = [](TFE_Context *context, NpuDevice *dev, const char *op_name, const TFE_OpAttrs *attributes, + int num_inputs, TFE_TensorHandle **inputs, int num_outputs, TFE_TensorHandle **outputs, + TF_Status *status) { + for (int j = 0; j < num_inputs; ++j) { + TFE_TensorHandle *input = inputs[j]; + if (npu::UnwrapHandle(input)->DataType() == tensorflow::DT_RESOURCE) { + const tensorflow::Tensor *tensor; + NPU_CTX_REQUIRES_OK(status, npu::UnwrapTensor(input, &tensor)); + auto handle = tensor->scalar()(); + TensorPartialShapes shapes; + TensorDataTypes types; + 
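+      // Shapes/types for this resource were recorded by the AnonymousIterator* fallback
+      // hooks when it was created on the host; they are looked up here so the generated
+      // init graph, roughly
+      //   DeviceQueue_<name> (DeviceQueueDataset) --> MakeIterator <-- IteratorV2(shared_name=<name>)
+      // wrapped into a "dpop_init_func_<name>" DPOP node, is built with matching signatures.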
NPU_CTX_REQUIRES_OK(status, dev->GetMirroredIteratorShapesAndTypes(handle, shapes, types)); + auto dp_init_graph = MakeIteratorGraphBuilder::GetGraph(handle.container(), handle.name(), shapes, types, status); + if (TF_GetCode(status) != TF_OK) return; + if (kDumpExecutionDetail && kDumpGraph) { + std::string file_name = "dp_init_" + handle.name() + ".pbtxt"; + LOG(INFO) << "NPU Dump mirrored resource init graph to: " << file_name; + WriteTextProto(tensorflow::Env::Default(), file_name, dp_init_graph); + } + dev->RunGeGraphPin2CpuAnonymous(context, "dp_init_" + handle.name(), dp_init_graph, num_inputs, inputs, 0, + nullptr, status); + if (TF_GetCode(status) != TF_OK) return; + // TODO:针对推荐网络,Provider需要支持1对N的传输,默认只向资源所处的Device发送 + dev->CreateIteratorProvider(context, tensor, {dev->device_id}, status); + if (TF_GetCode(status) != TF_OK) return; + } + } +}; + +NPU_REGISTER_FALLBACK_HOOK("MakeIterator", kernel); +NPU_REGISTER_FALLBACK_HOOK("MultiDeviceIteratorInit", kernel); \ No newline at end of file diff --git a/tf_adapter_2.x/npu_device/kernels/read_variable_op.cpp b/tf_adapter_2.x/npu_device/kernels/read_variable_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6329f8a311e5e9b2290061e2825985b56c52cbb2 --- /dev/null +++ b/tf_adapter_2.x/npu_device/kernels/read_variable_op.cpp @@ -0,0 +1,104 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#include +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/platform/logging.h" + +#include "absl/algorithm/container.h" +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/tfe_context_internal.h" +#include "tensorflow/c/eager/tfe_op_internal.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" + +#include "npu_custom_kernel.h" +#include "npu_utils.h" + +class ReadVariableGraphBuilder { + public: + static tensorflow::GraphDef GetGraph(const tensorflow::ResourceHandle resource, TF_Status *status) { + const std::string &container_name = resource.container(); + const std::string &shared_name = resource.name(); + + TensorDataTypes handle_dtyes; + TensorPartialShapes handle_shapes; + const auto &dtypes_and_shapes = resource.dtypes_and_shapes(); + + for (auto &dtype_and_shape : dtypes_and_shapes) { + handle_dtyes.push_back(dtype_and_shape.dtype); + handle_shapes.push_back(dtype_and_shape.shape); + } + + tensorflow::GraphDef gdef; + + tensorflow::Graph graph(tensorflow::OpRegistry::Global()); + tensorflow::Node *variable; + tensorflow::Node *read_variable; + tensorflow::Node *retval; + + NPU_CTX_REQUIRES_OK_RETURN(status, + tensorflow::NodeBuilder(WrapResourceName(shared_name), "VarHandleOp") + .Attr("container", container_name) + .Attr("shared_name", shared_name) + .Attr("dtype", handle_dtyes.front()) + .Attr("shape", handle_shapes.front()) + .Finalize(&graph, &variable), + gdef); + + NPU_CTX_REQUIRES_OK_RETURN(status, + tensorflow::NodeBuilder("Read_" + shared_name, "ReadVariableOp") + .Input(variable, 0) + .Attr("dtype", handle_dtyes.front()) + .Finalize(&graph, &read_variable), + gdef); + + NPU_CTX_REQUIRES_OK_RETURN(status, + tensorflow::NodeBuilder("Read_" + shared_name + "_Retval", "_Retval") + .Input(read_variable, 0) + 
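+                               // index 0 marks the read result as the single graph output that
+                               // RunGeGraphPin2CpuAnonymous maps back to outputs[0].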
.Attr("index", 0) + .Finalize(&graph, &retval), + gdef); + + AssembleOpDef(variable); + AssembleOpDef(read_variable); + + AssembleOutputDesc(TensorShapes({kScalarShape}), {tensorflow::DT_RESOURCE}, variable); + AssembleInputDesc(TensorShapes({kScalarShape}), {tensorflow::DT_RESOURCE}, read_variable); + AssembleOutputDesc(handle_shapes, handle_dtyes, read_variable); + + graph.ToGraphDef(&gdef); + return gdef; + } +}; + +static auto kernel = [](TFE_Context *context, NpuDevice *dev, const npu::OpSpec *spec, + const TensorShapes &output_shapes, const tensorflow::NodeDef &parser_ndef, int num_inputs, + TFE_TensorHandle **inputs, int num_outputs, TFE_TensorHandle **outputs, TF_Status *status) { + const tensorflow::Tensor *handle = nullptr; + NPU_CTX_REQUIRES_OK(status, npu::UnwrapTensor(inputs[0], &handle)); + auto resource = handle->scalar()(); + NPU_CTX_REQUIRES(status, resource.dtypes_and_shapes().size() == 1, + tensorflow::errors::Internal(resource.DebugString(), " type and shape size invalid ", + resource.dtypes_and_shapes().size(), " expect 1")); + auto var_read_graph = ReadVariableGraphBuilder::GetGraph(resource, status); + if (TF_GetCode(status) != TF_OK) { return; } + std::string graph_name = "ReadVariableOp_" + resource.name(); + if (kDumpExecutionDetail && kDumpGraph) { + std::string file_name = graph_name + ".pbtxt"; + WriteTextProto(tensorflow::Env::Default(), file_name, var_read_graph); + LOG(INFO) << "NPU Dump variable resource init graph to: " << file_name; + } + + dev->RunGeGraphPin2CpuAnonymous(context, graph_name, var_read_graph, 0, nullptr, num_outputs, outputs, status); +}; + +NPU_REGISTER_CUSTOM_KERNEL("ReadVariableOp", kernel); diff --git a/tf_adapter_2.x/npu_device/kernels/resource_variable_op.cpp b/tf_adapter_2.x/npu_device/kernels/resource_variable_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..66d0afe1c507e7069b9d81ddb701e060fff0293e --- /dev/null +++ b/tf_adapter_2.x/npu_device/kernels/resource_variable_op.cpp @@ -0,0 +1,138 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. 
+* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#include +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/platform/logging.h" + +#include "absl/algorithm/container.h" +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/tfe_context_internal.h" +#include "tensorflow/c/eager/tfe_op_internal.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" + +#include "npu_custom_kernel.h" +#include "npu_utils.h" + +class AssignVariableGraphBuilder { + public: + static tensorflow::GraphDef GetGraph(const std::string &op_name, const std::string &container_name, + const std::string &shared_name, const tensorflow::Tensor &tensor, + TF_Status *status) { + tensorflow::GraphDef gdef; + + tensorflow::Graph graph(tensorflow::OpRegistry::Global()); + tensorflow::Node *variable; + tensorflow::Node *value; + tensorflow::Node *assign_variable; + + NPU_CTX_REQUIRES_OK_RETURN(status, + tensorflow::NodeBuilder(WrapResourceName(shared_name), "VarHandleOp") + .Attr("container", container_name) + .Attr("shared_name", shared_name) + .Attr("dtype", tensor.dtype()) + .Attr("shape", tensor.shape()) + .Finalize(&graph, &variable), + gdef); + NPU_CTX_REQUIRES_OK_RETURN(status, + tensorflow::NodeBuilder(op_name + "_Value_" + shared_name, "Const") + .Attr("value", tensor) + .Attr("dtype", tensor.dtype()) + .Finalize(&graph, &value), + gdef); + NPU_CTX_REQUIRES_OK_RETURN(status, + tensorflow::NodeBuilder(op_name + "_" + shared_name, op_name) + .Input(variable, 0) + .Input(value, 0) + .Attr("dtype", tensor.dtype()) + .Finalize(&graph, &assign_variable), + gdef); + + AssembleOpDef(variable); + AssembleOpDef(value); + AssembleOpDef(assign_variable); + + AssembleOutputDesc(TensorShapes({kScalarShape}), {tensorflow::DT_RESOURCE}, variable); + AssembleOutputDesc(TensorShapes({tensor.shape()}), {tensor.dtype()}, value); + AssembleInputDesc(TensorShapes({kScalarShape, tensor.shape()}), {tensorflow::DT_RESOURCE, tensor.dtype()}, + assign_variable); + + graph.ToGraphDef(&gdef); + return gdef; + } +}; + +namespace { + +void VariableOpBaseKernel(const std::string &op_name, TFE_Context *context, NpuDevice *dev, const npu::OpSpec *spec, + const TensorShapes &output_shapes, const tensorflow::NodeDef &parser_ndef, int num_inputs, + TFE_TensorHandle **inputs, int num_outputs, TFE_TensorHandle **outputs, TF_Status *status) { + const tensorflow::Tensor *handle = nullptr; + const tensorflow::Tensor *value = nullptr; + + std::vector copied_tensor_handles; + TFE_TensorHandle *value_handle = inputs[1]; + if (IsNpuTensorHandle(npu::UnwrapHandle(inputs[1]))) { + value_handle = dev->CopyTensorD2H(context, inputs[1], status); + if (TF_GetCode(status) != TF_OK) return; + copied_tensor_handles.emplace_back(value_handle); + } + + NPU_CTX_REQUIRES_OK(status, npu::UnwrapTensor(inputs[0], &handle)); + auto resource = handle->scalar()(); + NPU_CTX_REQUIRES_OK(status, npu::UnwrapTensor(value_handle, &value)); + DLOG() << "Start run " << op_name << " for resource " << resource.DebugString() << " with value " + << value->DebugString(); + auto var_init_graph = + AssignVariableGraphBuilder::GetGraph(op_name, resource.container(), resource.name(), *value, status); + if (TF_GetCode(status) != TF_OK) { return; } + std::string graph_name = 
op_name + "_" + resource.name(); + if (kDumpExecutionDetail && kDumpGraph) { + std::string file_name = graph_name + ".pbtxt"; + WriteTextProto(tensorflow::Env::Default(), file_name, var_init_graph); + LOG(INFO) << "NPU Dump variable resource init graph to: " << file_name; + } + + for (auto copied_tensor_handle : copied_tensor_handles) { TFE_DeleteTensorHandle(copied_tensor_handle); } + dev->RunGeGraphPin2CpuAnonymous(context, graph_name, var_init_graph, num_inputs, inputs, num_outputs, outputs, + status); +} + +} // namespace + +static auto kernel_assign = [](TFE_Context *context, NpuDevice *dev, const npu::OpSpec *spec, + const TensorShapes &output_shapes, const tensorflow::NodeDef &parser_ndef, + int num_inputs, TFE_TensorHandle **inputs, int num_outputs, TFE_TensorHandle **outputs, + TF_Status *status) { + VariableOpBaseKernel("AssignVariableOp", context, dev, spec, output_shapes, parser_ndef, num_inputs, inputs, + num_outputs, outputs, status); +}; + +static auto kernel_assign_add = [](TFE_Context *context, NpuDevice *dev, const npu::OpSpec *spec, + const TensorShapes &output_shapes, const tensorflow::NodeDef &parser_ndef, + int num_inputs, TFE_TensorHandle **inputs, int num_outputs, + TFE_TensorHandle **outputs, TF_Status *status) { + VariableOpBaseKernel("AssignAddVariableOp", context, dev, spec, output_shapes, parser_ndef, num_inputs, inputs, + num_outputs, outputs, status); +}; + +static auto kernel_assign_sub = [](TFE_Context *context, NpuDevice *dev, const npu::OpSpec *spec, + const TensorShapes &output_shapes, const tensorflow::NodeDef &parser_ndef, + int num_inputs, TFE_TensorHandle **inputs, int num_outputs, + TFE_TensorHandle **outputs, TF_Status *status) { + VariableOpBaseKernel("AssignSubVariableOp", context, dev, spec, output_shapes, parser_ndef, num_inputs, inputs, + num_outputs, outputs, status); +}; + +NPU_REGISTER_CUSTOM_KERNEL("AssignVariableOp", kernel_assign); +NPU_REGISTER_CUSTOM_KERNEL("AssignAddVariableOp", kernel_assign_add); +NPU_REGISTER_CUSTOM_KERNEL("AssignSubVariableOp", kernel_assign_sub); diff --git a/tf_adapter_2.x/npu_device/kernels/send_h2d.cpp b/tf_adapter_2.x/npu_device/kernels/send_h2d.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d960505d4665ced46735f7f3799a0bce346a4de0 --- /dev/null +++ b/tf_adapter_2.x/npu_device/kernels/send_h2d.cpp @@ -0,0 +1,54 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. 
+* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#include "absl/algorithm/container.h" +#include "absl/memory/memory.h" +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/c_api_internal.h" +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/util/env_var.h" + +#include "npu_hdc.h" + +using namespace tensorflow; + +class SendH2D : public OpKernel { + public: + explicit SendH2D(OpKernelConstruction *ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("channel_name", &channel_name_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("device_ids", &device_ids_)); + } + + void Compute(OpKernelContext *ctx) override { + if (!initialized_.exchange(true)) { + std::stringstream ss; + for (auto device_id : device_ids_) { ss << device_id << " "; } + channels_.resize(device_ids_.size()); + for (size_t i = 0; i < device_ids_.size(); i++) { + OP_REQUIRES_OK(ctx, HdcChannel::Create(device_ids_[i], channel_name_, &channels_[i])); + } + LOG(INFO) << "Hdc channel for iterator resource " << channel_name_ << " to device [" + << ss.str().substr(0, ss.str().size() - 1) << "] created"; + } + OpInputList inputs; + OP_REQUIRES_OK(ctx, ctx->input_list("inputs", &inputs)); + std::vector tensors; + for (int64 i = 0; i < inputs.size(); i++) { tensors.push_back(inputs[i]); } + for (auto channel : channels_) { OP_REQUIRES_OK(ctx, channel->SendTensors(tensors)); } + } + + private: + std::string channel_name_; + std::vector device_ids_; + std::vector> channels_; + std::atomic_bool initialized_{false}; +}; + +REGISTER_KERNEL_BUILDER(Name("SendH2D").Device(DEVICE_CPU).Priority(3), SendH2D); diff --git a/tf_adapter_2.x/npu_device/kernels/var_is_initialized_op.cpp b/tf_adapter_2.x/npu_device/kernels/var_is_initialized_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f1f290f12c133bcd1e6843af45750c5f6d06fe43 --- /dev/null +++ b/tf_adapter_2.x/npu_device/kernels/var_is_initialized_op.cpp @@ -0,0 +1,35 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. 
+* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#include +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/platform/logging.h" + +#include "absl/algorithm/container.h" +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/tfe_context_internal.h" +#include "tensorflow/c/eager/tfe_op_internal.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" + +#include "npu_custom_kernel.h" +#include "npu_utils.h" + +static auto kernel = [](TFE_Context *context, NpuDevice *dev, const npu::OpSpec *spec, + const TensorShapes &output_shapes, const tensorflow::NodeDef &parser_ndef, int num_inputs, + TFE_TensorHandle **inputs, int num_outputs, TFE_TensorHandle **outputs, TF_Status *status) { + // TODO:这里需要先判断下是否已经初始化 + tensorflow::Tensor tensor(tensorflow::DT_BOOL, {}); + tensor.scalar()() = true; + outputs[0] = tensorflow::wrap(tensorflow::TensorHandle::CreateLocalHandle(tensor)); +}; + +NPU_REGISTER_CUSTOM_KERNEL("VarIsInitializedOp", kernel); diff --git a/tf_adapter_2.x/npu_device/ops/custom_op.cpp b/tf_adapter_2.x/npu_device/ops/custom_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..30529ba99470a5abd43fe3187894d81181a9c3a6 --- /dev/null +++ b/tf_adapter_2.x/npu_device/ops/custom_op.cpp @@ -0,0 +1,63 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#include "absl/algorithm/container.h" +#include "absl/memory/memory.h" +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/c_api_internal.h" +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/util/env_var.h" + +using namespace tensorflow; + +class FakeOp : public AsyncOpKernel { + public: + explicit FakeOp(OpKernelConstruction *context) : AsyncOpKernel(context) {} + + void ComputeAsync(OpKernelContext *context, DoneCallback done) override { + OP_REQUIRES_OK_ASYNC( + context, errors::Internal(context->op_kernel().name(), " registered as fake op and should never run on cpu"), + done); + } +}; + +REGISTER_OP("DPOP") + .Input("inputs: Tin") + .Output("outputs: Tout") + .Attr("Tin: list(type) >= 0") + .Attr("Tout: list(type) >= 0") + .Attr("function: func") + .Attr("data_format: { 'NHWC', 'NCHW'} = 'NHWC'") + .SetIsStateful(); + +REGISTER_OP("DeviceQueueDataset") + .Output("handle: variant") + .Attr("channel_name: string") + .Attr("output_types: list(type) >= 1") + .Attr("output_shapes: list(shape) >= 1") + .SetIsStateful() + .SetShapeFn(tensorflow::shape_inference::ScalarShape); + +REGISTER_OP("SendH2D") + .Input("inputs: Tin") + .Attr("channel_name: string") + .Attr("device_ids: list(int)") + .Attr("Tin: list(type) = [DT_FLOAT, DT_HALF, DT_INT8, DT_INT32, DT_UINT8, DT_INT16, DT_UINT16, DT_UINT32, " + "DT_INT64, DT_UINT64, DT_DOUBLE, DT_BOOL, DT_STRING]") + .SetIsStateful(); + +REGISTER_OP("IteratorH2D") + .Input("input: resource") + .Attr("channel_name: string") + .Attr("device_ids: list(int)") + 
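+    // Stateful: the CPU kernel lazily opens HDC channels and pushes tensors to devices as a
+    // side effect, so the op must not be de-duplicated or constant-folded.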
.SetIsStateful(); + +REGISTER_KERNEL_BUILDER(Name("DPOP").Device(DEVICE_CPU).Priority(3), FakeOp); +REGISTER_KERNEL_BUILDER(Name("DeviceQueueDataset").Device(DEVICE_CPU).Priority(3), FakeOp); diff --git a/tf_adapter_2.x/npu_device/python/MANIFEST.in b/tf_adapter_2.x/npu_device/python/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..b6beccfcab7d89ccff6d6201017f412909323c51 --- /dev/null +++ b/tf_adapter_2.x/npu_device/python/MANIFEST.in @@ -0,0 +1,2 @@ +recursive-include * *.py +recursive-include * *.so \ No newline at end of file diff --git a/tf_adapter_2.x/npu_device/python/npu_device/__init__.py b/tf_adapter_2.x/npu_device/python/npu_device/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d4f4b7201094fd85e40e8a10f893786d67d80ab6 --- /dev/null +++ b/tf_adapter_2.x/npu_device/python/npu_device/__init__.py @@ -0,0 +1 @@ +from npu_device.npu_device_register import * \ No newline at end of file diff --git a/tf_adapter_2.x/npu_device/python/npu_device/npu_device_register.py b/tf_adapter_2.x/npu_device/python/npu_device/npu_device_register.py new file mode 100644 index 0000000000000000000000000000000000000000..8e27dc2c145404faa57f9fb2cc4297bf8919d5c7 --- /dev/null +++ b/tf_adapter_2.x/npu_device/python/npu_device/npu_device_register.py @@ -0,0 +1,77 @@ +# Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +# Description: Common depends and micro defines for and only for data preprocess module + +import tensorflow as tf +from tensorflow.python.eager import context +from sys import version_info as _swig_python_version_info + +if _swig_python_version_info < (2, 7, 0): + raise RuntimeError("Python 2.7 or later required") + +NPU = "/job:localhost/replica:0/task:0/device:NPU" + +# Import the low-level C/C++ module +if __package__ or "." in __name__: + from . 
import _npu_device_backends +else: + import _npu_device_backends + + +def stupid_repeat(word, times): + return _npu_device_backends.StupidRepeat(word, times) + + +def open(ctx=None, device_index=0, global_options={}, session_options={}): + if ctx is None: + ctx = context.context() + ctx.ensure_initialized() + error_message = _npu_device_backends.Open(ctx._handle, NPU, device_index, global_options, session_options) + if len(error_message): + raise RuntimeError("Failed open npu device " + str(device_index) + ":" + error_message) + return NpuDeviceHandle(ctx, device_index) + + +def close(): + _npu_device_backends.Close() + + +import atexit + +atexit.register(close) +from tensorflow.python.util import tf_contextlib + + +class NpuDeviceHandle(object): + def __init__(self, ctx, device_index): + self._ctx = ctx + self._device_name = NPU + ":" + str(device_index) + + def name(self): + return self._device_name + + def scope(self): + @tf_contextlib.contextmanager + def _scope(): + with self._ctx.device(self._device_name): + yield + + return _scope() + + def as_default(self): + from tensorflow.python.framework import device as pydev + from tensorflow.python.framework import ops + + @tf_contextlib.contextmanager + def combined(): + try: + with context.device(self._device_name): + yield + except ImportError: # ImportError: sys.meta_path is None, Python is likely shutting down + yield + + def _f(*args, **kwargs): + return combined() + + ops.device = _f + self._ctx._set_device(self._device_name, pydev.DeviceSpec.from_string(self._device_name)) + return self diff --git a/tf_adapter_2.x/npu_device/python/setup.py b/tf_adapter_2.x/npu_device/python/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..4fb8a9c4b3e03c1999f5668ef1e8c077ca4fd564 --- /dev/null +++ b/tf_adapter_2.x/npu_device/python/setup.py @@ -0,0 +1,11 @@ +from setuptools import setup, Extension +from setuptools import find_packages + +setup(name='npu_device', + version='0.1', + description='This is a demo package', + long_description='This is a demo package', + packages=find_packages(), + include_package_data=True, + ext_modules=[], + zip_safe=False)
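
For reference, a minimal usage sketch of the Python wrapper packaged above, assuming the generated npu_device wheel is installed and an Ascend runtime is reachable; the option dictionaries are left at their defaults rather than guessing concrete GE keys, and the computation is illustrative only:

```python
import tensorflow as tf
import npu_device  # re-exports open/close from npu_device_register

# Initializes GE once and registers /job:localhost/replica:0/task:0/device:NPU:0
# with the current eager context; raises RuntimeError if the backend fails to open.
npu = npu_device.open(device_index=0)

with npu.scope():            # alternatively: npu.as_default()
    x = tf.ones([2, 2])
    y = tf.matmul(x, x)      # eager ops inside the scope are placed on the NPU device
    print(y)

# npu_device.close() is registered via atexit, so explicit shutdown is optional.
```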