diff --git a/tf_adapter_2.x/CI_build b/tf_adapter_2.x/CI_build new file mode 100644 index 0000000000000000000000000000000000000000..9e1d1c2c12443a5ea607dac25825d012f655c2de --- /dev/null +++ b/tf_adapter_2.x/CI_build @@ -0,0 +1,12 @@ +#!/bin/bash + +set -e +set -o pipefail + +cd $(cd "$(dirname $0)"; pwd) + +rm -rf build +mkdir build +cd build +cmake .. -DPYTHON_BIN_PATH=$(which python3) -DPYTHON_INCLUDE_DIR=$(python3 -c "from distutils.sysconfig import get_python_inc; print(get_python_inc())") -DASCEND_CI_BUILD_DIR=$(cd $(pwd)/../../../; pwd) +make -j8 diff --git a/tf_adapter_2.x/CMakeLists.txt b/tf_adapter_2.x/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..ab3aa1e3153c0c4f7ddd30650c6856b26ea65b92 --- /dev/null +++ b/tf_adapter_2.x/CMakeLists.txt @@ -0,0 +1,65 @@ +cmake_minimum_required(VERSION 3.14) +project(NpuDevice) + +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_C_FLAGS "-O2 -DNDEBUG -Wno-deprecated-declarations -Wall -fPIC -fstack-protector-all -Wl,-z,relro,-z,now,-z,noexecstack -s -pipe ${CMAKE_C_FLAGS}") +set(CMAKE_CXX_FLAGS "-O2 -DNDEBUG -Wno-deprecated-declarations -Wall -fPIC -fstack-protector-all -Wl,-z,relro,-z,now,-z,noexecstack -s -pipe ${CMAKE_CXX_FLAGS}") + + +if (DEFINED ASCEND_CI_BUILD_DIR) + set(CMAKE_C_FLAGS "-D_GLIBCXX_USE_CXX11_ABI=0 ${CMAKE_C_FLAGS}") + set(CMAKE_CXX_FLAGS "-D_GLIBCXX_USE_CXX11_ABI=0 ${CMAKE_CXX_FLAGS}") + include_directories(${PYTHON_INCLUDE_DIR}) +else() + if (NOT EXISTS ${CMAKE_CURRENT_LIST_DIR}/tools/COMPILE_FLAGS OR NOT EXISTS + ${CMAKE_CURRENT_LIST_DIR}/tools/TF_INSTALLED_PATH OR NOT EXISTS + ${CMAKE_CURRENT_LIST_DIR}/tools/ASCEND_INSTALLED_PATH OR NOT EXISTS + ${CMAKE_CURRENT_LIST_DIR}/tools/PYTHON_BIN_PATH) + message(FATAL_ERROR "No validate configuration found. Did you forget to configure first?") + endif () + + file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/tools/TF_INSTALLED_PATH" TF_INSTALLED_PATH) + file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/tools/ASCEND_INSTALLED_PATH" ASCEND_INSTALLED_PATH) + file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/tools/PYTHON_BIN_PATH" PYTHON_BIN_PATH) + + file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/tools/COMPILE_FLAGS" CUSTOM_COMPILE_FLAGS) + foreach (COMPILE_FLAG ${CUSTOM_COMPILE_FLAGS}) + set(CMAKE_C_FLAGS "${COMPILE_FLAG} ${CMAKE_C_FLAGS}") + set(CMAKE_CXX_FLAGS "${COMPILE_FLAG} ${CMAKE_CXX_FLAGS}") + endforeach (COMPILE_FLAG) +endif () + +include(${CMAKE_CURRENT_LIST_DIR}/cmake/acl/module.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/cmake/tensorflow/module.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/cmake/graph_engine/module.cmake) + +include_directories(${CMAKE_CURRENT_LIST_DIR}/npu_device/core) + +file(COPY ${CMAKE_CURRENT_LIST_DIR}/npu_device/python DESTINATION ${CMAKE_BINARY_DIR}/dist) + +file(GLOB_RECURSE SOURCES ${CMAKE_CURRENT_LIST_DIR}/npu_device/*.cpp) + +IF (${CMAKE_CXX_COMPILER_ID} STREQUAL GNU) + add_definitions(-Wno-builtin-macro-redefined) +ENDIF () + +foreach (CPP_SOURCE ${SOURCES}) + file(RELATIVE_PATH RELATIVE_CPP_SOURCE ${CMAKE_CURRENT_LIST_DIR} ${CPP_SOURCE}) + set_property(SOURCE ${CPP_SOURCE} PROPERTY COMPILE_DEFINITIONS __FILE__=\"${RELATIVE_CPP_SOURCE}\") +endforeach (CPP_SOURCE) + +add_library(_npu_device_backends SHARED ${SOURCES}) +set_target_properties(_npu_device_backends PROPERTIES PREFIX "") +set(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR}/dist/python/npu_device) + +target_link_libraries(_npu_device_backends PRIVATE + tensorflow_libs + ge_libs + acl_libs) + +add_custom_command(TARGET _npu_device_backends + POST_BUILD + COMMAND cd ${CMAKE_BINARY_DIR}/dist/python/ && 
${PYTHON_BIN_PATH} setup.py bdist_wheel
+        VERBATIM)
+
+install(CODE "execute_process(COMMAND ${PYTHON_BIN_PATH} -m pip install ${CMAKE_BINARY_DIR}/dist/python/dist/npu_device-0.1-py3-none-any.whl --upgrade)")
diff --git a/tf_adapter_2.x/README.md b/tf_adapter_2.x/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b988f7feca8a8556102dc25ba76e1f77b86be173
--- /dev/null
+++ b/tf_adapter_2.x/README.md
@@ -0,0 +1,99 @@
+# Ascend Adapter for TF2.X
+## Installation
+
+### Installing from source
+
+You can build the Ascend Adapter package from source and install it on an Ascend AI processor environment.
+> The Ascend Adapter is strictly matched to the Tensorflow version. Before building from source, make sure [Tensorflow v2.4.0](https://www.tensorflow.org/install) is correctly installed.
+
+
+The system must also meet the following requirements:
+- Linux OS
+- GCC >= 7.3.0
+- CMake >= 3.14.0
+
+#### Downloading the source
+
+```
+git clone ssh://git@10.95.128.221:2222/x00373192/AscendTF2.git
+cd AscendTF2
+```
+
+#### Configuring the build environment
+```BASH
+./configure
+```
+By default, running the command above starts an interactive session like the following
+> Your session may look different.
+
+```BASH
+Please specify the location of python with available tensorflow v2.4.0 installed. [Default is /usr/bin/python3]
+(You can make this quiet by set env [ADAPTER_TARGET_PYTHON_PATH]):
+```
+Here you are asked for the path of a python interpreter with Tensorflow v2.4.0 installed. If the default path is correct, just press Enter; otherwise enter the correct python
+interpreter path.
+> You can suppress this prompt by setting the ADAPTER_TARGET_PYTHON_PATH environment variable, but the path must be valid; otherwise you will still be asked for a correct python interpreter path.
+
+After you press Enter, validating your input takes a few seconds, and then the next prompt appears
+```
+Please specify the location of ascend. [Default is /usr/local/Ascend]
+(You can make this quiet by set env [ASCEND_INSTALLED_PATH]):
+```
+Here you are asked for the installation path of the Ascend processor development kit. If the default path is correct, just press Enter; otherwise enter the correct installation path.
+
+> You can suppress this prompt by setting the ASCEND_INSTALLED_PATH environment variable, but the path must be valid; otherwise you will still be asked for a correct installation path.
+
+After you press Enter, wait for the configuration to finish.
+#### Configuring cmake
+> Depending on your network, downloading the Ascend Adapter's dependencies to complete the configuration may take several minutes.
+
+```
+mkdir build
+cd build
+cmake ..
+```
+
+#### Building
+> You should choose a parallel build level suited to your build machine to speed up compilation.
+
+```BASH
+make -j8
+```
+
+After the build finishes, the package is generated at
+```
+./dist/python/dist/npu_device-0.1-py3-none-any.whl
+```
+
+#### Installing
+You can then run
+```BASH
+make install
+```
+to install the Ascend Adapter into the package directory of the python interpreter chosen during configuration, or use pip3 to install the Ascend Adapter wherever you prefer.
+```
+pip3 install ./dist/python/dist/npu_device-0.1-py3-none-any.whl --upgrade
+```
+Note that the installation location must match the search path of the python interpreter you built against.
+
+#### Basic functionality test
+Before running any script, add the library directory of the Ascend processor development kit to the library search path. For example, if it is installed under /usr/local/Ascend, run
+```
+export LD_LIBRARY_PATH=/usr/local/Ascend/fwkacllib/lib64/
+```
+Then run the example script to check your installation
+```
+python3 examples/basic_tests.py
+```
+A minimal usage sketch is also shown after the release notes below.
+
+## Contributing
+
+Before pushing code, make sure the basic functionality tests and network tests have passed!
+
+## Release Notes
+
+See [RELEASE](RELEASE.md) for the release notes.
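+
+## Minimal usage example
+
+Beyond `examples/basic_tests.py`, a short script along the following lines can serve as a quick smoke test. This is only a sketch based on the `npu_device.open().as_default()` API used in the bundled examples; adapt it to your environment as needed.
+
+```python
+import tensorflow as tf
+import npu_device
+
+# Open the NPU device and make it the default target for eager and tf.function execution.
+npu = npu_device.open().as_default()
+
+@tf.function
+def add(a, b):
+    return a + b
+
+# Runs on the NPU device opened above; the expected result is a tensor holding 3.
+print(add(tf.constant(1), tf.constant(2)))
+```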
+ +## License + +[Apache License 2.0](LICENSE) diff --git a/tf_adapter_2.x/RELEASE.md b/tf_adapter_2.x/RELEASE.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tf_adapter_2.x/cmake/acl/module.cmake b/tf_adapter_2.x/cmake/acl/module.cmake new file mode 100644 index 0000000000000000000000000000000000000000..8d81abded26ec1b358e4ff72379af045d06a2fe5 --- /dev/null +++ b/tf_adapter_2.x/cmake/acl/module.cmake @@ -0,0 +1,22 @@ +add_library(acl_libs INTERFACE) + +if(DEFINED ASCEND_INSTALLED_PATH) + include_directories(${ASCEND_INSTALLED_PATH}/fwkacllib/include) + target_link_libraries(acl_libs INTERFACE + ${ASCEND_INSTALLED_PATH}/fwkacllib/lib64/libascendcl.so + ${ASCEND_INSTALLED_PATH}/fwkacllib/lib64/libacl_op_compiler.so) +else() + include_directories(${ASCEND_CI_BUILD_DIR}/inc/external) + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/_fake.cc + COMMAND touch ${CMAKE_CURRENT_BINARY_DIR}/_fake.cc + ) + + set(fake_sources ${CMAKE_CURRENT_BINARY_DIR}/_fake.cc) + + add_library(ascendcl SHARED ${fake_sources}) + add_library(acl_op_compiler SHARED ${fake_sources}) + target_link_libraries(acl_libs INTERFACE + ascendcl + acl_op_compiler) +endif() \ No newline at end of file diff --git a/tf_adapter_2.x/cmake/graph_engine/module.cmake b/tf_adapter_2.x/cmake/graph_engine/module.cmake new file mode 100644 index 0000000000000000000000000000000000000000..0985d7110ea3aefbff78e6d6e20c27e93650d44a --- /dev/null +++ b/tf_adapter_2.x/cmake/graph_engine/module.cmake @@ -0,0 +1,28 @@ +add_library(ge_libs INTERFACE) + +if(DEFINED ASCEND_INSTALLED_PATH) + include_directories(${CMAKE_CURRENT_LIST_DIR}/../../../inc/graphengine/inc) + include_directories(${CMAKE_CURRENT_LIST_DIR}/../../../inc/graphengine/inc/external) + include_directories(${CMAKE_CURRENT_LIST_DIR}/../../../inc/metadef/inc) + include_directories(${CMAKE_CURRENT_LIST_DIR}/../../../inc/metadef/inc/external) + target_link_libraries(ge_libs INTERFACE + ${ASCEND_INSTALLED_PATH}/fwkacllib/lib64/libge_runner.so + ${ASCEND_INSTALLED_PATH}/fwkacllib/lib64/libfmk_parser.so) +else() + include_directories(${ASCEND_CI_BUILD_DIR}/graphengine/inc) + include_directories(${ASCEND_CI_BUILD_DIR}/graphengine/inc/external) + include_directories(${ASCEND_CI_BUILD_DIR}/metadef/inc) + include_directories(${ASCEND_CI_BUILD_DIR}/metadef/inc/external) + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/_fake.cc + COMMAND touch ${CMAKE_CURRENT_BINARY_DIR}/_fake.cc + ) + + set(fake_sources ${CMAKE_CURRENT_BINARY_DIR}/_fake.cc) + + add_library(ge_runner SHARED ${fake_sources}) + add_library(fmk_parser SHARED ${fake_sources}) + target_link_libraries(ge_libs INTERFACE + ge_runner + fmk_parser) +endif() \ No newline at end of file diff --git a/tf_adapter_2.x/cmake/tensorflow/module.cmake b/tf_adapter_2.x/cmake/tensorflow/module.cmake new file mode 100644 index 0000000000000000000000000000000000000000..72b39d7e6205c664c12828b0ded1fa727e72f605 --- /dev/null +++ b/tf_adapter_2.x/cmake/tensorflow/module.cmake @@ -0,0 +1,30 @@ +add_library(tensorflow_libs INTERFACE) + +if(DEFINED TF_INSTALLED_PATH) + SET(TF_INCLUDE_DIR ${TF_INSTALLED_PATH}) + target_link_libraries(tensorflow_libs INTERFACE + ${TF_INSTALLED_PATH}/python/_pywrap_tensorflow_internal.so + ${TF_INSTALLED_PATH}/libtensorflow_framework.so.2) +else() + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/_fake.cc + COMMAND touch ${CMAKE_CURRENT_BINARY_DIR}/_fake.cc + ) + + set(fake_sources ${CMAKE_CURRENT_BINARY_DIR}/_fake.cc) + + 
add_library(tensorflow_framework SHARED ${fake_sources}) + set_target_properties(tensorflow_framework PROPERTIES VERSION 2) + + add_library(pywrap_tensorflow_internal SHARED ${fake_sources}) + set_target_properties(pywrap_tensorflow_internal PROPERTIES PREFIX _) + + SET(TF_INCLUDE_DIR ${ASCEND_CI_BUILD_DIR}/third_party/tensorflow/compile_deps/tf-2.4.0) + target_link_libraries(tensorflow_libs INTERFACE + tensorflow_framework + pywrap_tensorflow_internal) +endif() + +include_directories(${TF_INCLUDE_DIR}/include) +include_directories(${TF_INCLUDE_DIR}/include/external/farmhash_archive/src) +include_directories(${TF_INCLUDE_DIR}/include/external/pybind11/_virtual_includes/pybind11) \ No newline at end of file diff --git a/tf_adapter_2.x/compile b/tf_adapter_2.x/compile new file mode 100644 index 0000000000000000000000000000000000000000..2f24227dacb29366b4d4a2927ae65e675fcc035f --- /dev/null +++ b/tf_adapter_2.x/compile @@ -0,0 +1,10 @@ +#!/bin/bash + +set -e +set -o pipefail + +rm -rf build +mkdir build +cd build +cmake .. +make -j8 diff --git a/tf_adapter_2.x/configure b/tf_adapter_2.x/configure new file mode 100644 index 0000000000000000000000000000000000000000..a953879ec96b3860b016b111c38c2b1ad419ef84 --- /dev/null +++ b/tf_adapter_2.x/configure @@ -0,0 +1,14 @@ +#!/bin/bash + +set -e +set -o pipefail + +if [ -z "$PYTHON_BIN_PATH" ]; then + PYTHON_BIN_PATH=$(which python3 || which python || true) +fi + +# Set all env variables +CONFIGURE_DIR=$(dirname "$0") +"$PYTHON_BIN_PATH" "${CONFIGURE_DIR}/configure.py" "$@" + +echo "Configuration finished" diff --git a/tf_adapter_2.x/configure.py b/tf_adapter_2.x/configure.py new file mode 100644 index 0000000000000000000000000000000000000000..393d457923426f04dffaaf12fb4e70f99fa277e4 --- /dev/null +++ b/tf_adapter_2.x/configure.py @@ -0,0 +1,136 @@ +# Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import subprocess +import sys + +try: + from shutil import which +except ImportError: + from distutils.spawn import find_executable as which + +_COMPAT_TENSORFLOW_VERSION = "2.4.0" +_PYTHON_BIN_PATH_ENV = "ADAPTER_TARGET_PYTHON_PATH" +_ASCEND_INSTALLED_PATH_ENV = "ASCEND_INSTALLED_PATH" + +def run_command(cmd): + output = subprocess.check_output(cmd) + return output.decode('UTF-8').strip() + + +def get_input(question): + try: + try: + answer = raw_input(question) + except NameError: + answer = input(question) + except EOFError: + answer = '' + return answer + + +def real_config_path(file): + return os.path.join("tools", file) + + +def setup_python(env_path): + """Get python install path.""" + default_python_bin_path = sys.executable + ask_python_bin_path = ('Please specify the location of python with valid ' + 'tensorflow 2.4.0 site-packages installed. 
[Default ' + 'is %s]\n(You can make this quiet by set env [ADAPTER_TARGET_PYTHON_PATH]): ') % default_python_bin_path + custom_python_bin_path = env_path + while True: + if not custom_python_bin_path: + python_bin_path = get_input(ask_python_bin_path) + else: + python_bin_path = custom_python_bin_path + custom_python_bin_path = None + if not python_bin_path: + python_bin_path = default_python_bin_path + pass + # Check if the path is valid + if os.path.isfile(python_bin_path) and os.access(python_bin_path, os.X_OK): + pass + elif not os.path.exists(python_bin_path): + print('Invalid python path: %s cannot be found.' % python_bin_path) + continue + else: + print('%s is not executable. Is it the python binary?' % python_bin_path) + continue + + try: + compile_args = run_command([ + python_bin_path, '-c', + 'import distutils.sysconfig; import tensorflow as tf; print(tf.__version__ + "|" + tf.sysconfig.get_lib(' + ') + "|" + "|".join(tf.sysconfig.get_compile_flags()) + "|" + distutils.sysconfig.get_python_inc())' + ]).split("|") + if not compile_args[0].startswith(_COMPAT_TENSORFLOW_VERSION): + print('Invalid python path: %s compat tensorflow version is %s' + ' got %s.' % (python_bin_path, _COMPAT_TENSORFLOW_VERSION, + compile_args[0])) + continue + except subprocess.CalledProcessError: + print('Invalid python path: %s tensorflow not installed.' % + python_bin_path) + continue + # Write tools/python_bin_path.sh + with open(real_config_path('PYTHON_BIN_PATH'), 'w') as f: + f.write(python_bin_path) + with open(real_config_path('COMPILE_FLAGS'), 'w') as f: + for flag in compile_args[2:-1]: + f.write(flag + '\n') + f.write("-I" + compile_args[-1] + '\n') + with open(real_config_path('TF_INSTALLED_PATH'), 'w') as f: + f.write(compile_args[1]) + break + + +def setup_ascend(env_path): + """Get ascend install path.""" + default_ascend_path = "/usr/local/Ascend" + ask_ascend_path = ('Please specify the location of ascend. [Default is ' + '%s]\n(You can make this quiet by set env [ASCEND_INSTALLED_PATH]): ') % default_ascend_path + custom_ascend_path = env_path + while True: + if not custom_ascend_path: + ascend_path = get_input(ask_ascend_path) + else: + ascend_path = custom_ascend_path + custom_ascend_path = None + if not ascend_path: + ascend_path = default_ascend_path + # Check if the path is valid + if os.path.isdir(ascend_path) and os.access(ascend_path, os.X_OK): + break + elif not os.path.exists(ascend_path): + print('Invalid ascend path: %s cannot be found.' 
% ascend_path) + + with open(real_config_path('ASCEND_INSTALLED_PATH'), 'w') as f: + f.write(ascend_path) + + +def main(): + env_snapshot = dict(os.environ) + setup_python(env_snapshot.get(_PYTHON_BIN_PATH_ENV)) + setup_ascend(env_snapshot.get(_ASCEND_INSTALLED_PATH_ENV)) + + +if __name__ == '__main__': + main() diff --git a/tf_adapter_2.x/docs/framework.jpg b/tf_adapter_2.x/docs/framework.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e7729bfb507a80ad8423c67d7dd41d6a16ff691d Binary files /dev/null and b/tf_adapter_2.x/docs/framework.jpg differ diff --git a/tf_adapter_2.x/examples/basic_tests.py b/tf_adapter_2.x/examples/basic_tests.py new file mode 100644 index 0000000000000000000000000000000000000000..998302458d78d5a10de801b9b226c1583988742a --- /dev/null +++ b/tf_adapter_2.x/examples/basic_tests.py @@ -0,0 +1,440 @@ +import unittest +import os +import tensorflow as tf +from tensorflow.python.eager import context +from tensorflow.python.ops import gen_resource_variable_ops + +import npu_device + +npu = npu_device.open().as_default() + + +def tensor_equal(t1, t2): + return (t1.numpy() == t2.numpy()).all() + + +@tf.function +def foo_add(v1, v2): + return v1 + v2 + + +@tf.function +def foo_add_(v): + return v.assign_add(1) + + +@tf.function +def foo_cpu_add_(v): + with context.device("/job:localhost/replica:0/task:0/device:CPU:0"): + return v.assign_add(1) + + +class RaiseTest(unittest.TestCase): + def test_raise1(self): + with context.device("/job:localhost/replica:0/task:0/device:CPU:0"): + x = tf.Variable(1) + y = tf.Variable(1) + self.assertRaises(tf.errors.InvalidArgumentError, foo_add, x, y) + + def test_basic1(self): + self.assertTrue(tensor_equal(foo_add(1, 2), tf.constant(3))) + + def test_basic2(self): + self.assertTrue(tensor_equal(tf.add(1, 2), tf.constant(3))) + + def test_basic3(self): + x = tf.Variable(1) + self.assertTrue(tensor_equal(foo_add_(x), tf.constant(2))) + + def test_basic4(self): + with context.device("/job:localhost/replica:0/task:0/device:CPU:0"): + x = tf.Variable(1) + self.assertTrue(tensor_equal(foo_add_(x), tf.constant(2))) + + def test_basic5(self): + with context.device("/job:localhost/replica:0/task:0/device:CPU:0"): + x = tf.Variable(1) + self.assertTrue(tensor_equal(foo_cpu_add_(x), tf.constant(2))) + + def test_basic6(self): # Force run on npu by tensorflow + x = tf.Variable(1) + self.assertTrue(tensor_equal(foo_cpu_add_(x), tf.constant(2))) + + def test_basic7(self): # Force run on npu by tensorflow + x = tf.Variable(1) + self.assertTrue(x.device == npu.name()) + self.assertTrue(foo_cpu_add_(x).device == "/job:localhost/replica:0/task:0/device:CPU:0") + with context.device("/job:localhost/replica:0/task:0/device:CPU:0"): + x = tf.Variable(1) + self.assertTrue(foo_add_(x).device == "/job:localhost/replica:0/task:0/device:CPU:0") + + def test_shared_variable(self): + x = gen_resource_variable_ops.var_handle_op(dtype=tf.float32, shape=(1, 2), shared_name="variable_1") + gen_resource_variable_ops.assign_variable_op(x, tf.constant([[1.0, 2.0]])) + y = gen_resource_variable_ops.var_handle_op(dtype=tf.float32, shape=(1, 2), shared_name="variable_1") + gen_resource_variable_ops.assign_variable_op(y, tf.constant([[2.0, 3.0]])) + read_x = gen_resource_variable_ops.read_variable_op(x, dtype=tf.float32) + read_y = gen_resource_variable_ops.read_variable_op(y, dtype=tf.float32) + self.assertTrue(tensor_equal(read_x, read_y)) + + x = gen_resource_variable_ops.var_handle_op(dtype=tf.float32, shape=(1, 2), shared_name=context.shared_name()) 
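+        # context.shared_name() generates a unique anonymous name on each call, so x (above)
+        # and y (below) refer to two distinct resources, unlike the "variable_1" handles that
+        # share one variable; their reads are therefore expected to differ.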
+ gen_resource_variable_ops.assign_variable_op(x, tf.constant([[1.0, 2.0]])) + y = gen_resource_variable_ops.var_handle_op(dtype=tf.float32, shape=(1, 2), shared_name=context.shared_name()) + gen_resource_variable_ops.assign_variable_op(y, tf.constant([[2.0, 3.0]])) + read_x = gen_resource_variable_ops.read_variable_op(x, dtype=tf.float32) + read_y = gen_resource_variable_ops.read_variable_op(y, dtype=tf.float32) + self.assertFalse(tensor_equal(read_x, read_y)) + + def test_anonymous_variable(self): + x = tf.Variable([[1.0, 2.0]], dtype=tf.float32, name="x") + y = tf.Variable([[1.0, 2.0]], dtype=tf.float32, name="x") + x.assign_add([[1.0, 1.0]]) + self.assertFalse(tensor_equal(x, y)) + + def test_matmul(self): + input = tf.constant([[1.0], [2.0]]) + weight = tf.Variable([[2.0, 1.0]], dtype=tf.float32) + logit = tf.matmul(input, weight) + self.assertTrue(tensor_equal(logit, tf.constant([[2., 1.], [4., 2.]]))) + + def test_unique(self): + x = tf.constant([1, 1, 2, 4, 4, 4, 7, 8, 8]) + y, idx = tf.unique(x) + self.assertTrue(tensor_equal(y, tf.constant([1, 2, 4, 7, 8]))) + + def test_dataset(self): + dataset = tf.data.Dataset.from_tensor_slices(tf.constant([2])) + iterator = iter(dataset) + self.assertTrue(tensor_equal(next(iterator), tf.constant(2))) + try: + next(iterator) + except Exception as e: + self.assertTrue(isinstance(e, StopIteration)) + + def test_dataset_function(self): + dataset = tf.data.Dataset.from_tensor_slices(tf.constant([2])) + iterator = iter(dataset) + + @tf.function + def f(iterator): + return next(iterator) + + self.assertTrue(tensor_equal(f(iterator), tf.constant(2))) + self.assertRaises(tf.errors.OutOfRangeError, f, iterator) + + def test_checkpoint(self): + step = tf.Variable(0, name="step") # 0 + checkpoint = tf.train.Checkpoint(step=step) + checkpoint.write("./ckpt") + step.assign_add(1) # 1 + checkpoint.read("./ckpt") + self.assertTrue(tensor_equal(step, tf.constant(0))) + + def test_same_python_name_function(self): + def f1(): + @tf.function + def f(x): + return x + 1 + + return f(tf.constant(1)) + + def f2(): + @tf.function + def f(x): + return x + 2 + + return f(tf.constant(1)) + + self.assertTrue(tensor_equal(f1(), tf.constant(2))) + self.assertTrue(tensor_equal(f2(), tf.constant(3))) + + def test_cond1(self): + cond = tf.Variable(1.0) + x = tf.Variable(1.0) + y = tf.Variable(2.0) + + @tf.function + def f(): + tf.cond(cond < tf.constant(2.0), lambda: x.assign_add(y), lambda: y.assign_add(x)) + return x, y + + v1, v2 = f() + self.assertTrue(tensor_equal(v1, tf.constant(3.0))) + self.assertTrue(tensor_equal(v2, tf.constant(2.0))) + + def test_cond2(self): + cond = tf.Variable(1.0) + x = tf.Variable(0.0) + y = tf.Variable(0.0) + + @tf.function + def f(): + tf.cond(cond < tf.constant(2.0), lambda: x.assign_add(1.0), lambda: y.assign_add(1.0)) + return x, y + + v1, v2 = f() + self.assertTrue(tensor_equal(v1, tf.constant(1.0))) + self.assertTrue(tensor_equal(v2, tf.constant(0.0))) + + def test_cond3(self): + v = tf.Variable(1.0) + x = tf.Variable(0.0) + y = tf.Variable(0.0) + + def x_add(): + return x.assign_add(1.0) + + def y_add(): + return y.assign_add(1.0) + + @tf.function + def f(): + tf.cond(v < tf.constant(2.0), x_add, y_add) + return x, y + + v1, v2 = f() + self.assertTrue(tensor_equal(v1, tf.constant(1.0))) + self.assertTrue(tensor_equal(v2, tf.constant(0.0))) + + def test_cond4(self): + v = tf.Variable(1.0) + x = tf.Variable(0.0) + y = tf.Variable(0.0) + + @tf.function + def x_add(): + return x.assign_add(1.0) + + @tf.function + def y_add(): + return 
y.assign_add(1.0) + + @tf.function + def f(): + tf.cond(v < tf.constant(2.0), x_add, y_add) + return x, y + + v1, v2 = f() + self.assertTrue(tensor_equal(v1, tf.constant(1.0))) + self.assertTrue(tensor_equal(v2, tf.constant(0.0))) + + def test_cond5(self): + v = tf.Variable(1.0) + x = tf.Variable(0.0) + y = tf.Variable(0.0) + + c = tf.constant(1.0) + + @tf.function + def x_add(): + return x.assign_add(c) + + @tf.function + def y_add(): + return y.assign_add(c) + + @tf.function + def f(): + tf.cond(v < tf.constant(2.0), x_add, y_add) + return x, y + + v1, v2 = f() + self.assertTrue(tensor_equal(v1, tf.constant(1.0))) + self.assertTrue(tensor_equal(v2, tf.constant(0.0))) + + def test_cond6(self): + cond = tf.Variable(1.0) + x = tf.Variable(1.0) + y = tf.Variable(2.0) + + @tf.function + def f(): + return tf.cond(cond < tf.constant(2.0), lambda: x.assign_add(y), lambda: y.assign_add(x)) + + self.assertTrue(tensor_equal(f(), tf.constant(3.0))) + + def test_while(self): + v = tf.Variable(1.0) + + @tf.function + def f(): + for i in tf.range(10): + v.assign_add(1.0) + return v + + self.assertTrue(tensor_equal(f(), tf.constant(11.0))) + + def test_variable_need_different_format_in_subgraph_with_control(self): + x = tf.Variable(tf.constant([[[[0.0]]]]), dtype=tf.float32, shape=(1, 1, 1, 1)) + + @tf.function + def f(): + xv = tf.cond(x < tf.constant([[[[2.0]]]]), lambda: x.assign(tf.constant([[[[10.0]]]])), + lambda: x.assign(tf.constant([[[[20.0]]]]))) + return tf.nn.conv2d(xv, tf.constant([[[[3.0]]], ], dtype=tf.float32), strides=[1, 1, 1, 1], padding='VALID') + self.assertTrue(tensor_equal(f(), tf.constant([[[[30.0]]], ], dtype=tf.float32))) + self.assertTrue(tensor_equal(x, tf.constant([[[[10.0]]], ], dtype=tf.float32))) + + def test_variable_need_different_format_in_subgraph(self): + x = tf.Variable(tf.constant([[[[0.0]]]]), dtype=tf.float32, shape=(1, 1, 1, 1)) + + @tf.function + def f(): + tf.cond(x < tf.constant([[[[2.0]]]]), lambda: x.assign(tf.constant([[[[10.0]]]])), + lambda: x.assign(tf.constant([[[[20.0]]]]))) + return tf.nn.conv2d(x, tf.constant([[[[3.0]]], ], dtype=tf.float32), strides=[1, 1, 1, 1], padding='VALID') + self.assertTrue(tensor_equal(f(), tf.constant([[[[30.0]]], ], dtype=tf.float32))) + self.assertTrue(tensor_equal(x, tf.constant([[[[10.0]]], ], dtype=tf.float32))) + + def test_variable_need_different_format_in_subgraph_cross(self): + x = tf.Variable(tf.constant([[[[10.0]]]]), dtype=tf.float32, shape=(1, 1, 1, 1)) + + @tf.function + def f(): + c1 = tf.nn.conv2d(x, tf.constant([[[[3.0]]], ], dtype=tf.float32), strides=[1, 1, 1, 1], padding='VALID') + tf.cond(x < tf.constant([[[[2.0]]]]), lambda: x.assign(tf.constant([[[[10.0]]]])), + lambda: x.assign(tf.constant([[[[20.0]]]]))) + return c1, tf.nn.conv2d(x, tf.constant([[[[3.0]]], ], dtype=tf.float32), strides=[1, 1, 1, 1], padding='VALID') + c1, c2 = f() + self.assertTrue(tensor_equal(c1, tf.constant([[[[30.0]]], ], dtype=tf.float32))) + self.assertTrue(tensor_equal(c2, tf.constant([[[[60.0]]], ], dtype=tf.float32))) + self.assertTrue(tensor_equal(x, tf.constant([[[[20.0]]], ], dtype=tf.float32))) + + def test_variable_need_different_format_in_subgraph_trans_merge(self): + x = tf.Variable(tf.constant([[[[10.0]]]]), dtype=tf.float32, shape=(1, 1, 1, 1)) + + @tf.function + def f(): + c1 = tf.nn.conv2d(x, tf.constant([[[[3.0]]], ], dtype=tf.float32), strides=[1, 1, 1, 1], padding='VALID') + c2 = tf.nn.conv2d(x, tf.constant([[[[3.0]]], ], dtype=tf.float32), strides=[1, 1, 1, 1], padding='VALID') + tf.cond(x < 
tf.constant([[[[2.0]]]]), lambda: x.assign(tf.constant([[[[10.0]]]])), + lambda: x.assign(tf.constant([[[[20.0]]]]))) + return c1, c2 + c1, c2 = f() + self.assertTrue(tensor_equal(c1, tf.constant([[[[30.0]]], ], dtype=tf.float32))) + self.assertTrue(tensor_equal(c2, tf.constant([[[[30.0]]], ], dtype=tf.float32))) + self.assertTrue(tensor_equal(x, tf.constant([[[[20.0]]], ], dtype=tf.float32))) + + def test_bert_dp_under_one_device_distribute_strategy(self): + def decode_record(record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.io.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. + for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.cast(t, tf.int32) + example[name] = t + + return example + + def dataset_fn(ctx=None): + """Creates input dataset from (tf)records files for pretraining.""" + input_patterns = [os.path.join(os.path.dirname(os.path.abspath(__file__)), "bert_examples.tfrecord")] + seq_length = 128 + max_predictions_per_seq = 20 + batch_size = 32 + is_training = True + input_pipeline_context = None + use_next_sentence_label = True + use_position_id = False + output_fake_labels = True + + name_to_features = { + 'input_ids': + tf.io.FixedLenFeature([seq_length], tf.int64), + 'input_mask': + tf.io.FixedLenFeature([seq_length], tf.int64), + 'segment_ids': + tf.io.FixedLenFeature([seq_length], tf.int64), + 'masked_lm_positions': + tf.io.FixedLenFeature([max_predictions_per_seq], tf.int64), + 'masked_lm_ids': + tf.io.FixedLenFeature([max_predictions_per_seq], tf.int64), + 'masked_lm_weights': + tf.io.FixedLenFeature([max_predictions_per_seq], tf.float32), + } + if use_next_sentence_label: + name_to_features['next_sentence_labels'] = tf.io.FixedLenFeature([1], + tf.int64) + if use_position_id: + name_to_features['position_ids'] = tf.io.FixedLenFeature([seq_length], + tf.int64) + for input_pattern in input_patterns: + if not tf.io.gfile.glob(input_pattern): + raise ValueError('%s does not match any files.' % input_pattern) + + dataset = tf.data.Dataset.list_files(input_patterns, shuffle=is_training) + + if input_pipeline_context and input_pipeline_context.num_input_pipelines > 1: + dataset = dataset.shard(input_pipeline_context.num_input_pipelines, + input_pipeline_context.input_pipeline_id) + if is_training: + dataset = dataset.repeat() + + # We set shuffle buffer to exactly match total number of + # training files to ensure that training data is well shuffled. + input_files = [] + for input_pattern in input_patterns: + input_files.extend(tf.io.gfile.glob(input_pattern)) + dataset = dataset.shuffle(len(input_files)) + + # # In parallel, create tf record dataset for each train files. + # # cycle_length = 8 means that up to 8 files will be read and deserialized in + # # parallel. You may want to increase this number if you have a large number of + # # CPU cores. 
+ dataset = dataset.interleave( + tf.data.TFRecordDataset, + cycle_length=8, + num_parallel_calls=tf.data.experimental.AUTOTUNE) + + if is_training: + dataset = dataset.shuffle(100) + + decode_fn = lambda record: decode_record(record, name_to_features) + dataset = dataset.map( + decode_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE) + + def _select_data_from_record(record): + """Filter out features to use for pretraining.""" + x = { + 'input_word_ids': record['input_ids'], + 'input_mask': record['input_mask'], + 'input_type_ids': record['segment_ids'], + 'masked_lm_positions': record['masked_lm_positions'], + 'masked_lm_ids': record['masked_lm_ids'], + 'masked_lm_weights': record['masked_lm_weights'], + } + if use_next_sentence_label: + x['next_sentence_labels'] = record['next_sentence_labels'] + if use_position_id: + x['position_ids'] = record['position_ids'] + + # TODO(hongkuny): Remove the fake labels after migrating bert pretraining. + if output_fake_labels: + return (x, record['masked_lm_weights']) + else: + return x + + dataset = dataset.map( + _select_data_from_record, + num_parallel_calls=tf.data.experimental.AUTOTUNE) + dataset = dataset.batch(batch_size, drop_remainder=is_training) + dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) + return dataset + + strategy = tf.distribute.OneDeviceStrategy("device:CPU:0") + dataset = strategy.experimental_distribute_datasets_from_function(dataset_fn) + iterator = iter(dataset) + + @tf.function + def bert_step(iterator): + return next(iterator) + + bert_step(iterator) + + +if __name__ == '__main__': + unittest.main() diff --git a/tf_adapter_2.x/examples/bert_examples.tfrecord b/tf_adapter_2.x/examples/bert_examples.tfrecord new file mode 100644 index 0000000000000000000000000000000000000000..ea54c17bc279cc7e8027a9b95831dcf9221539a4 Binary files /dev/null and b/tf_adapter_2.x/examples/bert_examples.tfrecord differ diff --git a/tf_adapter_2.x/npu_device/core/npu_cache_spec.h b/tf_adapter_2.x/npu_device/core/npu_cache_spec.h new file mode 100644 index 0000000000000000000000000000000000000000..9f4787b45e18fe9fe288b69de2be41f7d826398b --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_cache_spec.h @@ -0,0 +1,194 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. 
* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#ifndef TENSORFLOW_NPU_CACHE_SPEC_H +#define TENSORFLOW_NPU_CACHE_SPEC_H + +#include "tensorflow/c/eager/c_api.h" + +#include +#include +#include +#include +#include +#include + +#include "absl/algorithm/container.h" +#include "absl/memory/memory.h" +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/c_api_internal.h" + +#include "npu_logger.h" +#include "npu_parser.h" +#include "npu_types.h" + +namespace npu { + +class TaskSpec { + public: + virtual bool IsFunctionOp() const = 0; + bool ShouldFallback() const { return !fallback_reason_.empty(); }; + std::string FallbackReason() const { return fallback_reason_; }; + std::string Op() const { return ndef_.op(); } + virtual std::string DebugString() const = 0; + tensorflow::NodeDef NodeDef() const { return ndef_; } + const TensorDataTypes &InputTypes() const { return input_dtypes_; } + const TensorShapes &InputShapes() const { return input_shapes_; } + const TensorDataTypes &OutputTypes() const { return output_dtypes_; } + virtual const tensorflow::OpRegistrationData *OpRegistrationData() const { return op_spec_; } + + protected: + TaskSpec() : op_spec_(nullptr){}; + ~TaskSpec() = default; + const tensorflow::OpRegistrationData *op_spec_; // Registered IR info of the operator, not a concrete instance + tensorflow::NodeDef ndef_; // The node's NodeDef, mainly holding its instantiated attributes + TensorDataTypes input_dtypes_; + TensorShapes input_shapes_; + TensorDataTypes output_dtypes_; + std::string fallback_reason_; +}; + +class OpSpec : public TaskSpec { + public: + OpSpec(const tensorflow::OpRegistrationData *op_spec, tensorflow::NodeDef ndef, TensorShapes input_shapes, + TensorPartialShapes output_shapes, std::string reason) + : always_infer_shape_(false), partial_output_shapes_(output_shapes) { + TensorDataTypes input_dtypes; + TensorDataTypes output_dtypes; + tensorflow::InOutTypesForNode(ndef, op_spec->op_def, &input_dtypes, &output_dtypes); + op_spec_ = op_spec; + ndef_ = std::move(ndef); + input_dtypes_ = std::move(input_dtypes); + input_shapes_ = std::move(input_shapes); + output_dtypes_ = std::move(output_dtypes); + + fallback_reason_ = std::move(reason); + if (ShouldFallback()) { return; } + TensorShapes shapes; + shapes.resize(output_shapes.size()); + for (size_t i = 0; i < output_shapes.size(); i++) { + // For non-function ops, shape inference must yield fully-defined output shapes + if (!output_shapes[i].AsTensorShape(&shapes[i])) { + fallback_reason_ = tensorflow::strings::StrCat("output", i, " unknown shape ", output_shapes[i].DebugString()); + break; + } + } + + if (!ShouldFallback()) { + output_shapes_ = shapes; + AssembleInputDesc(input_shapes_, input_dtypes_, &attached_attrs_); + AssembleOutputDesc(output_shapes_, output_dtypes_, &attached_attrs_); + } + } + + OpSpec(const tensorflow::OpRegistrationData *op_spec, tensorflow::NodeDef ndef, TensorShapes input_shapes, + std::string reason) + : always_infer_shape_(true) { + TensorDataTypes input_dtypes; + TensorDataTypes output_dtypes; + tensorflow::InOutTypesForNode(ndef, op_spec->op_def, &input_dtypes, &output_dtypes); + + op_spec_ = op_spec; + ndef_ = std::move(ndef); + input_dtypes_ = std::move(input_dtypes); + input_shapes_ = std::move(input_shapes); + output_dtypes_ = std::move(output_dtypes); + fallback_reason_ = std::move(reason); + + if (!ShouldFallback()) { AssembleInputDesc(input_shapes_, input_dtypes_, &attached_attrs_); } + } + + ~OpSpec() = default; + bool IsFunctionOp() const override {
return false; } + bool ShouldInferShape() const { return always_infer_shape_; } + const TensorShapes &OutputShapes() const { return output_shapes_; } + const TensorPartialShapes &OutputPartialShapes() const { return partial_output_shapes_; } + tensorflow::NodeDef ParserNodeDef() const { + tensorflow::NodeDef ndef; + ndef.MergeFrom(ndef_); + ndef.MergeFrom(attached_attrs_); + return ndef; + } + std::string DebugString() const override { + std::stringstream ss; + ss << NodeDef().DebugString() << std::endl; + ss << attached_attrs_.DebugString() << std::endl; + ss << OpRegistrationData()->op_def.DebugString() << std::endl; + for (size_t i = 0; i < output_dtypes_.size(); i++) { + if (always_infer_shape_ || ShouldFallback()) { + ss << "output " << i << " " << tensorflow::DataTypeString(output_dtypes_[i]) << " " << std::endl; + } else { + ss << "output " << i << " " << tensorflow::DataTypeString(output_dtypes_[i]) << " " + << partial_output_shapes_[i].DebugString() << std::endl; + } + } + if (ShouldFallback()) { ss << "Fallback reason " << fallback_reason_; } + return ss.str(); + } + + private: + bool always_infer_shape_; + TensorShapes output_shapes_; + TensorPartialShapes partial_output_shapes_; + tensorflow::NodeDef attached_attrs_; +}; + +class FuncSpec : public TaskSpec { + using TensorDataTypes = tensorflow::gtl::InlinedVector; + + public: + using PruneInputsFunc = + std::function &)>; + FuncSpec(const tensorflow::OpRegistrationData *op_spec, tensorflow::NodeDef ndef, uint64_t ge_graph_id, + std::unique_ptr graph, PruneInputsFunc prune_func, + std::vector dependent_host_resources, std::string reason = "") + : ge_graph_id_(ge_graph_id), graph_(std::move(graph)), prune_func_(std::move(prune_func)), + dependent_host_resources_(std::move(dependent_host_resources)) { + + TensorDataTypes input_dtypes; + TensorDataTypes output_dtypes; + tensorflow::InOutTypesForNode(ndef, op_spec->op_def, &input_dtypes, &output_dtypes); + + op_spec_ = op_spec; + ndef_ = std::move(ndef); + input_dtypes_ = std::move(input_dtypes); + output_dtypes_ = std::move(output_dtypes); + fallback_reason_ = std::move(reason); + } + ~FuncSpec() = default; + bool IsFunctionOp() const override { return true; } + + uint64_t GeGraphId() const { return ge_graph_id_; } + + const std::vector& DependentHostResources() const { return dependent_host_resources_; } + + const tensorflow::Graph *Graph() const { return graph_.get(); } + + void PruneInputs(int num_inputs, TFE_TensorHandle **inputs, std::vector &pruned) const { + prune_func_(num_inputs, inputs, pruned); + } + std::string DebugString() const override { + std::stringstream ss; + ss << NodeDef().DebugString() << std::endl; + ss << OpRegistrationData()->op_def.DebugString() << std::endl; + ss << "Ge graph id " << ge_graph_id_ << std::endl; + for (size_t i = 0; i < output_dtypes_.size(); i++) { + ss << "output " << i << " " << tensorflow::DataTypeString(output_dtypes_[i]) << std::endl; + } + if (ShouldFallback()) { ss << "Fallback reason " << fallback_reason_; } + return ss.str(); + } + + private: + uint64_t ge_graph_id_; + std::unique_ptr graph_; + PruneInputsFunc prune_func_; + const std::vector dependent_host_resources_; +}; +} // namespace npu + +#endif //TENSORFLOW_NPU_CACHE_SPEC_H diff --git a/tf_adapter_2.x/npu_device/core/npu_custom_kernel.h b/tf_adapter_2.x/npu_device/core/npu_custom_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..49aed1b99f40adc24526b41ccb97ac5ca0b4ecbf --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_custom_kernel.h @@ 
-0,0 +1,122 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#ifndef TENSORFLOW_NPU_CUSTOM_KERNEL_H +#define TENSORFLOW_NPU_CUSTOM_KERNEL_H + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/platform/logging.h" + +#include "absl/algorithm/container.h" +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/tfe_context_internal.h" +#include "tensorflow/c/eager/tfe_op_internal.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" + +#include "npu_device.h" +#include "npu_logger.h" +#include "npu_micros.h" +#include "npu_parser.h" +#include "npu_unwrap.h" +#include "npu_utils.h" + +using NpuCustomKernelFunc = + std::function; + +using NpuFallbackHookFunc = std::function; + +class CustomKernelRegistry { + public: + static CustomKernelRegistry &Instance() { + static CustomKernelRegistry inst; + return inst; + } + void Register(const std::string &op_name, const NpuCustomKernelFunc &func) { + std::lock_guard lk(mu_); + DCHECK(specific_kernels_.find(op_name) == specific_kernels_.end()); + specific_kernels_.emplace(std::make_pair(op_name, func)); + } + void RegisterHook(const std::string &op_name, const NpuFallbackHookFunc &func) { + std::lock_guard lk(mu_); + DCHECK(specific_kernels_.find(op_name) == specific_kernels_.end()); + specific_hooks_.emplace(std::make_pair(op_name, func)); + } + + bool GetCustomKernelFunc(const std::string &op_name, NpuCustomKernelFunc **func) { + DLOG() << "NPU Looking up custom kernel for " << op_name; + std::lock_guard lk(mu_); + if (specific_kernels_.find(op_name) == specific_kernels_.end()) { + DLOG() << "NPU Looking up kernel not found for op " << op_name; + return false; + } + *func = &specific_kernels_[op_name]; + return true; + } + + bool GetFallbackHookFunc(const std::string &op_name, NpuFallbackHookFunc **func) { + DLOG() << "NPU Looking up callback hook for " << op_name; + std::lock_guard lk(mu_); + if (specific_hooks_.find(op_name) == specific_hooks_.end()) { + DLOG() << "NPU Callback hook not found for op " << op_name; + return false; + } + *func = &specific_hooks_[op_name]; + return true; + } + + private: + CustomKernelRegistry() = default; + std::mutex mu_; + std::map specific_kernels_; + std::map specific_hooks_; +}; + +class CustomKernelSpec { + public: + CustomKernelSpec(std::string name, NpuCustomKernelFunc custom_func) + : op(std::move(name)), func(std::move(custom_func)) {} + std::string op; + NpuCustomKernelFunc func; +}; + +class FallbackHookSpec { + public: + FallbackHookSpec(std::string name, NpuFallbackHookFunc custom_func) + : op(std::move(name)), func(std::move(custom_func)) {} + std::string op; + NpuFallbackHookFunc func; +}; + +class CustomKernelReceiver { + public: + CustomKernelReceiver(const CustomKernelSpec &spec) { // NOLINT(google-explicit-constructor) + DLOG() << "NPU Register custom kernel for " << spec.op; + CustomKernelRegistry::Instance().Register(spec.op, spec.func); + } + + CustomKernelReceiver(const FallbackHookSpec &spec) { // NOLINT(google-explicit-constructor) + DLOG() << "NPU Register fallback hook for " << spec.op; + CustomKernelRegistry::Instance().RegisterHook(spec.op, spec.func); + } +}; + +#define 
NPU_REGISTER_CUSTOM_KERNEL(name, func) NPU_REGISTER_CUSTOM_KERNEL_1(__COUNTER__, name, func) +#define NPU_REGISTER_CUSTOM_KERNEL_1(ctr, name, func) NPU_REGISTER_CUSTOM_KERNEL_2(ctr, name, func) +#define NPU_REGISTER_CUSTOM_KERNEL_2(ctr, name, func) \ + static CustomKernelReceiver __preserved_op##ctr = CustomKernelSpec(name, func) + +#define NPU_REGISTER_FALLBACK_HOOK(name, func) NPU_REGISTER_FALLBACK_HOOK_1(__COUNTER__, name, func) +#define NPU_REGISTER_FALLBACK_HOOK_1(ctr, name, func) NPU_REGISTER_FALLBACK_HOOK_2(ctr, name, func) +#define NPU_REGISTER_FALLBACK_HOOK_2(ctr, name, func) \ + static CustomKernelReceiver __preserved_op##ctr = FallbackHookSpec(name, func) + +#endif //TENSORFLOW_NPU_CUSTOM_KERNEL_H diff --git a/tf_adapter_2.x/npu_device/core/npu_device.cpp b/tf_adapter_2.x/npu_device/core/npu_device.cpp new file mode 100644 index 0000000000000000000000000000000000000000..76206b16e854c393b335be57dc003a8748c91191 --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_device.cpp @@ -0,0 +1,1625 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#include +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/platform/logging.h" + +#include "absl/algorithm/container.h" +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/tfe_context_internal.h" +#include "tensorflow/c/eager/tfe_op_internal.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" + +#include "npu_custom_kernel.h" +#include "npu_device.h" +#include "npu_dp.h" +#include "npu_env.h" +#include "npu_logger.h" +#include "npu_micros.h" +#include "npu_parser.h" +#include "npu_unwrap.h" +#include "npu_utils.h" + +#include "framework/common/ge_inner_error_codes.h" +#include "framework/omg/parser/model_parser.h" +#include "framework/omg/parser/parser_factory.h" + +using Format = ge::Format; + +namespace { +template +class NpuHostFixedAllocator : public tensorflow::Allocator { + public: + static tensorflow::Allocator *Create(std::unique_ptr ptr) { + return new (std::nothrow) NpuHostFixedAllocator(std::move(ptr)); + } + + private: + explicit NpuHostFixedAllocator(std::unique_ptr ptr) : ptr_(std::move(ptr)) { + DLOG() << "Zero copied ge tensor " << reinterpret_cast(ptr_.get()); + } + ~NpuHostFixedAllocator() override { + DLOG() << "Release zero copied ge tensor " << reinterpret_cast(ptr_.get()); + }; + std::string Name() override { return "NpuHostFixedAllocator"; } + void *AllocateRaw(size_t alignment, size_t num_bytes) override { return ptr_.get(); } + void DeallocateRaw(void *ptr) override { delete this; } + std::unique_ptr ptr_; +}; +} // namespace + +tensorflow::Status NpuDevice::ConsumeIteratorAsync(const tensorflow::ResourceHandle &resource, int64_t nums, + const DoneCallback &done) { + auto iter = iterator_providers_.find(resource); + if (iter == iterator_providers_.end()) { + return tensorflow::errors::Internal("Iterator resource provider not found for resource ", resource.name()); + } + auto provider = iter->second; + return provider->Consume(nums, done); +} + +tensorflow::Status NpuDevice::ConsumeIteratorSync(const tensorflow::ResourceHandle &resource, int64_t nums) { + tensorflow::Notification done; + auto status = 
tensorflow::Status::OK(); + ConsumeIteratorAsync(resource, nums, [&status, &done](tensorflow::Status s) { + status = std::move(s); + done.Notify(); + }); + done.WaitForNotification(); + return status; +} + +void NpuDevice::CreateIteratorProvider(TFE_Context *context, const tensorflow::Tensor *tensor, + std::vector device_ids, TF_Status *status) { + auto resource = tensor->scalar()(); + TensorPartialShapes shapes; + TensorDataTypes types; + NPU_CTX_REQUIRES_OK(status, GetMirroredIteratorShapesAndTypes(resource, shapes, types)); + auto dp_provider = + IteratorResourceProvider::GetFunctionDef(resource.name(), std::move(device_ids), shapes, types, status); + if (TF_GetCode(status) != TF_OK) return; + + tensorflow::FunctionLibraryDefinition *lib_def = npu::UnwrapCtx(context)->FuncLibDef(); + NPU_CTX_REQUIRES_OK(status, lib_def->AddFunctionDef(dp_provider)); + tensorflow::ProcessFunctionLibraryRuntime *pflr = npu::UnwrapCtx(context)->pflr(); + tensorflow::FunctionLibraryRuntime *flr = pflr->GetFLR(underlying_device); + tensorflow::FunctionLibraryRuntime::Handle f_handle; + NPU_CTX_REQUIRES_OK(status, flr->Instantiate(dp_provider.signature().name(), tensorflow::AttrSlice{}, &f_handle)); + + tensorflow::Tensor captured_tensor = *tensor; + auto consume_func = [flr, f_handle, captured_tensor]() -> tensorflow::Status { + std::vector get_next_outputs; + return flr->RunSync(tensorflow::FunctionLibraryRuntime::Options{}, f_handle, {captured_tensor}, &get_next_outputs); + }; + auto destroy_func = [resource, flr, f_handle]() -> tensorflow::Status { + LOG(INFO) << "Stopping iterator resource provider for " << resource.name(); + return flr->ReleaseHandle(f_handle); + }; + + auto provider = std::make_shared(resource.name(), consume_func, destroy_func); + LOG(INFO) << "Iterator resource provider for " << resource.name() << " created"; + + NPU_CTX_REQUIRES(status, provider != nullptr, + tensorflow::errors::Internal("Failed create iterator reosurce provider for ", resource.name())); + + iterator_providers_[resource] = provider; + + if (kDumpExecutionDetail || kDumpGraph) { + std::unique_ptr fbody; + tensorflow::AttrSlice attr_slice; + tensorflow::FunctionDefToBodyHelper(dp_provider, attr_slice, lib_def, &fbody); + std::string file_name = "dp_provider_" + resource.name() + ".pbtxt"; + WriteTextProto(tensorflow::Env::Default(), file_name, fbody->graph->ToGraphDefDebug()); + } +} + +std::string NpuDevice::CreateDevice(const char *name, int device_index, + const std::map &session_options, NpuDevice **device) { + auto *ge_session = new (std::nothrow) ge::Session(session_options); + if (ge_session == nullptr) { return "Failed init graph engine: create new session failed"; } + + std::shared_ptr parser = + domi::ModelParserFactory::Instance()->CreateModelParser(domi::FrameworkType::TENSORFLOW); + if (parser == nullptr) { return "Failed init graph engine: create tensorflow model parser failed"; } + + std::unique_ptr status(TF_NewStatus(), TF_DeleteStatus); + + *device = new (std::nothrow) NpuDevice(); + if (*device == nullptr) { return "Failed create new npu device instance"; } + (*device)->device_id = device_index; + (*device)->device_name = name; + (*device)->underlying_device = "/job:localhost/replica:0/task:0/device:CPU:0"; + (*device)->ge_session_ = ge_session; + return ""; +} + +void NpuDevice::ReleaseResource() { + for (auto &iterator_provider : iterator_providers_) { iterator_provider.second->Destroy(); } +} + +void NpuDevice::DeleteDevice(void *device) { + DLOG() << "Start destroy npu device instance"; + if 
(device == nullptr) { return; } + auto npu_device = reinterpret_cast(device); + delete npu_device->ge_session_; + delete npu_device; +} + +tensorflow::Status NpuDevice::ValidateResourcePlacement(const char *op_name, int num_inputs, TFE_TensorHandle **inputs, + bool &cpu_resource) { + bool has_cpu = false; + int cpu_index = 0; + bool has_npu = false; + int npu_index = 0; + for (int i = 0; i < num_inputs; i++) { + auto data_type = npu::UnwrapHandle(inputs[i])->DataType(); + if (data_type == tensorflow::DT_RESOURCE) { + const tensorflow::Tensor *tensor; + (void) npu::UnwrapTensor(inputs[i], &tensor); + if (IsNpuTensorHandle(npu::UnwrapHandle(inputs[i]))) { + has_npu = true; + npu_index = i; + if (has_cpu) { + const tensorflow::Tensor *cpu_tensor; + (void) npu::UnwrapTensor(inputs[cpu_index], &cpu_tensor); + return tensorflow::errors::InvalidArgument( + op_name, " resource input ", i, " ", tensor->scalar()().name(), + " on NPU but resource input ", cpu_index, " ", cpu_tensor->scalar()().name(), + " on CPU"); + } + } else if (!Mirrored(tensor->scalar()())) { + has_cpu = true; + cpu_index = i; + if (has_npu) { + const tensorflow::Tensor *npu_tensor; + (void) npu::UnwrapTensor(inputs[npu_index], &npu_tensor); + return tensorflow::errors::InvalidArgument( + op_name, " resource input ", i, " ", tensor->scalar()().name(), + " on CPU but resource input ", npu_index, " ", npu_tensor->scalar()().name(), + " on NPU"); + } + } + } + } + cpu_resource = has_cpu; + return tensorflow::Status::OK(); +} + +tensorflow::Status NpuDevice::ValidateInput(const char *op_name, int num_inputs, TFE_TensorHandle **inputs) { + for (int i = 0; i < num_inputs; i++) { + auto data_type = npu::UnwrapHandle(inputs[i])->DataType(); + if (data_type == tensorflow::DT_RESOURCE) { + const tensorflow::Tensor *tensor; + NPU_REQUIRES_OK(npu::UnwrapTensor(inputs[i], &tensor)); + if (!IsNpuTensorHandle(npu::UnwrapHandle(inputs[i]))) { + if (!Mirrored(tensor->scalar()())) { + tensorflow::Status status; + std::string src_name = npu::UnwrapHandle(inputs[i])->DeviceName(&status); + if (!status.ok()) { src_name = status.ToString(); } + return tensorflow::errors::Unimplemented("Op ", op_name, " input ", i, " resource from ", src_name); + } else { + DLOG() << "Op" << op_name << " input " << i << " resource mirrored from " + << tensor->scalar()().DebugString(); + } + } + } else if (!tensorflow::DataTypeCanUseMemcpy(data_type)) { + return tensorflow::errors::Unimplemented("Op ", op_name, " input ", i, " unsupported type ", + tensorflow::DataTypeString(data_type)); + } + } + return tensorflow::Status::OK(); +} + +tensorflow::Status NpuDevice::ValidateOutput(const char *op_name, const TensorDataTypes &data_types) { + for (size_t i = 0; i < data_types.size(); i++) { + auto data_type = data_types[i]; + if (data_type == tensorflow::DT_RESOURCE) { + if (!SupportedResourceGenerator(op_name)) { + return tensorflow::errors::Unimplemented("Op ", op_name, " unsupported resource generator by NPU"); + } + } else if (!tensorflow::DataTypeCanUseMemcpy(data_type)) { + return tensorflow::errors::Unimplemented("Op ", op_name, " output ", i, " unsupported type ", + tensorflow::DataTypeString(data_type)); + } + } + return tensorflow::Status::OK(); +} + +void NpuDevice::PruneFunction(const tensorflow::FunctionDef &fdef, tensorflow::Graph *g, bool keep_signature) { + std::unordered_set control_ret_nodes; + for (const auto &control_ret : fdef.control_ret()) { control_ret_nodes.insert(control_ret.second); } + + std::unordered_set nodes; + for (auto n : g->nodes()) { + 
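+    // Collect the nodes to keep as roots for reverse-reachability pruning: control-flow ops,
+    // stateful ops and control_ret targets, except resource producers (VarHandleOp/IteratorV2)
+    // and, unless keep_signature is set, the function's Arg nodes and resource-typed Retval nodes.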
if (n->IsControlFlow() || n->op_def().is_stateful() + || (control_ret_nodes.find(n->name()) != control_ret_nodes.end())) { + if (n->type_string() == "VarHandleOp" || n->type_string() == "IteratorV2") { continue; } + if (!keep_signature) { + if (n->IsArg()) { continue; } + if (n->IsRetval() && n->attrs().Find("T")->type() == tensorflow::DT_RESOURCE) { continue; } + } + nodes.insert(n); + } + } + bool changed = PruneForReverseReachability(g, std::move(nodes)); + if (changed) { FixupSourceAndSinkEdges(g); } +} + +void NpuDevice::FixGraphArgRetvalIndex(tensorflow::Graph *graph) { + std::map indexed_args; + std::map indexed_retvals; + for (auto node : graph->nodes()) { + if (node->IsArg()) { indexed_args[node->attrs().Find("index")->i()] = node; } + if (node->IsRetval()) { indexed_retvals[node->attrs().Find("index")->i()] = node; } + } + int current_arg_index = 0; + for (auto indexed_arg : indexed_args) { indexed_arg.second->AddAttr("index", current_arg_index++); } + + int current_retval_index = 0; + for (auto indexed_retval : indexed_retvals) { indexed_retval.second->AddAttr("index", current_retval_index++); } +} + +tensorflow::Status +NpuDevice::TransResourceInput2GraphNode(TFE_Context *context, tensorflow::Graph *graph, int num_inputs, + TFE_TensorHandle **inputs, + std::vector &dependent_host_resources) { + std::set arg_is_variable; + std::set arg_is_iterator; + + std::map arg_resource_handles; + + VecTensorDataTypes arg_handle_dtyes(num_inputs); + VecTensorPartialShapes arg_handle_shapes(num_inputs); + + for (int i = 0; i < num_inputs; i++) { + if (inputs[i] == nullptr) { continue; }; + const tensorflow::Tensor *tensor; + NPU_REQUIRES_OK(npu::UnwrapTensor(inputs[i], &tensor)); + if (tensor->dtype() == tensorflow::DT_RESOURCE) { + auto handle = tensor->flat()(0); + arg_resource_handles[i] = handle; + if (MirroredIterator(handle)) { + GetMirroredIteratorShapesAndTypes(handle, arg_handle_shapes[i], arg_handle_dtyes[i]); + arg_is_iterator.insert(i); + } else { + const auto &dtypes_and_shapes = handle.dtypes_and_shapes(); + for (auto &dtype_and_shape : dtypes_and_shapes) { + arg_handle_dtyes[i].push_back(dtype_and_shape.dtype); + arg_handle_shapes[i].push_back(dtype_and_shape.shape); + } + arg_is_variable.insert(i); + } + } + } + + std::map arg_substitutes; + for (auto node : graph->op_nodes()) { + if (node->IsArg()) { + auto index = node->attrs().Find("index")->i(); + if (arg_is_iterator.count(index)) { + NPU_REQUIRES_OK(tensorflow::NodeBuilder(WrapResourceName(arg_resource_handles[index].name()), "IteratorV2") + .Attr("container", arg_resource_handles[index].container()) + .Attr("shared_name", arg_resource_handles[index].name()) + .Attr("output_types", arg_handle_dtyes[index]) + .Attr("output_shapes", arg_handle_shapes[index]) + .Attr("_arg_name", node->name()) + .Attr("_arg_index", int(index)) + .Finalize(graph, &arg_substitutes[node])); + + } else if (arg_is_variable.count(index)) { + tensorflow::Node *variable = nullptr; + NPU_REQUIRES_OK(tensorflow::NodeBuilder(WrapResourceName(arg_resource_handles[index].name()), "VarHandleOp") + .Attr("container", arg_resource_handles[index].container()) + .Attr("shared_name", arg_resource_handles[index].name()) + .Attr("dtype", arg_handle_dtyes[index][0]) + .Attr("shape", arg_handle_shapes[index][0]) + .Attr("_arg_name", node->name()) + .Attr("_arg_index", int(index)) + .Finalize(graph, &arg_substitutes[node])); + } + } + } + + // The resource inputs of any functions involved must be substituted here as well + std::vector nodes_to_remove; + std::vector control_flow_nodes; + std::set
unique_dependent_resources; + for (auto node : graph->op_nodes()) { + if (node->IsRetval() && node->input_type(0) == tensorflow::DT_RESOURCE) { + nodes_to_remove.push_back(node); + continue; + } + if (node->IsIfNode() || node->IsCaseNode() || node->IsWhileNode() || node->IsFunctionCall()) { + std::string func_input_name = node->IsFunctionCall() ? "args" : "input"; + bool need_trans_resource = false; + for (auto edge : node->in_edges()) { + if (edge->src()->IsArg() && arg_substitutes.find(edge->src()) != arg_substitutes.end()) { + need_trans_resource = true; + } + } + if (!need_trans_resource) continue; + + control_flow_nodes.push_back(node); + + tensorflow::FunctionLibraryDefinition *lib_def = npu::UnwrapCtx(context)->FuncLibDef(); + const tensorflow::OpRegistrationData *op_reg_data; + NPU_REQUIRES_OK(lib_def->LookUp(node->type_string(), &op_reg_data)); + int func_input_start = 0; + int func_input_end = 0; + for (const auto &in_arg : op_reg_data->op_def.input_arg()) { + func_input_start = func_input_end; + if (in_arg.type_list_attr().empty()) { + func_input_end++; + } else { + func_input_end += node->attrs().Find(in_arg.type_list_attr())->list().type_size(); + } + DLOG() << node->name() << " input arg " << in_arg.name() << " range [" << func_input_start << ", " + << func_input_end << ")"; + if (in_arg.name() == func_input_name) { break; } + } + + std::vector func_inputs; + for (int i = func_input_start; i < func_input_end; i++) { + const tensorflow::Edge *edge; + NPU_REQUIRES_OK(node->input_edge(i, &edge)); + if (edge->src()->IsArg() && arg_substitutes.find(edge->src()) != arg_substitutes.end()) { + func_inputs.push_back(inputs[edge->src()->attrs().Find("index")->i()]); + } else { + func_inputs.push_back(nullptr); + } + } + + for (auto &attr : node->attrs()) { + if (attr.second.has_func()) { + static std::atomic uuid{0}; + std::string func_name = node->type_string() + "_" + attr.first + "_" + attr.second.func().name() + "_" + + std::to_string(uuid.fetch_add(1)); + const tensorflow::FunctionDef *fdef = lib_def->Find(attr.second.func().name()); + std::unique_ptr fbody; + FunctionDefToBodyHelper(*fdef, tensorflow::AttrSlice{}, lib_def, &fbody); + std::vector unused_host_resources; + TransResourceInput2GraphNode(context, fbody->graph, func_inputs.size(), func_inputs.data(), + unused_host_resources); + + // Arg nodes may have been optimized away, so the remaining indexes need to be re-numbered + std::vector remain_indexes; + for (auto n : fbody->graph->nodes()) { + if (n->IsArg()) { remain_indexes.push_back(n->attrs().Find("index")->i()); } + } + FixGraphArgRetvalIndex(fbody->graph); + DLOG() << func_name << " remained input index (0-" << func_inputs.size() - 1 << ") -> " + << VecToString(remain_indexes); + + tensorflow::FunctionDef optimized_fdef; + auto lookup = [&fdef](const tensorflow::Node *node) -> absl::optional { + for (const auto &control_ret : fdef->control_ret()) { + if (control_ret.second == node->name()) { return absl::make_optional(node->name()); } + } + return absl::nullopt; + }; + NPU_REQUIRES_OK(tensorflow::GraphToFunctionDef(*fbody->graph, func_name, lookup, &optimized_fdef)); + NPU_REQUIRES_OK(lib_def->AddFunctionDef(optimized_fdef)); + DLOG() << "Change " << node->name() << " attr " << attr.first << " func name " << attr.second.func().name() + << " to " << func_name; + const_cast(node->attrs().Find(attr.first))->mutable_func()->set_name(func_name); + } + } + } + + std::vector edges; + for (auto edge : node->in_edges()) { edges.emplace_back(edge); } // You can never modify an EdgeSet while iterating over it + for (auto edge : edges) { + if
(edge->src()->IsArg()) { + auto iter = arg_substitutes.find(edge->src()); + if (iter != arg_substitutes.end()) { + int index = edge->src()->attrs().Find("index")->i(); + if (arg_is_iterator.count(index)) { unique_dependent_resources.insert(arg_resource_handles[index]); } + graph->AddEdge(iter->second, 0, node, edge->dst_input()); + graph->RemoveEdge(edge); + } + } + } + } + + for (const auto &resource : unique_dependent_resources) { dependent_host_resources.push_back(resource); } + + for (auto node : control_flow_nodes) { + if (node->IsWhileNode() || node->IsCaseNode() || node->IsWhileNode() || node->IsFunctionCall()) { + tensorflow::NodeDef ndef = node->def(); + if (node->IsWhileNode()) { + int removed_nums = 0; + for (int i = 0; i < node->num_inputs(); i++) { + if (node->input_type(i) == tensorflow::DT_RESOURCE) { + int index = i - removed_nums; + removed_nums++; + + ndef.mutable_input()->erase(ndef.mutable_input()->begin() + index); + + auto type = ndef.mutable_attr()->at("T").mutable_list()->mutable_type(); + type->erase(type->begin() + index); + + auto shape = ndef.mutable_attr()->at("output_shapes").mutable_list()->mutable_shape(); + shape->erase(shape->begin() + index); + } + } + } else if (node->IsCaseNode() || node->IsWhileNode() || node->IsFunctionCall()) { + int removed_nums = 0; + int arg_start_index = node->IsFunctionCall() ? 0 : 1; + for (int i = arg_start_index; i < node->num_inputs(); i++) { + if (node->input_type(i) == tensorflow::DT_RESOURCE) { + int index = i - removed_nums; + removed_nums++; + + ndef.mutable_input()->erase(ndef.mutable_input()->begin() + index); + + auto type = ndef.mutable_attr()->at("Tin").mutable_list()->mutable_type(); + type->erase(type->begin() + index - arg_start_index); + } + } + } + DLOG() << "Pruned control flow op " << ndef.DebugString(); + tensorflow::Status status; + auto pruned_node = graph->AddNode(ndef, &status); + NPU_REQUIRES_OK(status); + int pruned_input_index = 0; + for (int i = 0; i < node->num_inputs(); i++) { + const tensorflow::Edge *edge; + NPU_REQUIRES_OK(node->input_edge(i, &edge)); + if (node->input_type(i) != tensorflow::DT_RESOURCE) { + graph->AddEdge(edge->src(), edge->src_output(), pruned_node, pruned_input_index++); + } + } + for (auto n : graph->op_nodes()) { + for (auto edge : n->in_edges()) { + if (edge->src() == node) { graph->AddEdge(pruned_node, edge->src_output(), edge->dst(), edge->dst_input()); } + } + } + graph->RemoveNode(node); + } + } + for (auto node : nodes_to_remove) { graph->RemoveNode(node); } + for (auto arg_substitute : arg_substitutes) { graph->RemoveNode(arg_substitute.first); } + return tensorflow::Status::OK(); +} + +tensorflow::Status NpuDevice::MarkGraphNodeInOutDesc(TFE_Context *context, tensorflow::Graph *graph, int num_inputs, + TFE_TensorHandle **inputs) { + + tensorflow::ShapeRefiner shape_refiner(graph->versions(), npu::UnwrapCtx(context)->FuncLibDef()); + VecTensorShapes arg_shapes; + VecTensorDataTypes arg_handle_dtyes; + VecTensorPartialShapes arg_handle_shapes; + for (int i = 0; i < num_inputs; i++) { + const tensorflow::Tensor *tensor; + NPU_REQUIRES_OK(npu::UnwrapTensor(inputs[i], &tensor)); + arg_shapes.push_back({tensor->shape()}); + TensorDataTypes handle_dtyes; + TensorPartialShapes handle_shapes; + if (tensor->dtype() == tensorflow::DT_RESOURCE) { + auto handle = tensor->flat()(0); + const auto &dtypes_and_shapes = handle.dtypes_and_shapes(); + for (auto &dtype_and_shape : dtypes_and_shapes) { + handle_dtyes.push_back(dtype_and_shape.dtype); + 
handle_shapes.push_back(dtype_and_shape.shape); + } + } + arg_handle_dtyes.push_back(handle_dtyes); + arg_handle_shapes.push_back(handle_shapes); + } + + auto node_shape_inference_lambda = [&shape_refiner, num_inputs, inputs, &arg_shapes, &arg_handle_dtyes, + &arg_handle_shapes](tensorflow::Node *node) { + AssembleOpDef(node); + if (node->IsArg() && node->attrs().Find("index")) { + auto index = node->attrs().Find("index")->i(); + if (index < num_inputs && !node->attrs().Find("_output_shapes")) { + node->AddAttr("_output_shapes", arg_shapes[index]); + } + if (index < num_inputs && npu::UnwrapHandle(inputs[index])->DataType() == tensorflow::DT_RESOURCE) { + if (!node->attrs().Find("_handle_shapes")) { node->AddAttr("_handle_shapes", arg_handle_shapes[index]); } + if (!node->attrs().Find("_handle_dtypes")) { node->AddAttr("_handle_dtypes", arg_handle_dtyes[index]); } + } + } + auto status = shape_refiner.AddNode(node); + if (!status.ok()) { + LOG(INFO) << " " << node->name() << "[" << node->type_string() << "] Skip infer " << status.error_message(); + return; + } + auto node_ctx = shape_refiner.GetContext(node); + + DLOG() << "Shape of node " << node->DebugString(); + if (kDumpExecutionDetail) { + TensorDataTypes input_types; + tensorflow::InputTypesForNode(node->def(), node->op_def(), &input_types); + TensorPartialShapes input_shapes; + for (int i = 0; i < node_ctx->num_inputs(); ++i) { + tensorflow::TensorShapeProto proto; + node_ctx->ShapeHandleToProto(node_ctx->input(i), &proto); + input_shapes.emplace_back(proto); + LOG(INFO) << " input " << i << ": " << tensorflow::DataTypeString(input_types[i]) + << node_ctx->DebugString(node_ctx->input(i)); + } + } + + TensorDataTypes input_types; + TensorDataTypes output_types; + tensorflow::InOutTypesForNode(node->def(), node->op_def(), &input_types, &output_types); + + if (!input_types.empty()) { + tensorflow::AttrValue input_desc_attrs; + bool input_desc_incomplete = false; + for (auto edge : node->in_edges()) { + if (!edge->IsControlEdge()) { + auto input_attr = edge->src()->attrs().Find(kOutputDesc); + if (input_attr == nullptr) { + input_desc_incomplete = true; + LOG(WARNING) << node->DebugString() << " input node " << edge->src()->DebugString() + << " has no desc for output " << edge->src_output(); + break; + } + *input_desc_attrs.mutable_list()->add_func() = + edge->src()->attrs().Find(kOutputDesc)->list().func(edge->src_output()); + } + } + if (!input_desc_incomplete) { + node->AddAttr(kInputDesc, input_desc_attrs); + } else { + TensorPartialShapes input_shapes; + for (int i = 0; i < node_ctx->num_inputs(); ++i) { + tensorflow::TensorShapeProto proto; + node_ctx->ShapeHandleToProto(node_ctx->input(i), &proto); + input_shapes.emplace_back(proto); + } + AssembleInputDesc(input_shapes, input_types, node); + } + } + + if (!output_types.empty()) { + TensorPartialShapes output_shapes; + for (int i = 0; i < node_ctx->num_outputs(); ++i) { + tensorflow::TensorShapeProto proto; + node_ctx->ShapeHandleToProto(node_ctx->output(i), &proto); + output_shapes.emplace_back(proto); + DLOG() << " output " << i << ": " << tensorflow::DataTypeString(output_types[i]) + << node_ctx->DebugString(node_ctx->output(i)); + } + AssembleOutputDesc(output_shapes, output_types, node); + } + }; + tensorflow::ReverseDFS(*graph, {}, node_shape_inference_lambda); + return tensorflow::Status::OK(); +} + +TFE_TensorHandle *NpuDevice::NewDeviceTensorHandle(TFE_Context *context, Format fmt, + const tensorflow::TensorShape &shape, tensorflow::DataType type, + TF_Status *status) { 
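+  // Note: this allocates an NPU-managed buffer for the requested format/shape/dtype and wraps it in an eager
+  // tensor handle backed by NPU memory; NpuManagedBufferDeallocator frees the buffer when the handle is released.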
+ NpuManagedBuffer *npu_managed_buffer; + NPU_CTX_REQUIRES_OK_RETURN(status, NpuManagedBuffer::Create(fmt, shape, type, &npu_managed_buffer), nullptr); + std::vector dims; + for (auto dim_size : shape.dim_sizes()) { dims.emplace_back(dim_size); } + return TFE_NewTensorHandleFromDeviceMemory(context, device_name.c_str(), static_cast(type), dims.data(), + dims.size(), npu_managed_buffer, sizeof(npu_managed_buffer), + &NpuManagedBufferDeallocator, nullptr, status); +} + +TFE_TensorHandle *NpuDevice::NewDeviceResourceHandle(TFE_Context *context, const tensorflow::TensorShape &shape, + TF_Status *status) { + tensorflow::Tensor tensor(tensorflow::DT_RESOURCE, shape); + tensorflow::CustomDevice *custom_device = nullptr; + NPU_CTX_REQUIRES_RETURN(status, npu::UnwrapCtx(context)->FindCustomDeviceFromName(device_name, &custom_device), + tensorflow::errors::Internal("No custom device registered with name ", device_name), nullptr); + return tensorflow::wrap( + tensorflow::TensorHandle::CreateLocalHandle(std::move(tensor), custom_device, npu::UnwrapCtx(context))); +} + +TFE_TensorHandle *NpuDevice::CopyTensorD2H(TFE_Context *context, TFE_TensorHandle *tensor, TF_Status *status) { + const tensorflow::Tensor *npu_tensor; + NPU_CTX_REQUIRES_OK_RETURN(status, npu::UnwrapTensor(tensor, &npu_tensor), nullptr); + + if (npu_tensor->dtype() == tensorflow::DT_RESOURCE) { + tensorflow::ResourceHandle handle = npu_tensor->scalar()(); + status->status = + tensorflow::errors::Internal("Resources ", handle.DebugString(), " cannot be copied across devices[NPU->CPU]"); + return nullptr; + } + + const tensorflow::Tensor *local_tensor; + TFE_TensorHandle *local_handle = tensorflow::wrap( + tensorflow::TensorHandle::CreateLocalHandle(tensorflow::Tensor(npu_tensor->dtype(), npu_tensor->shape()))); + NPU_CTX_REQUIRES_RETURN(status, local_handle != nullptr, tensorflow::errors::Internal("Failed create local handle"), + nullptr); + NPU_CTX_REQUIRES_OK_RETURN(status, npu::UnwrapTensor(local_handle, &local_tensor), nullptr); + NPU_CTX_REQUIRES_OK_RETURN(status, npu::Unwrap(npu_tensor)->AssembleTo(local_tensor), local_handle); + return local_handle; +} + +TFE_TensorHandle *NpuDevice::CopyTensorH2D(TFE_Context *context, TFE_TensorHandle *tensor, TF_Status *status) { + return CopyTensorH2D(context, tensor, Format::FORMAT_ND, status); +} + +TFE_TensorHandle *NpuDevice::CopyTensorH2D(TFE_Context *context, TFE_TensorHandle *tensor, Format fmt, + TF_Status *status) { + TFE_TensorHandle *local_handle = tensor; + std::vector copied_tensor_handles; + if (!IsCpuTensorHandle(npu::UnwrapHandle(tensor))) { + local_handle = TFE_TensorHandleCopyToDevice(tensor, context, underlying_device.c_str(), status); + copied_tensor_handles.push_back(local_handle); + } + + if (TF_GetCode(status) != TF_OK) return nullptr; + const tensorflow::Tensor *local_tensor = nullptr; + NPU_CTX_REQUIRES_OK_RETURN(status, npu::UnwrapTensor(local_handle, &local_tensor), nullptr); + if (local_tensor->dtype() == tensorflow::DT_RESOURCE) { + tensorflow::ResourceHandle handle = local_tensor->scalar()(); + status->status = + tensorflow::errors::Internal("Resources ", handle.DebugString(), " cannot be copied across devices[CPU->NPU]"); + return nullptr; + } + + TFE_TensorHandle *npu_handle = + NewDeviceTensorHandle(context, fmt, local_tensor->shape(), local_tensor->dtype(), status); + if (TF_GetCode(status) != TF_OK) return nullptr; + const tensorflow::Tensor *npu_tensor = nullptr; + + NPU_CTX_REQUIRES_OK_RETURN(status, npu::UnwrapTensor(npu_handle, &npu_tensor), nullptr); + 
NPU_CTX_REQUIRES_OK_RETURN(status, npu::Unwrap(npu_tensor)->AssembleFrom(local_tensor), npu_handle);
+  for (auto handle : copied_tensor_handles) { TFE_DeleteTensorHandle(handle); }
+  return npu_handle;
+}
+
+tensorflow::Status NpuDevice::InferShape(TFE_Context *context, const tensorflow::OpRegistrationData *op_reg_data,
+                                         const tensorflow::NodeDef &ndef, int num_inputs, TFE_TensorHandle **inputs,
+                                         TensorPartialShapes &shapes, bool &requested_input_value) {
+  requested_input_value = false;
+  NPU_REQUIRES(op_reg_data->shape_inference_fn,
+               tensorflow::errors::Unimplemented("No infer shape function registered for op ", ndef.op()));
+
+  tensorflow::shape_inference::InferenceContext ic(TF_GRAPH_DEF_VERSION, ndef, op_reg_data->op_def,
+                                                   std::vector<tensorflow::shape_inference::ShapeHandle>(num_inputs),
+                                                   {}, {}, {});
+  NPU_REQUIRES_OK(ic.construction_status());
+  for (int i = 0; i < num_inputs; i++) {
+    auto input = npu::UnwrapHandle(inputs[i]);
+    tensorflow::shape_inference::ShapeHandle shape;
+    NPU_REQUIRES_OK(input->InferenceShape(&ic, &shape));
+    ic.SetInput(i, shape);
+  }
+
+  for (int i = 0; i < num_inputs; i++) {
+    auto input = inputs[i];
+    if (npu::UnwrapHandle(input)->DataType() == tensorflow::DT_RESOURCE) {
+      const tensorflow::Tensor *tensor;
+      NPU_REQUIRES_OK(npu::UnwrapTensor(input, &tensor));
+      auto handle = tensor->flat<tensorflow::ResourceHandle>()(0);
+      const auto &dtypes_and_shapes = handle.dtypes_and_shapes();
+      std::vector<tensorflow::shape_inference::ShapeAndType> inference_shapes_and_types;
+      for (auto &dtype_and_shape : dtypes_and_shapes) {
+        std::vector<tensorflow::shape_inference::DimensionHandle> dims_handle(dtype_and_shape.shape.dims());
+        for (size_t j = 0; j < dims_handle.size(); j++) {
+          dims_handle[j] = ic.MakeDim(dtype_and_shape.shape.dim_size(j));
+        }
+        inference_shapes_and_types.emplace_back(ic.MakeShape(dims_handle), dtype_and_shape.dtype);
+      }
+      ic.set_input_handle_shapes_and_types(i, inference_shapes_and_types);
+      requested_input_value = true;
+    }
+  }
+  // We may need to feed actual input tensors: TensorFlow first runs shape inference using only the input shapes.
+  // If an op's shape function depends on the value of an input tensor, that input is marked as requested on the
+  // first pass, and a second pass is run below with the real tensor values.
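+  // For example, Reshape's shape function needs the value (not just the shape) of its "shape" input, so that
+  // input is reported as requested after the first inference pass and is fed as a real tensor in the second pass.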
+ NPU_REQUIRES_OK(ic.Run(op_reg_data->shape_inference_fn)); + + std::vector input_tensors; + input_tensors.resize(num_inputs); + std::vector copied_tensor_handles; + bool input_requested = false; + for (int i = 0; i < num_inputs; i++) { + auto input = inputs[i]; + if (ic.requested_input_tensor(i)) { // If requested, this must be a normal tensor + if (IsNpuTensorHandle(npu::UnwrapHandle(input))) { + auto s = TF_NewStatus(); + if (s == nullptr) { continue; } + input = CopyTensorD2H(context, input, s); + if (TF_GetCode(s) != TF_OK) { + TF_DeleteStatus(s); + continue; + } + DLOG() << "Copying " << ndef.op() << " input:" << i << " from NPU to CPU for infer shape"; + copied_tensor_handles.push_back(input); + } + const tensorflow::Tensor *tensor; + NPU_REQUIRES_OK(npu::UnwrapTensor(input, &tensor)); + input_tensors[i] = tensor; + input_requested = true; + requested_input_value = true; + } + } + if (input_requested) { + ic.set_input_tensors(input_tensors); + NPU_REQUIRES_OK(ic.Run(op_reg_data->shape_inference_fn)); + } + + for (auto handle : copied_tensor_handles) { TFE_DeleteTensorHandle(handle); } + + for (int i = 0; i < ic.num_outputs(); i++) { + shapes.emplace_back(tensorflow::PartialTensorShape()); + tensorflow::shape_inference::ShapeHandle shape_handle = ic.output(i); + auto num_dims = ic.Rank(shape_handle); + std::vector dims; + if (num_dims == tensorflow::shape_inference::InferenceContext::kUnknownRank) { continue; } + for (auto j = 0; j < num_dims; ++j) { dims.emplace_back(ic.Value(ic.Dim(shape_handle, j))); } + NPU_REQUIRES_OK(tensorflow::PartialTensorShape::MakePartialShape(dims.data(), num_dims, &shapes[i])); + } + return tensorflow::Status::OK(); +} + +void NpuDevice::GetOrCreateSpec(TFE_Context *context, const char *op_name, const TFE_OpAttrs *attributes, + int num_inputs, TFE_TensorHandle **inputs, std::shared_ptr *spec, + TF_Status *s) { + tensorflow::NodeDef ndef; + ndef.set_op(op_name); + tensorflow::unwrap(attributes)->FillAttrValueMap(ndef.mutable_attr()); + bool request_shape = false; + GetCachedTaskSpec(ndef, spec, request_shape); + if (request_shape) { + TensorShapes input_shapes; + input_shapes.resize(num_inputs); + for (int i = 0; i < num_inputs; i++) { + NPU_CTX_REQUIRES_OK(s, npu::UnwrapHandle(inputs[i])->Shape(&input_shapes[i])); + } + GetCachedTaskSpec(ndef, input_shapes, spec); + } + if (*spec != nullptr) { + DLOG() << "Found cached task spec for " << op_name; + DLOG() << (*spec)->DebugString(); + return; + } + DLOG() << "No cached task spec for " << op_name << ", start create and cache"; + // 上面校验resource源头的,都是不可以cache的,因为resource可能在多次调用中来自不同的设备,下面的部分是可以cache的 + // NodeDef保存节点的属性,比较重要的,对于单算子,则会保存T属性,表达输入输出的type + // OpRegistrationData保存算子的IR注册信息,对于单算子,则和RegisterOp传递的信息一致,对于function,则是确定了输入的dataType的 + tensorflow::FunctionLibraryDefinition *lib_def = npu::UnwrapCtx(context)->FuncLibDef(); + const tensorflow::OpRegistrationData *op_reg_data; + NPU_CTX_REQUIRES_OK(s, lib_def->LookUp(op_name, &op_reg_data)); + bool is_function_op = op_reg_data->is_function_op; + // 判断当前算子是否是NPU Device声明支持的算子 + if (!is_function_op && !Supported(op_name)) { + *spec = CacheOpSpec(op_name, op_reg_data, ndef, {}, tensorflow::strings::StrCat("Op unsupported by NPU")); + return; + } + bool is_stateful = op_reg_data->op_def.is_stateful(); + // 这里获取输出的dataType,对于常规算子,通过NodeDef的T属性确定,对于function op,则是在ret上自带 + TensorDataTypes data_types; + NPU_CTX_REQUIRES_OK(s, tensorflow::OutputTypesForNode(ndef, op_reg_data->op_def, &data_types)); + // 如果输出的dataType不支持,或者不是支持的ResourceGenerator,则fallback + 
tensorflow::Status compat_status = ValidateOutput(op_name, data_types);
+  if (!compat_status.ok()) {
+    if (is_function_op) {
+      const static uint64_t kInvalidGeGraphId = -1;
+      *spec = CacheFuncSpec(op_name, op_reg_data, ndef, kInvalidGeGraphId, {}, {}, {}, compat_status.error_message());
+      return;
+    } else {
+      *spec = CacheOpSpec(op_name, op_reg_data, ndef, {}, compat_status.error_message());
+      return;
+    }
+  }
+  // For function ops the graph must be optimized first, and only then can NPU compatibility be judged
+  if (is_function_op) {  // Optimize the function op's graph and cache it; if a fallback is needed, the reason is also recorded in the spec
+    const tensorflow::FunctionDef *fdef = lib_def->Find(op_name);
+    std::unique_ptr<tensorflow::Graph> optimize_graph = std::make_unique<tensorflow::Graph>(lib_def);
+    std::unique_ptr<tensorflow::FunctionBody> fbody;
+    tensorflow::ProcessFunctionLibraryRuntime *pflr = npu::UnwrapCtx(context)->pflr();
+    tensorflow::FunctionLibraryRuntime *flr = pflr->GetFLR("/job:localhost/replica:0/task:0/device:CPU:0");
+    FunctionDefToBodyHelper(*fdef, tensorflow::AttrSlice(&ndef.attr()), lib_def, &fbody);
+    CopyGraph(*fbody->graph, optimize_graph.get());
+    std::string file_name_suffix = std::string(op_name) + ".pbtxt";
+    if (kDumpExecutionDetail || kDumpGraph) {
+      WriteTextProto(tensorflow::Env::Default(), "step_0_before_optimize_" + file_name_suffix,
+                     optimize_graph->ToGraphDefDebug());
+    }
+
+    tensorflow::OptimizeGraph(flr, &optimize_graph);
+
+    if (kDumpExecutionDetail || kDumpGraph) {
+      WriteTextProto(tensorflow::Env::Default(), "step_1_after_optimize_" + file_name_suffix,
+                     optimize_graph->ToGraphDefDebug());
+    }
+
+    std::vector<tensorflow::ResourceHandle> dependent_host_resources;
+    NPU_CTX_REQUIRES_OK(
+      s, TransResourceInput2GraphNode(context, optimize_graph.get(), num_inputs, inputs, dependent_host_resources));
+    if (kDumpExecutionDetail || kDumpGraph) {
+      WriteTextProto(tensorflow::Env::Default(), "step_2_after_assemble_resource_node_" + file_name_suffix,
+                     optimize_graph->ToGraphDefDebug());
+    }
+
+    PruneFunction(*fdef, optimize_graph.get());
+
+    // TODO: The parser requires the marked attributes not to start with an underscore. Marking the graph directly
+    // would make it impossible to fall back to TF execution, so for now a copy of the graph is marked instead
+    std::unique_ptr<tensorflow::Graph> mark_shape_graph = std::make_unique<tensorflow::Graph>(lib_def);
+    CopyGraph(*optimize_graph, mark_shape_graph.get());
+    DLOG() << "NPU Start inferring shape for function node " << op_name;
+    MarkGraphNodeInOutDesc(context, mark_shape_graph.get(), num_inputs, inputs);
+    FixGraphArgRetvalIndex(mark_shape_graph.get());  // Arg nodes may be optimized away, so re-number their indexes and prune the inputs
+
+    if (kDumpExecutionDetail || kDumpGraph) {
+      tensorflow::GraphDef gdef;
+      mark_shape_graph->ToGraphDef(&gdef);
+      tensorflow::FunctionDefLibrary fdef_lib;
+      for (const auto &fn : lib_def->ListFunctionNames()) { *fdef_lib.add_function() = *lib_def->Find(fn); }
+      *gdef.mutable_library() = fdef_lib;
+      WriteTextProto(tensorflow::Env::Default(), "step_3_after_mark_shape_" + file_name_suffix, gdef);
+    }
+    // Because the extra attributes agreed with the parser are not anonymous (they do not start with an underscore),
+    // a separate copy of the graph is used here to carry the parser-required attributes
+    tensorflow::GraphDef function_graph_def;
+    mark_shape_graph->ToGraphDef(&function_graph_def);
+    uint64_t graph_id =
+      kCustomKernelEnabled ?
AddGeGraph(context, std::string("tf_function_") + op_name, function_graph_def, s) : 0; + if (TF_GetCode(s) != TF_OK) return; + + std::vector remain_indexes; + std::vector pruned_inputs; + for (auto node : optimize_graph->nodes()) { + if (node->IsArg()) { + auto index = node->attrs().Find("index")->i(); + remain_indexes.push_back(index); + pruned_inputs.push_back(inputs[index]); + } + } + FixGraphArgRetvalIndex(optimize_graph.get()); // 必须在保存完remain index后fix arg index + DLOG() << std::string("tf_function_") + op_name << " remained input index (0-" << num_inputs - 1 << ") -> " + << VecToString(remain_indexes); + auto lambda = [remain_indexes](int num_inputs, TFE_TensorHandle **inputs, std::vector &pruned) { + for (auto index : remain_indexes) { pruned.push_back(inputs[index]); } + }; + // 对于function节点,可以将resource的输入NPU兼容性作为缓存项目,校验输入是否被NPU支持,如果类型不支持,或者是CPU的Resouce类型,则不支持 + // 如果是单算子,则不能缓存,需要在每次dev->Run的时候,校验单算子资源输入的兼容性 + *spec = + CacheFuncSpec(op_name, op_reg_data, ndef, graph_id, std::move(optimize_graph), lambda, dependent_host_resources, + ValidateInput(op_name, pruned_inputs.size(), pruned_inputs.data()).error_message()); + return; + } else { + // 进行inferShape,输出可能是unknown shape,所以使用partial shape + TensorShapes input_shapes; + input_shapes.resize(num_inputs); + for (int i = 0; i < num_inputs; i++) { + NPU_CTX_REQUIRES_OK(s, npu::UnwrapHandle(inputs[i])->Shape(&input_shapes[i])); + } + TensorPartialShapes partial_shapes; + bool requested_input_value = false; + if (!data_types.empty()) { + DLOG() << "Infer shape for op " << op_name; + tensorflow::Status infer_status = + InferShape(context, op_reg_data, ndef, num_inputs, inputs, partial_shapes, requested_input_value); + // 如果inferShape失败,或者期望输出数量不对,则fallback回CPU,因为CPU的计算并不依赖inferShape + if (!infer_status.ok()) { + *spec = CacheOpSpec(op_name, op_reg_data, ndef, input_shapes, partial_shapes, infer_status.error_message()); + return; + } + } else { + DLOG() << "Skip infer shape for non-output op " << op_name; + } + const std::string reason = ValidateInput(op_name, num_inputs, inputs).error_message(); + if (requested_input_value) { + *spec = CacheOpSpec(op_name, op_reg_data, ndef, input_shapes, reason); + } else { + *spec = CacheOpSpec(op_name, op_reg_data, ndef, input_shapes, partial_shapes, reason); + } + return; + } +} + +void NpuDevice::FallbackCPU(TFE_Context *context, const char *op_name, const TFE_OpAttrs *attributes, int num_inputs, + TFE_TensorHandle **inputs, int *num_outputs, TFE_TensorHandle **outputs, + TF_Status *status) { + DLOG() << "Start fallback executing " << op_name << " by " << underlying_device; + TFE_Op *op(TFE_NewOp(context, op_name, status)); + if (TF_GetCode(status) != TF_OK) return; + TFE_OpAddAttrs(op, attributes); + TFE_OpSetDevice(op, underlying_device.c_str(), status); + std::vector copied_tensor_handles; //最后需要释放掉临时拷贝而来的输入cpu handle + for (int j = 0; j < num_inputs; ++j) { + TFE_TensorHandle *input = inputs[j]; + if (IsNpuTensorHandle(npu::UnwrapHandle(input))) { + input = CopyTensorD2H(context, input, status); // 创建完成计数为1 + copied_tensor_handles.emplace_back(input); + if (TF_GetCode(status) != TF_OK) return; + } + if (kDumpExecutionDetail) { + const tensorflow::Tensor *tensor = nullptr; + npu::UnwrapTensor(input, &tensor); + LOG(INFO) << " input " << j << " " << tensor->DebugString(); + } + TFE_OpAddInput(op, input, status); // add完成计数为2 + if (TF_GetCode(status) != TF_OK) return; + } + + std::vector op_outputs(*num_outputs); + TFE_Execute(op, op_outputs.data(), num_outputs, status); + TFE_DeleteOp(op); + for 
(auto handle : copied_tensor_handles) { TFE_DeleteTensorHandle(handle); }
+  if (TF_GetCode(status) != TF_OK) return;
+  for (int i = 0; i < *num_outputs; ++i) { outputs[i] = op_outputs[i]; }
+
+  NpuFallbackHookFunc *hook = nullptr;
+  if (CustomKernelRegistry::Instance().GetFallbackHookFunc(op_name, &hook)) {
+    (*hook)(context, this, op_name, attributes, num_inputs, inputs, *num_outputs, outputs, status);
+    if (TF_GetCode(status) != TF_OK) return;
+  }
+}
+
+void NpuDevice::Execute(const TFE_Op *op, int *num_outputs, TFE_TensorHandle **outputs, TF_Status *s) {
+  auto context = TFE_OpGetContext(op, s);
+  if (TF_GetCode(s) != TF_OK) { return; }
+  auto num_inputs = TFE_OpGetFlatInputCount(op, s);
+  if (TF_GetCode(s) != TF_OK) { return; }
+  std::vector<TFE_TensorHandle *> inputs;
+  for (int i = 0; i < num_inputs; i++) {
+    inputs.push_back(TFE_OpGetFlatInput(op, i, s));
+    if (TF_GetCode(s) != TF_OK) { return; }
+  }
+  auto op_name = TFE_OpGetName(op, s);
+  if (TF_GetCode(s) != TF_OK) { return; }
+  auto attributes = TFE_OpGetAttrs(op);
+  DLOG() << "NPU Start executing " << op_name;
+  // If an op has inputs coming from more than one device, an error must be reported directly
+  bool cpu_resource = false;
+  NPU_CTX_REQUIRES_OK(s, ValidateResourcePlacement(op_name, num_inputs, inputs.data(), cpu_resource));
+  // If the op has a resource input that lives on the CPU, it must fall back to the CPU
+  if (cpu_resource) {
+    DLOG() << "NPU Executing " << op_name << " fallback[input resource from cpu]";
+    FallbackCPU(context, op_name, attributes, inputs.size(), inputs.data(), num_outputs, outputs, s);
+    return;
+  }
+  std::shared_ptr<const npu::TaskSpec> spec;
+  GetOrCreateSpec(context, op_name, attributes, inputs.size(), inputs.data(), &spec, s);
+  if (TF_GetCode(s) != TF_OK) { return; }
+  DLOG() << "NPU Executing " << op_name << " found cached spec " << spec->DebugString();
+  if (spec->ShouldFallback()) {
+    DLOG() << "NPU Executing " << op_name << " fallback[" << spec->FallbackReason() << "]";
+    FallbackCPU(context, op_name, attributes, inputs.size(), inputs.data(), num_outputs, outputs, s);
+    if (TF_GetCode(s) != TF_OK) {
+      LOG(ERROR) << "NPU Executing " << op_name << " fallback failed";
+      std::stringstream ss;
+      ss << spec->DebugString() << std::endl;
+      for (int i = 0; i < num_inputs; i++) {
+        tensorflow::Status status;
+        const tensorflow::Tensor *tensor = nullptr;
+        npu::UnwrapHandle(inputs[i])->DeviceName(&status);
+        npu::UnwrapTensor(inputs[i], &tensor);
+        ss << "input " << i << " " << tensorflow::DataTypeString(tensor->dtype()) << " device "
+           << npu::UnwrapHandle(inputs[i])->DeviceName(&status) << std::endl;
+      }
+      LOG(ERROR) << ss.str();
+    }
+  } else {
+    DLOG() << "NPU Executing " << op_name << " dispatched to npu executor";
+    Run(context, spec, inputs.size(), inputs.data(), num_outputs, outputs, s);
+  }
+}
+
+void NpuDevice::Run(TFE_Context *context, std::shared_ptr<const npu::TaskSpec> spec, int num_inputs,
+                    TFE_TensorHandle **inputs, int *num_outputs, TFE_TensorHandle **outputs, TF_Status *status) {
+  if (spec->IsFunctionOp()) {
+    DLOG() << "NPU Executor start executing function op " << spec->Op();
+    RunGraph(context, reinterpret_cast<const npu::FuncSpec *>(spec.get()), num_inputs, inputs, num_outputs, outputs,
+             status);
+  } else {
+    DLOG() << "NPU Executor start executing normal op " << spec->Op();
+    RunOp(context, reinterpret_cast<const npu::OpSpec *>(spec.get()), num_inputs, inputs, num_outputs, outputs, status);
+  }
+}
+
+void NpuDevice::RunOp(TFE_Context *context, const npu::OpSpec *spec, int num_inputs, TFE_TensorHandle **inputs,
+                      int *num_outputs, TFE_TensorHandle **outputs, TF_Status *status) {
+  TensorShapes output_shapes;
+  tensorflow::NodeDef parser_ndef = spec->ParserNodeDef();
+  if
(spec->ShouldInferShape()) { + DLOG() << "NPU Executing op " << spec->Op() << " need re-infer shape"; + TensorPartialShapes partial_shapes; + bool unused = false; + bool should_fallback = + !InferShape(context, spec->OpRegistrationData(), spec->NodeDef(), num_inputs, inputs, partial_shapes, unused) + .ok(); + if (!should_fallback) { + output_shapes.resize(partial_shapes.size()); + for (size_t i = 0; i < partial_shapes.size(); i++) { + DLOG() << "NPU Executing op " << spec->Op() << " re-infer shape output " << i + << partial_shapes[i].DebugString(); + if (!partial_shapes[i].AsTensorShape(&output_shapes[i])) { + should_fallback = true; + break; + } + } + } + if (should_fallback) { + DLOG() << "NPU Executing op " << spec->Op() << " fallback cpu after re-infer shape"; + tensorflow::AttrBuilder attr_builder; + attr_builder.Reset(spec->Op().c_str()); + attr_builder.BuildNodeDef(); + auto attrs = spec->NodeDef().attr(); + for (auto &attr : attrs) { attr_builder.Set(attr.first, attr.second); } + FallbackCPU(context, spec->Op().c_str(), tensorflow::wrap(&attr_builder), num_inputs, inputs, num_outputs, + outputs, status); + return; + } + AssembleOutputDesc(output_shapes, spec->OutputTypes(), &parser_ndef); + } else { + output_shapes = spec->OutputShapes(); + } + + if (kCustomKernelEnabled) { + NpuCustomKernelFunc *custom_kernel = nullptr; + if (CustomKernelRegistry::Instance().GetCustomKernelFunc(spec->Op(), &custom_kernel)) { + (*custom_kernel)(context, this, spec, output_shapes, parser_ndef, num_inputs, inputs, *num_outputs, outputs, + status); + return; + } + } + + // 输入如果是CPU,此时要转换成NPU + std::vector npu_inputs(num_inputs); + std::vector copied_tensor_handles; + for (int i = 0; i < num_inputs; ++i) { + TFE_TensorHandle *input = inputs[i]; + // 到达这里的Resource,要么是CPU的镜像 要么是NPU + if (!IsNpuTensorHandle(npu::UnwrapHandle(input)) + && npu::UnwrapHandle(input)->DataType() != tensorflow::DT_RESOURCE) { + tensorflow::Status s; + auto src_name = npu::UnwrapHandle(input)->DeviceName(&s); + NPU_CTX_REQUIRES_OK(status, s); + DLOG() << "Copying " << spec->Op() << " input:" << i + << " type:" << tensorflow::DataTypeString(npu::UnwrapHandle(input)->DataType()) << " to NPU from " + << src_name << " for acl executing"; + // 这里需要根据算子选择输入格式了 + input = CopyTensorH2D(context, input, Format::FORMAT_ND, status); + copied_tensor_handles.emplace_back(input); + if (TF_GetCode(status) != TF_OK) return; + } + npu_inputs[i] = input; + } + const auto &output_types = spec->OutputTypes(); + for (size_t i = 0; i < output_types.size(); ++i) { + if (output_types[i] == tensorflow::DT_RESOURCE) { + outputs[i] = NewDeviceResourceHandle(context, output_shapes[i], status); + if (TF_GetCode(status) != TF_OK) { return; } + } else { + outputs[i] = NewDeviceTensorHandle(context, Format::FORMAT_ND, output_shapes[i], output_types[i], status); + if (TF_GetCode(status) != TF_OK) { return; } + } + } + /******************************************模拟NPU执行Start************************************/ + // TODO:下面换成真实的ACL调用即可,当前直接FallbackCPU + // npu_inputs 指向NPU内存的TFE_TensorHandle** + // outputs 指向NPU内存的TFE_TensorHandle** + // parser_ndef 打了输入输出描述的ndef,需要优化,后续直接存储ACL的结构体 + // copied_tensor_handles 存储临时申请的TFE_TensorHandle对象,除输入输出外,必须在最后显式释放 + // output_shapes 临时变量,算子的输出shape + // spec 待运算算子的说明信息,必定包含InputShapes(),InputTypes(),OutputTypes(),不一定包含OutputShapes()(因为有的算子inferShape依赖输入的值(如reshape),输出shape需要使用上面的output_shapes临时变量) + + /* + 从TFE_TensorHandle*获取NpuManagedBuffer: + const tensorflow::Tensor *npu_tensor = nullptr; + NPU_CTX_REQUIRES_OK(status, 
npu::UnwrapTensor(npu_inputs[i], &npu_tensor)); + npu::Unwrap(npu_tensor); // 返回值就是NpuManagedBuffer* + */ + std::vector acl_inputs(num_inputs); + for (int i = 0; i < num_inputs; ++i) { + const tensorflow::Tensor *npu_tensor = nullptr; + NPU_CTX_REQUIRES_OK(status, npu::UnwrapTensor(npu_inputs[i], &npu_tensor)); + tensorflow::Tensor cpu_tensor(npu_tensor->dtype(), npu_tensor->shape()); + if (npu_tensor->dtype() == tensorflow::DT_RESOURCE) { + for (int j = 0; j < npu_tensor->NumElements(); j++) { + cpu_tensor.flat()(j) = + const_cast(npu_tensor)->flat()(j); + } + } else { + NPU_CTX_REQUIRES_OK(status, npu::Unwrap(npu_tensor)->AssembleTo(&cpu_tensor)); + } + acl_inputs[i] = tensorflow::wrap(tensorflow::TensorHandle::CreateLocalHandle(cpu_tensor)); + copied_tensor_handles.push_back(acl_inputs[i]); + if (TF_GetCode(status) != TF_OK) return; + } + /**********调用CPU模拟NPU Start*************/ + std::vector acl_outputs(*num_outputs); + tensorflow::AttrBuilder attr_builder; + attr_builder.Reset(spec->Op().c_str()); + attr_builder.BuildNodeDef(); + auto attrs = spec->NodeDef().attr(); + for (auto &attr : attrs) { attr_builder.Set(attr.first, attr.second); } + + FallbackCPU(context, spec->Op().c_str(), tensorflow::wrap(&attr_builder), num_inputs, acl_inputs.data(), num_outputs, + acl_outputs.data(), status); + if (TF_GetCode(status) != TF_OK) return; + /**********调用CPU模拟NPU End*************/ + for (int i = 0; i < *num_outputs; ++i) { + const tensorflow::Tensor *acl_tensor = nullptr; + NPU_CTX_REQUIRES_OK(status, npu::UnwrapTensor(acl_outputs[i], &acl_tensor)); + const tensorflow::Tensor *npu_tensor = nullptr; + NPU_CTX_REQUIRES_OK(status, npu::UnwrapTensor(outputs[i], &npu_tensor)); + if (spec->OutputTypes()[i] == tensorflow::DT_RESOURCE) { + for (int j = 0; j < npu_tensor->NumElements(); j++) { + const_cast(npu_tensor)->flat()(j) = + acl_tensor->flat()(j); + } + } else { + NPU_CTX_REQUIRES_OK(status, npu::Unwrap(npu_tensor)->AssembleFrom(acl_tensor)); + } + TFE_DeleteTensorHandle(acl_outputs[i]); + if (TF_GetCode(status) != TF_OK) return; + } + /******************************************模拟NPU执行End************************************/ + DLOG() << "NPU Executing op " << spec->Op() << " succeed by npu excutor"; + for (auto handle : copied_tensor_handles) { TFE_DeleteTensorHandle(handle); } // 计数-2 +} + +void NpuDevice::RunGraph(TFE_Context *context, const npu::FuncSpec *spec, int tf_num_inputs, + TFE_TensorHandle **tf_inputs, int *num_outputs, TFE_TensorHandle **outputs, + TF_Status *status) { + std::vector pruned_inputs; + spec->PruneInputs(tf_num_inputs, tf_inputs, pruned_inputs); + int num_inputs = pruned_inputs.size(); + TFE_TensorHandle **inputs = pruned_inputs.data(); + // 注意,因为GE当前执行图的时候,输入输出内存都是Host的,所以这里和ACL执行相反,如果输入是NPU,则需要转回CPU,特别的,对于资源类,当前采取的策略是资源入图 + // 输入如果是NPU,此时要转换成CPU + std::vector npu_inputs(num_inputs); + std::vector copied_tensor_handles; + for (int i = 0; i < num_inputs; ++i) { + TFE_TensorHandle *input = inputs[i]; + // 到达这里的Resource,要么是CPU的镜像 要么是NPU + if (IsNpuTensorHandle(npu::UnwrapHandle(input)) + && npu::UnwrapHandle(input)->DataType() != tensorflow::DT_RESOURCE) { + tensorflow::Status tf_status; + auto src_name = npu::UnwrapHandle(input)->DeviceName(&tf_status); + NPU_CTX_REQUIRES_OK(status, tf_status); + DLOG() << "Copying " << spec->Op() << " input:" << i + << " type:" << tensorflow::DataTypeString(npu::UnwrapHandle(input)->DataType()) << " to " << src_name + << " from NPU for graph engine executing"; + // 这里需要根据算子选择输入格式了 + input = CopyTensorD2H(context, input, status); + 
copied_tensor_handles.emplace_back(input); + if (TF_GetCode(status) != TF_OK) return; + } + npu_inputs[i] = input; + } + + if (kCustomKernelEnabled) { + // TODO:这里根据小循环策略修改值 + int64_t iterations_per_loop = kGlobalLoopSize; + size_t num_dependent_resources = spec->DependentHostResources().size(); + for (const auto &resource : spec->DependentHostResources()) { + LOG(INFO) << "Start consume iterator resource " << resource.name() << " " << iterations_per_loop << " times"; + // 注意,这个callback不能引用捕获,防止中途因为消费某个资源失败而导致coredump + auto done = [resource, iterations_per_loop](const tensorflow::Status &s) { + LOG(INFO) << "Iterator resource " << resource.name() << " consume " << iterations_per_loop + << " times done with status " << s.ToString(); + }; + NPU_CTX_REQUIRES_OK(status, ConsumeIteratorAsync(resource, iterations_per_loop, done)); + } + LOG(INFO) << "Start run ge graph " << spec->GeGraphId() << " pin to cpu, loop size " << iterations_per_loop; + npu::Timer timer("Graph engine run ", iterations_per_loop, " times for graph ", spec->GeGraphId()); + timer.Start(); + RunGeGraphPin2Cpu(context, spec->GeGraphId(), num_inputs, inputs, spec->OutputTypes(), *num_outputs, outputs, + status); + timer.Stop(); + return; + } + /******************************************模拟NPU执行Start************************************/ + // TODO:下面换成真实的GE调用即可,当前直接FallbackCPU + // inputs 指向CPU内存的TFE_TensorHandle** + // copied_tensor_handles 存储临时申请的TFE_TensorHandle对象,除输入输出外,必须在最后显式释放 + // output_shapes 临时变量,算子的输出shape + // spec 待运算算子的说明信息,必定包含InputShapes(),InputTypes(),OutputTypes(),Graph(),GeGraphId(),不包含OutputShapes() + + std::vector acl_inputs(num_inputs); + for (int i = 0; i < num_inputs; ++i) { + if (IsNpuTensorHandle(npu::UnwrapHandle(npu_inputs[i])) + && npu::UnwrapHandle(npu_inputs[i])->DataType() == tensorflow::DT_RESOURCE) { + const tensorflow::Tensor *npu_tensor = nullptr; + NPU_CTX_REQUIRES_OK(status, npu::UnwrapTensor(npu_inputs[i], &npu_tensor)); + tensorflow::Tensor cpu_tensor(npu_tensor->dtype(), npu_tensor->shape()); + for (int j = 0; j < npu_tensor->NumElements(); j++) { + cpu_tensor.flat()(j) = + const_cast(npu_tensor)->flat()(j); + } + acl_inputs[i] = tensorflow::wrap(tensorflow::TensorHandle::CreateLocalHandle(cpu_tensor)); + copied_tensor_handles.push_back(acl_inputs[i]); + } else { + acl_inputs[i] = npu_inputs[i]; + } + } + /**********调用CPU模拟NPU Start*************/ + std::vector acl_outputs(*num_outputs); + tensorflow::FunctionDef optimized_fdef; + tensorflow::FunctionLibraryDefinition *lib_def = npu::UnwrapCtx(context)->FuncLibDef(); + auto fdef = lib_def->Find(spec->Op()); + auto lookup = [&fdef](const tensorflow::Node *node) -> absl::optional { + for (const auto &control_ret : fdef->control_ret()) { + if (control_ret.second == node->name()) { return absl::make_optional(node->name()); } + } + return absl::nullopt; + }; + std::string acl_op_name = std::string(spec->Op()) + "_npu_optimized"; + tensorflow::GraphToFunctionDef(*spec->Graph(), acl_op_name, lookup, &optimized_fdef); + lib_def->RemoveFunction(acl_op_name); + lib_def->AddFunctionDef(optimized_fdef); + + tensorflow::AttrBuilder attr_builder; + attr_builder.Reset(spec->Op().c_str()); + attr_builder.BuildNodeDef(); + auto attrs = spec->NodeDef().attr(); + for (auto &attr : attrs) { attr_builder.Set(attr.first, attr.second); } + + FallbackCPU(context, acl_op_name.c_str(), tensorflow::wrap(&attr_builder), num_inputs, acl_inputs.data(), num_outputs, + acl_outputs.data(), status); + if (TF_GetCode(status) != TF_OK) return; + /**********调用CPU模拟NPU 
End*************/ + for (int i = 0; i < *num_outputs; ++i) { + const tensorflow::Tensor *acl_tensor = nullptr; + NPU_CTX_REQUIRES_OK(status, npu::UnwrapTensor(acl_outputs[i], &acl_tensor)); + /**********回调Start*********/ + if (acl_tensor->dtype() == tensorflow::DT_RESOURCE) { + outputs[i] = NewDeviceResourceHandle(context, acl_tensor->shape(), status); + if (TF_GetCode(status) != TF_OK) { return; } + } else { + outputs[i] = NewDeviceTensorHandle(context, Format::FORMAT_ND, acl_tensor->shape(), acl_tensor->dtype(), status); + if (TF_GetCode(status) != TF_OK) { return; } + } + /**********回调End*********/ + const tensorflow::Tensor *npu_tensor = nullptr; + NPU_CTX_REQUIRES_OK(status, npu::UnwrapTensor(outputs[i], &npu_tensor)); + if (acl_tensor->dtype() == tensorflow::DT_RESOURCE) { + for (int j = 0; j < npu_tensor->NumElements(); j++) { + const_cast(npu_tensor)->flat()(j) = + acl_tensor->flat()(j); + } + } else { + NPU_CTX_REQUIRES_OK(status, npu::Unwrap(npu_tensor)->AssembleFrom(acl_tensor)); + } + TFE_DeleteTensorHandle(acl_outputs[i]); + if (TF_GetCode(status) != TF_OK) return; + } + /******************************************模拟NPU执行End************************************/ + DLOG() << "NPU Executing function op " << spec->Op() << " succeed by npu executor"; + for (auto handle : copied_tensor_handles) { TFE_DeleteTensorHandle(handle); } // 计数-2 +} + +void NpuDevice::RunGeGraphAsync(TFE_Context *context, uint64_t graph_id, int num_inputs, TFE_TensorHandle **inputs, + bool pin_to_npu, const TensorDataTypes &output_types, int num_outputs, + TFE_TensorHandle **outputs, DoneCallback done, TF_Status *status) { + std::vector ge_inputs; + + DLOG() << "Ge graph " << graph_id << " input info"; + for (int i = 0; i < num_inputs; i++) { + const tensorflow::Tensor *tensor = nullptr; + npu::UnwrapTensor(inputs[i], &tensor); + + const static std::shared_ptr parser = + domi::ModelParserFactory::Instance()->CreateModelParser(domi::FrameworkType::TENSORFLOW); + if (parser == nullptr) { + status->status = tensorflow::errors::Internal("NPU Create new tensorflow model parser failed"); + return; + } + ge::DataType ge_type = parser->ConvertToGeDataType(static_cast(tensor->dtype())); + NPU_CTX_REQUIRES(status, ge_type != ge::DT_UNDEFINED, + tensorflow::errors::InvalidArgument("Failed map tensorflow data type ", + tensorflow::DataTypeString(tensor->dtype()), + " to ge data type")); + ge::InputTensorInfo input; + input.data_type = static_cast(ge_type); + for (auto dim_size : tensor->shape().dim_sizes()) { input.dims.emplace_back(dim_size); } + input.data = const_cast(tensor->tensor_data().data()); + input.length = tensor->TotalBytes(); + ge_inputs.emplace_back(input); + DLOG() << " input " << i << " ge enum " << input.data_type << " tf type " + << tensorflow::DataTypeString(tensor->dtype()) << VecToString(input.dims); + } + auto ge_callback = [&, graph_id](ge::Status s, std::vector &ge_outputs) { + if (s == ge::END_OF_SEQUENCE) { + done(tensorflow::errors::OutOfRange("Graph engine process graph ", graph_id, " reach end of sequence")); + return; + } else if (s != ge::SUCCESS) { + std::string err_msg = ge::StatusFactory::Instance()->GetErrDesc(s); + if (err_msg.empty()) { err_msg = " code:" + std::to_string(s); } + done(tensorflow::errors::Internal("Graph engine process graph failed: ", err_msg)); + return; + } else if (ge_outputs.size() != num_outputs) { + done(tensorflow::errors::Internal("Graph engine process graph succeed but output num ", ge_outputs.size(), + " mismatch with expected ", num_outputs)); + return; + 
} + + DLOG() << "Ge graph " << graph_id << " output info"; + for (size_t i = 0; i < ge_outputs.size(); i++) { + auto &ge_tensor = ge_outputs[i]; + std::vector dims; + for (auto dim_size : ge_tensor.dims) { dims.push_back(dim_size); } + tensorflow::TensorShape shape; + tensorflow::Status tf_status = tensorflow::TensorShapeUtils::MakeShape(dims.data(), dims.size(), &shape); + if (!tf_status.ok()) { + done(tensorflow::errors::Internal("Graph engine process graph succeed but output ", i, " dims invalid ", + VecToString(ge_tensor.dims), " ", tf_status.error_message())); + return; + } + DLOG() << " output " << i << " ge type enum " << ge_tensor.data_type << " tf type " + << tensorflow::DataTypeString(output_types[i]) << shape.DebugString(); + + const static int64_t kTensorAlignBytes = 64; + if (reinterpret_cast(ge_tensor.data.get()) % kTensorAlignBytes == 0) { + DLOG() << "Zero copy ge tensor " << reinterpret_cast(ge_tensor.data.get()) << " as aligned with " + << kTensorAlignBytes << " bytes"; + tensorflow::Allocator *allocator = NpuHostFixedAllocator::Create(std::move(ge_tensor.data)); + tensorflow::Tensor cpu_tensor(allocator, output_types[i], shape); + if (ge_tensor.length != cpu_tensor.TotalBytes()) { + done(tensorflow::errors::Internal("Graph engine process graph succeed but output ", i, " total bytes ", + ge_tensor.length, " mismatch with expected ", cpu_tensor.TotalBytes())); + return; + } + outputs[i] = tensorflow::wrap(tensorflow::TensorHandle::CreateLocalHandle(cpu_tensor)); + } else { + DLOG() << "Skip zero copy as ge tensor " << reinterpret_cast(ge_tensor.data.get()) + << " not aligned with " << kTensorAlignBytes << " bytes"; + tensorflow::Tensor cpu_tensor(output_types[i], shape); + if (ge_tensor.length != cpu_tensor.TotalBytes()) { + done(tensorflow::errors::Internal("Graph engine process graph succeed but output ", i, " total bytes ", + ge_tensor.length, " mismatch with expected ", cpu_tensor.TotalBytes())); + return; + } + memcpy(const_cast(cpu_tensor.tensor_data().data()), ge_tensor.data.get(), ge_tensor.length); + outputs[i] = tensorflow::wrap(tensorflow::TensorHandle::CreateLocalHandle(cpu_tensor)); + } + + if (pin_to_npu) { + TFE_TensorHandle *handle = outputs[i]; + outputs[i] = CopyTensorH2D(context, handle, status); + TFE_DeleteTensorHandle(handle); + if (TF_GetCode(status) != TF_OK) { + done(tensorflow::Status(status->status.code(), + std::string("Graph engine process graph succeed but copy output ") + std::to_string(i) + + " to npu failed " + status->status.error_message())); + return; + } + } + } + done(tensorflow::Status::OK()); + }; + NPU_CTX_REQUIRES_GE_OK(status, "NPU Schedule graph to graph engine", + ge_session_->RunGraphAsync(graph_id, ge_inputs, ge_callback)); +} + +uint64_t NpuDevice::AddGeGraph(TFE_Context *context, const std::string &name, const tensorflow::GraphDef &def, + TF_Status *status) { + uint64_t graph_id = NextUUID(); + auto ge_compute_graph = std::make_shared(name); + std::shared_ptr parser = + domi::ModelParserFactory::Instance()->CreateModelParser(domi::FrameworkType::TENSORFLOW); + if (parser == nullptr) { + status->status = tensorflow::errors::Internal("NPU Create new tensorflow model parser failed"); + return graph_id; + } + + auto request_subgraph = [this, name, context](const google::protobuf::Message *root_proto, + const std::string &fn) -> std::unique_ptr { + DLOG() << "Tensorflow model parser requesting subgraph " << fn << " for ge graph " << name; + tensorflow::FunctionLibraryDefinition *lib_def = npu::UnwrapCtx(context)->FuncLibDef(); + 
const tensorflow::FunctionDef *fdef = lib_def->Find(fn); + if (fdef == nullptr) { return nullptr; } + std::unique_ptr fbody; + auto status = FunctionDefToBodyHelper(*fdef, tensorflow::AttrSlice{}, lib_def, &fbody); + if (!status.ok()) { + LOG(ERROR) << "Failed trans function body to graph"; + return nullptr; + } + + tensorflow::ProcessFunctionLibraryRuntime *pflr = npu::UnwrapCtx(context)->pflr(); + tensorflow::FunctionLibraryRuntime *flr = pflr->GetFLR("/job:localhost/replica:0/task:0/device:CPU:0"); + + std::unique_ptr graph = std::make_unique(lib_def); + CopyGraph(*fbody->graph, graph.get()); + tensorflow::OptimizeGraph(flr, &graph); + + PruneFunction(*fdef, graph.get()); + + MarkGraphNodeInOutDesc(context, graph.get(), 0, nullptr); + std::unique_ptr subgraph; + subgraph.reset(new (std::nothrow) tensorflow::GraphDef()); + if (subgraph != nullptr) { graph->ToGraphDef(reinterpret_cast(subgraph.get())); } + if (kDumpExecutionDetail || kDumpGraph) { + WriteTextProto(tensorflow::Env::Default(), name + "_subgraph_" + fn + ".pbtxt", *subgraph); + } + return subgraph; + }; + + NPU_CTX_REQUIRES_GE_OK_RETURN(status, "NPU Parse tensorflow model", + parser->ParseProtoWithSubgraph(&def, request_subgraph, ge_compute_graph), graph_id); + + ge::Graph ge_graph = ge::GraphUtils::CreateGraphFromComputeGraph(ge_compute_graph); + NPU_CTX_REQUIRES_GE_OK_RETURN(status, "Graph engine Add graph", GeSession()->AddGraph(graph_id, ge_graph), graph_id); + return graph_id; +} + +void NpuDevice::RemoveGeGraph(TFE_Context *context, uint64_t graph_id, TF_Status *status) { + NPU_CTX_REQUIRES_GE_OK(status, "Graph engine Remove graph", GeSession()->RemoveGraph(graph_id)); +} + +void NpuDevice::RunGeGraph(TFE_Context *context, uint64_t graph_id, int num_inputs, TFE_TensorHandle **inputs, + bool pin_to_npu, const TensorDataTypes &output_types, int num_outputs, + TFE_TensorHandle **outputs, TF_Status *status) { + tensorflow::Notification notification; + auto done = [status, ¬ification](tensorflow::Status s) { + status->status = std::move(s); + notification.Notify(); + }; + RunGeGraphAsync(context, graph_id, num_inputs, inputs, pin_to_npu, output_types, num_outputs, outputs, done, status); + notification.WaitForNotification(); +} + +void NpuDevice::RunGeGraphPin2CpuAsync(TFE_Context *context, uint64_t graph_id, int num_inputs, + TFE_TensorHandle **inputs, const TensorDataTypes &output_types, int num_outputs, + TFE_TensorHandle **outputs, DoneCallback done, TF_Status *status) { + RunGeGraphAsync(context, graph_id, num_inputs, inputs, false, output_types, num_outputs, outputs, std::move(done), + status); +} + +void NpuDevice::RunGeGraphPin2NpuAsync(TFE_Context *context, uint64_t graph_id, int num_inputs, + TFE_TensorHandle **inputs, const TensorDataTypes &output_types, int num_outputs, + TFE_TensorHandle **outputs, DoneCallback done, TF_Status *status) { + RunGeGraphAsync(context, graph_id, num_inputs, inputs, true, output_types, num_outputs, outputs, std::move(done), + status); +} + +void NpuDevice::RunGeGraphPin2Cpu(TFE_Context *context, uint64_t graph_id, int num_inputs, TFE_TensorHandle **inputs, + const TensorDataTypes &output_types, int num_outputs, TFE_TensorHandle **outputs, + TF_Status *status) { + RunGeGraph(context, graph_id, num_inputs, inputs, false, output_types, num_outputs, outputs, status); +} + +void NpuDevice::RunGeGraphPin2Npu(TFE_Context *context, uint64_t graph_id, int num_inputs, TFE_TensorHandle **inputs, + const TensorDataTypes &output_types, int num_outputs, TFE_TensorHandle **outputs, + TF_Status 
*status) { + RunGeGraph(context, graph_id, num_inputs, inputs, true, output_types, num_outputs, outputs, status); +} + +void NpuDevice::RunGeGraphAnonymous(TFE_Context *context, const std::string &name, const tensorflow::GraphDef &gdef, + int num_inputs, TFE_TensorHandle **inputs, bool pin_to_npu, int num_outputs, + TFE_TensorHandle **outputs, TF_Status *status) { + uint64_t graph_id = AddGeGraph(context, name, gdef, status); + if (TF_GetCode(status) != TF_OK) return; + + std::map indexed_types; + + for (const auto &node : gdef.node()) { + if (node.op() == "_Retval") { + tensorflow::DataType type; + tensorflow::GetNodeAttr(node, "T", &type); + int index; + tensorflow::GetNodeAttr(node, "index", &index); + indexed_types[index] = type; + } + } + TensorDataTypes types; + for (auto indexed_type : indexed_types) { types.emplace_back(indexed_type.second); } + + RunGeGraph(context, graph_id, num_inputs, inputs, pin_to_npu, types, num_outputs, outputs, status); + if (TF_GetCode(status) != TF_OK) return; + + RemoveGeGraph(context, graph_id, status); + if (TF_GetCode(status) != TF_OK) return; +} + +void NpuDevice::RunGeGraphPin2CpuAnonymous(TFE_Context *context, const std::string &name, + const tensorflow::GraphDef &gdef, int num_inputs, TFE_TensorHandle **inputs, + int num_outputs, TFE_TensorHandle **outputs, TF_Status *status) { + RunGeGraphAnonymous(context, name, gdef, num_inputs, inputs, false, num_outputs, outputs, status); +} + +void NpuDevice::RunGeGraphPin2NpuAnonymous(TFE_Context *context, const std::string &name, + const tensorflow::GraphDef &gdef, int num_inputs, TFE_TensorHandle **inputs, + int num_outputs, TFE_TensorHandle **outputs, TF_Status *status) { + RunGeGraphAnonymous(context, name, gdef, num_inputs, inputs, true, num_outputs, outputs, status); +} + +void NpuDevice::GetCachedTaskSpec(const tensorflow::NodeDef &ndef, std::shared_ptr *spec, + bool &request_shape) { + *spec = nullptr; + const auto &op = ndef.op(); + if (cached_func_specs_.find(op) == cached_func_specs_.end()) { + HashKey attr_hash = Hash(ndef); + request_shape = cached_op_specs_.count(op) && cached_op_specs_[op].count(attr_hash); + return; + } + *spec = cached_func_specs_[op]; +} + +void NpuDevice::GetCachedTaskSpec(const tensorflow::NodeDef &ndef, const TensorShapes &shapes, + std::shared_ptr *spec) { + *spec = nullptr; + bool request_shape = false; + GetCachedTaskSpec(ndef, spec, request_shape); + if (*spec != nullptr) { return; } + if (!request_shape) { return; } + HashKey attr_hash = Hash(ndef); + HashKey shape_hash = Hash(shapes); + const auto &op = ndef.op(); + if (cached_op_specs_.count(op) && cached_op_specs_[op].count(attr_hash) + && cached_op_specs_[op][attr_hash].count(shape_hash)) { + *spec = cached_op_specs_[op][attr_hash][shape_hash]; + } +} + +std::shared_ptr +NpuDevice::CacheFuncSpec(const char *op, const tensorflow::OpRegistrationData *op_spec, const tensorflow::NodeDef &ndef, + uint64_t ge_graph_id, std::unique_ptr graph, + const npu::FuncSpec::PruneInputsFunc &prune_func, + const std::vector &dependent_host_resources, + const std::string &reason) { + auto spec = std::make_shared(op_spec, ndef, ge_graph_id, std::move(graph), prune_func, + dependent_host_resources, reason); + cached_func_specs_[op] = spec; + DLOG() << "Cache function op spec " << spec->DebugString(); + return spec; +} + +std::shared_ptr +NpuDevice::CacheOpSpec(const char *op, const tensorflow::OpRegistrationData *op_spec, const tensorflow::NodeDef &ndef, + const TensorShapes &input_shapes, const TensorPartialShapes &output_shapes, 
+                       const std::string &reason) {
+  auto spec = std::make_shared<npu::OpSpec>(op_spec, ndef, input_shapes, output_shapes, reason);
+  cached_op_specs_[op][Hash(ndef)][Hash(input_shapes)] = spec;
+  DLOG() << "Cache op spec " << spec->DebugString();
+  return spec;
+}
+
+std::shared_ptr<const npu::OpSpec>
+NpuDevice::CacheOpSpec(const char *op, const tensorflow::OpRegistrationData *op_spec, const tensorflow::NodeDef &ndef,
+                       const TensorShapes &input_shapes, const std::string &reason) {
+  auto spec = std::make_shared<npu::OpSpec>(op_spec, ndef, input_shapes, reason);
+  cached_op_specs_[op][Hash(ndef)][Hash(input_shapes)] = spec;
+  DLOG() << "Cache op spec " << spec->DebugString();
+  return spec;
+}
+
+bool NpuDevice::Supported(const std::string &op) {
+  const static std::unordered_set<std::string> kUnsupportedOps = {};
+  return kUnsupportedOps.count(op) == 0;
+}
+
+bool NpuDevice::SupportedResourceGenerator(const std::string &op) {
+  const static std::unordered_set<std::string> kSupportedOps = {"VarHandleOp"};
+  return kSupportedOps.count(op) != 0;
+}
+
+void NpuDevice::RecordIteratorMirror(const tensorflow::ResourceHandle &src, const TensorPartialShapes &shapes,
+                                     const TensorDataTypes &types) {
+  iterator_mirrors_.emplace(src, std::make_pair(shapes, types));
+}
+
+bool NpuDevice::MirroredIterator(const tensorflow::ResourceHandle &src) {
+  return iterator_mirrors_.find(src) != iterator_mirrors_.end();
+}
+
+bool NpuDevice::Mirrored(const tensorflow::ResourceHandle &src) {
+  // TODO: Other kinds of resources may need to be mirrored later; callers that check resource compatibility
+  // must always go through this interface
+  return iterator_mirrors_.find(src) != iterator_mirrors_.end();
+}
+
+tensorflow::Status NpuDevice::GetMirroredIteratorShapesAndTypes(const tensorflow::ResourceHandle &src,
+                                                                TensorPartialShapes &shapes, TensorDataTypes &types) {
+  auto iter = iterator_mirrors_.find(src);
+  if (iter == iterator_mirrors_.end()) {
+    return tensorflow::errors::Internal("Resource ", src.DebugString(), " has not been mirrored");
+  }
+  shapes.assign(iter->second.first.begin(), iter->second.first.end());
+  types.assign(iter->second.second.begin(), iter->second.second.end());
+  return tensorflow::Status::OK();
+}
diff --git a/tf_adapter_2.x/npu_device/core/npu_device.h b/tf_adapter_2.x/npu_device/core/npu_device.h
new file mode 100644
index 0000000000000000000000000000000000000000..012c2d76f174455b9d45e354609a7cc2759b5b7c
--- /dev/null
+++ b/tf_adapter_2.x/npu_device/core/npu_device.h
@@ -0,0 +1,227 @@
+/**
+* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved.
+* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#ifndef TENSORFLOW_NPU_DEVICE_H +#define TENSORFLOW_NPU_DEVICE_H + +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/tf_status.h" + +#include "framework/omg/parser/model_parser.h" +#include "framework/omg/parser/parser_factory.h" +#include "ge/ge_api.h" + +#include "npu_cache_spec.h" +#include "npu_dp.h" +#include "npu_types.h" +#include "npu_unwrap.h" +#include "npu_utils.h" + +class NpuDevice { + using HashKey = uint64_t; + + using ShapeTasks = std::map>; + using AttrTasks = std::map; + using CachedOpSpecs = std::map; + using CachedFuncSpecs = std::map>; + using DoneCallback = std::function; + + public: + static std::string CreateDevice(const char *name, int device_index, + const std::map &session_options, NpuDevice **device); + + static void DeleteDevice(void *device); + + void ReleaseResource(); + + tensorflow::Status ValidateResourcePlacement(const char *op_name, int num_inputs, TFE_TensorHandle **inputs, + bool &cpu_resource); + + tensorflow::Status ValidateInput(const char *op_name, int num_inputs, TFE_TensorHandle **inputs); + + tensorflow::Status InferShape(TFE_Context *context, const tensorflow::OpRegistrationData *op_reg_data, + const tensorflow::NodeDef &ndef, int num_inputs, TFE_TensorHandle **inputs, + TensorPartialShapes &shapes, bool &requested_input_value); + + tensorflow::Status ValidateOutput(const char *op_name, const TensorDataTypes &data_types); + + void PruneFunction(const tensorflow::FunctionDef &fdef, tensorflow::Graph *g, bool keep_signature = false); + + void FixGraphArgRetvalIndex(tensorflow::Graph *graph); + + tensorflow::Status TransResourceInput2GraphNode(TFE_Context *context, tensorflow::Graph *graph, int num_inputs, + TFE_TensorHandle **inputs, + std::vector &dependent_host_resources); + + tensorflow::Status MarkGraphNodeInOutDesc(TFE_Context *context, tensorflow::Graph *graph, int num_inputs, + TFE_TensorHandle **inputs); + + TFE_TensorHandle *NewDeviceTensorHandle(TFE_Context *context, ge::Format fmt, const tensorflow::TensorShape &shape, + tensorflow::DataType type, TF_Status *status); + + TFE_TensorHandle *NewDeviceResourceHandle(TFE_Context *context, const tensorflow::TensorShape &shape, + TF_Status *status); + + TFE_TensorHandle *CopyTensorD2H(TFE_Context *context, TFE_TensorHandle *tensor, TF_Status *status); + + TFE_TensorHandle *CopyTensorH2D(TFE_Context *context, TFE_TensorHandle *tensor, TF_Status *status); + + TFE_TensorHandle *CopyTensorH2D(TFE_Context *context, TFE_TensorHandle *tensor, ge::Format fmt, TF_Status *status); + + void GetOrCreateSpec(TFE_Context *context, const char *op_name, const TFE_OpAttrs *attributes, int num_inputs, + TFE_TensorHandle **inputs, std::shared_ptr *spec, TF_Status *s); + + void FallbackCPU(TFE_Context *context, const char *op_name, const TFE_OpAttrs *attributes, int num_inputs, + TFE_TensorHandle **inputs, int *num_outputs, TFE_TensorHandle **outputs, TF_Status *status); + + // NPU Device对外的顶层方法 + void Execute(const TFE_Op *op, int *num_outputs, TFE_TensorHandle **outputs, TF_Status *s); + + void Run(TFE_Context *context, std::shared_ptr spec, int num_inputs, TFE_TensorHandle **inputs, + int *num_outputs, TFE_TensorHandle **outputs, TF_Status *status); + + void RunOp(TFE_Context *context, const npu::OpSpec *spec, int num_inputs, TFE_TensorHandle **inputs, int *num_outputs, + TFE_TensorHandle **outputs, 
TF_Status *status); + + void RunGraph(TFE_Context *context, const npu::FuncSpec *spec, int num_inputs, TFE_TensorHandle **inputs, + int *num_outputs, TFE_TensorHandle **outputs, TF_Status *status); + + void RunGeGraphAnonymous(TFE_Context *context, const std::string &name, const tensorflow::GraphDef &gdef, + int num_inputs, TFE_TensorHandle **inputs, bool pin_to_npu, int num_outputs, + TFE_TensorHandle **outputs, TF_Status *status); + + void RunGeGraphPin2CpuAnonymous(TFE_Context *context, const std::string &name, const tensorflow::GraphDef &gdef, + int num_inputs, TFE_TensorHandle **inputs, int num_outputs, + TFE_TensorHandle **outputs, TF_Status *status); + + void RunGeGraphPin2NpuAnonymous(TFE_Context *context, const std::string &name, const tensorflow::GraphDef &gdef, + int num_inputs, TFE_TensorHandle **inputs, int num_outputs, + TFE_TensorHandle **outputs, TF_Status *status); + + uint64_t AddGeGraph(TFE_Context *context, const std::string &name, const tensorflow::GraphDef &def, + TF_Status *status); + + void RemoveGeGraph(TFE_Context *context, uint64_t graph_id, TF_Status *status); + + void RunGeGraph(TFE_Context *context, uint64_t graph_id, int num_inputs, TFE_TensorHandle **inputs, bool pin_to_npu, + const TensorDataTypes &output_types, int num_outputs, TFE_TensorHandle **outputs, TF_Status *status); + + void RunGeGraphPin2Cpu(TFE_Context *context, uint64_t graph_id, int num_inputs, TFE_TensorHandle **inputs, + const TensorDataTypes &output_types, int num_outputs, TFE_TensorHandle **outputs, + TF_Status *status); + + void RunGeGraphPin2Npu(TFE_Context *context, uint64_t graph_id, int num_inputs, TFE_TensorHandle **inputs, + const TensorDataTypes &output_types, int num_outputs, TFE_TensorHandle **outputs, + TF_Status *status); + + void RunGeGraphAsync(TFE_Context *context, uint64_t graph_id, int num_inputs, TFE_TensorHandle **inputs, + bool pin_to_npu, const TensorDataTypes &output_types, int num_outputs, + TFE_TensorHandle **outputs, DoneCallback done, TF_Status *status); + + void RunGeGraphPin2CpuAsync(TFE_Context *context, uint64_t graph_id, int num_inputs, TFE_TensorHandle **inputs, + const TensorDataTypes &output_types, int num_outputs, TFE_TensorHandle **outputs, + DoneCallback done, TF_Status *status); + + void RunGeGraphPin2NpuAsync(TFE_Context *context, uint64_t graph_id, int num_inputs, TFE_TensorHandle **inputs, + const TensorDataTypes &output_types, int num_outputs, TFE_TensorHandle **outputs, + DoneCallback done, TF_Status *status); + + void GetCachedTaskSpec(const tensorflow::NodeDef &ndef, std::shared_ptr *spec, + bool &request_shape); + + void GetCachedTaskSpec(const tensorflow::NodeDef &ndef, const TensorShapes &shapes, + std::shared_ptr *spec); + + std::shared_ptr + CacheFuncSpec(const char *op, const tensorflow::OpRegistrationData *op_spec, const tensorflow::NodeDef &ndef, + uint64_t ge_graph_id, std::unique_ptr graph, + const npu::FuncSpec::PruneInputsFunc &prune_func, + const std::vector &dependent_host_resources, const std::string &reason); + + std::shared_ptr CacheOpSpec(const char *op, const tensorflow::OpRegistrationData *op_spec, + const tensorflow::NodeDef &ndef, const TensorShapes &input_shapes, + const TensorPartialShapes &output_shapes, const std::string &reason); + + std::shared_ptr CacheOpSpec(const char *op, const tensorflow::OpRegistrationData *op_spec, + const tensorflow::NodeDef &ndef, const TensorShapes &input_shapes, + const std::string &reason); + + bool Supported(const std::string &op); + + bool SupportedResourceGenerator(const std::string 
&op); + + void RecordIteratorMirror(const tensorflow::ResourceHandle &src, const TensorPartialShapes &shapes, + const TensorDataTypes &types); + + bool MirroredIterator(const tensorflow::ResourceHandle &src); + + void CreateIteratorProvider(TFE_Context *context, const tensorflow::Tensor *tensor, std::vector device_ids, + TF_Status *status); + + tensorflow::Status ConsumeIteratorSync(const tensorflow::ResourceHandle &resource, int64_t nums); + + tensorflow::Status ConsumeIteratorAsync(const tensorflow::ResourceHandle &resource, int64_t nums, + const DoneCallback &done); + + bool Mirrored(const tensorflow::ResourceHandle &src); + + tensorflow::Status GetMirroredIteratorShapesAndTypes(const tensorflow::ResourceHandle &src, + TensorPartialShapes &shapes, TensorDataTypes &types); + + uint64_t NextUUID() { return uuid.fetch_add(1); } + + ge::Session *GeSession() { return ge_session_; } + + int device_id; + tensorflow::string device_name; + tensorflow::string underlying_device; + + private: + static HashKey Hash(const TensorDataTypes &types) { + if (types.empty()) { return 0; } + HashKey hash = tensorflow::Hash64(tensorflow::DataTypeString(types[0])); + for (size_t i = 1; i < types.size(); i++) { + hash = tensorflow::Hash64Combine(hash, tensorflow::Hash64(tensorflow::DataTypeString(types[i]))); + } + return hash; + } + static HashKey Hash(const TensorShapes &shapes) { + if (shapes.empty()) { return 0; } + HashKey hash = tensorflow::Hash64(shapes[0].DebugString()); + for (size_t i = 1; i < shapes.size(); i++) { + hash = tensorflow::Hash64Combine(hash, tensorflow::Hash64(shapes[i].DebugString())); + } + return hash; + } + static HashKey Hash(const TFE_OpAttrs *attributes) { + tensorflow::AttrValueMap attrs; + tensorflow::unwrap(attributes)->FillAttrValueMapWithoutDefaults(&attrs); + if (attrs.empty()) { return 0; } + auto iter = attrs.begin(); + HashKey hash = tensorflow::Hash64(iter->second.DebugString()); + iter++; + while (iter != attrs.end()) { + hash = tensorflow::Hash64Combine(hash, tensorflow::Hash64(iter->second.DebugString())); + iter++; + } + return hash; + } + + static HashKey Hash(const tensorflow::NodeDef &ndef) { return tensorflow::Hash64(ndef.DebugString()); } + + ge::Session *ge_session_; + std::atomic uuid{0}; + CachedOpSpecs cached_op_specs_; + CachedFuncSpecs cached_func_specs_; + std::map, ResourceCompare> + iterator_mirrors_; + std::map, ResourceCompare> iterator_providers_; +}; + +#endif // TENSORFLOW_C_EAGER_CUSTOM_DEVICE_TESTUTIL_H_ diff --git a/tf_adapter_2.x/npu_device/core/npu_device_register.cpp b/tf_adapter_2.x/npu_device/core/npu_device_register.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c7613a8975cf40fa7867f1f7bdb572f152bc6f9c --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_device_register.cpp @@ -0,0 +1,82 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. 
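The private Hash helpers above fold a per-element hash of each dtype name, shape string, or attribute value into a single key. A standalone sketch of the same folding idea follows, using std::hash and a generic combine step; HashCombine and HashDescriptors are illustrative names, and the mixing constant is the common boost-style one rather than whatever tensorflow::Hash64Combine uses internally.

```cpp
#include <cstdint>
#include <functional>
#include <iostream>
#include <string>
#include <vector>

// Stand-in for a 64-bit hash combiner: fold the new value into the seed.
uint64_t HashCombine(uint64_t seed, uint64_t value) {
  return seed ^ (value + 0x9e3779b97f4a7c15ULL + (seed << 6) + (seed >> 2));
}

// Hash a list of textual descriptors (e.g. dtype names or shape strings)
// into a single key, returning 0 for an empty list like the code above.
uint64_t HashDescriptors(const std::vector<std::string> &descs) {
  if (descs.empty()) return 0;
  std::hash<std::string> h;
  uint64_t hash = h(descs[0]);
  for (size_t i = 1; i < descs.size(); ++i) {
    hash = HashCombine(hash, h(descs[i]));
  }
  return hash;
}

int main() {
  std::cout << HashDescriptors({"DT_FLOAT", "DT_INT32"}) << "\n";
  std::cout << HashDescriptors({"[2,3]", "[3,4]"}) << "\n";
}
```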
+* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#include "tensorflow/core/platform/logging.h" + +#include "npu_device.h" +#include "npu_logger.h" +#include "npu_micros.h" +#include "npu_unwrap.h" +#include "npu_utils.h" + +namespace { + +TFE_TensorHandle *CopyTensorToNpuDevice(TFE_Context *context, TFE_TensorHandle *tensor, TF_Status *status, + void *device_info) { + auto *dev = reinterpret_cast(device_info); + tensorflow::Status tf_status; + LOG(INFO) << "[CopyTensorToNpuDevice] Copy tensor from " << tensorflow::unwrap(tensor)->DeviceName(&tf_status) + << " to " << dev->device_name; + TFE_TensorHandle *npu_handle = dev->CopyTensorH2D(context, tensor, status); + if (TF_GetCode(status) != TF_OK) return nullptr; + return npu_handle; +} + +TFE_TensorHandle *CopyTensorFromNpuDevice(TFE_Context *context, TFE_TensorHandle *tensor, + const char *target_device_name, TF_Status *status, void *device_info) { + auto *dev = reinterpret_cast(device_info); + DLOG() << "[CopyTensorFromNpuDevice] Copy tensor from " << dev->device_name << " to " << target_device_name; + // 输入的TensorHandle是NPU的,应当先进行NPU->CPU的传输,再调用TFE_TensorHandleCopyToDevice防止可能的NPU->GPU传输 + // 一旦Copy动作发生,需要进行stream同步。如果是NPU->NPU的拷贝(理论上不应该发生),可以不同步。 + TFE_TensorHandle *local_tensor = dev->CopyTensorD2H(context, tensor, status); + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_TensorHandle *target_tensor = TFE_TensorHandleCopyToDevice(local_tensor, context, target_device_name, status); + if (TF_GetCode(status) != TF_OK) return nullptr; + + TFE_DeleteTensorHandle(local_tensor); + return target_tensor; +} + +void NpuDeviceExecute(const TFE_Op *op, int *num_outputs, TFE_TensorHandle **outputs, TF_Status *s, void *device_info) { + auto *dev = reinterpret_cast(device_info); + dev->Execute(op, num_outputs, outputs, s); +} + +void DeleteNpuDevice(void *device_info) { NpuDevice::DeleteDevice(device_info); } + +void RegisterNpuDevice(TFE_Context *context, const char *name, void *device_info, TF_Status *status) { + TFE_CustomDevice custom_device; + custom_device.copy_tensor_to_device = &CopyTensorToNpuDevice; + custom_device.copy_tensor_from_device = &CopyTensorFromNpuDevice; + custom_device.delete_device = &DeleteNpuDevice; + custom_device.execute = &NpuDeviceExecute; + TFE_RegisterCustomDevice(context, custom_device, name, device_info, status); +} + +std::vector devices_instances; +} // namespace + +std::string CreateDevice(TFE_Context *context, const char *name, int device_index, + const std::map &session_options) { + const static std::string kSucceed; + + NpuDevice *device = nullptr; + auto create_status = NpuDevice::CreateDevice(name, device_index, session_options, &device); + if (create_status != kSucceed) { return create_status; } + devices_instances.push_back(device); + + std::unique_ptr status(TF_NewStatus(), TF_DeleteStatus); + RegisterNpuDevice(context, name, device, status.get()); + if (TF_GetCode(status.get()) != TF_OK) { + return std::string("Register Npu device ") + name + " failed:" + TF_Message(status.get()); + } + LOG(INFO) << "Npu device instance " << name << " created"; + + return kSucceed; +} + +void ReleaseDeviceResource() { + for (auto device : devices_instances) { device->ReleaseResource(); } +} \ No newline at end of file diff --git a/tf_adapter_2.x/npu_device/core/npu_device_register.h b/tf_adapter_2.x/npu_device/core/npu_device_register.h new file mode 100644 index 0000000000000000000000000000000000000000..0b36beb1b2c31081f1d2f3ace6d082afca325ada --- /dev/null +++ 
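RegisterNpuDevice above fills a table of C callbacks (copy tensor in, copy tensor out, execute, delete) plus an opaque device_info pointer and hands it to the eager runtime, which then calls back with that pointer. The sketch below shows the same callback-table pattern in isolation; CustomDeviceCallbacks, RegisterCustomDevice, and ToyDevice are made-up stand-ins and not the TFE_CustomDevice API itself.

```cpp
#include <iostream>
#include <string>
#include <vector>

// A table of callbacks plus an opaque per-device state pointer that is
// passed back into every callback (the custom-device registration idea).
struct CustomDeviceCallbacks {
  void (*execute)(const char *op, void *device_info);
  void (*delete_device)(void *device_info);
};

struct Registration {
  std::string name;
  CustomDeviceCallbacks callbacks;
  void *device_info;
};

std::vector<Registration> g_registry;

void RegisterCustomDevice(const std::string &name, CustomDeviceCallbacks callbacks, void *device_info) {
  g_registry.push_back({name, callbacks, device_info});
}

// A toy device implementation behind the callbacks.
struct ToyDevice {
  std::string name;
};

void ToyExecute(const char *op, void *device_info) {
  auto *dev = static_cast<ToyDevice *>(device_info);
  std::cout << dev->name << " executes " << op << "\n";
}

void ToyDelete(void *device_info) { delete static_cast<ToyDevice *>(device_info); }

int main() {
  auto *dev = new ToyDevice{"/job:localhost/replica:0/task:0/device:TOY:0"};
  RegisterCustomDevice(dev->name, {&ToyExecute, &ToyDelete}, dev);
  for (auto &reg : g_registry) { reg.callbacks.execute("MatMul", reg.device_info); }
  for (auto &reg : g_registry) { reg.callbacks.delete_device(reg.device_info); }
}
```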
b/tf_adapter_2.x/npu_device/core/npu_device_register.h @@ -0,0 +1,18 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#ifndef TENSORFLOW_NPU_DEVICE_REGISTER_H_ +#define TENSORFLOW_NPU_DEVICE_REGISTER_H_ + +#include "tensorflow/c/eager/c_api.h" +#include +#include + +std::string CreateDevice(TFE_Context *context, const char *device_name, int device_index, + const std::map &session_options); + +void ReleaseDeviceResource(); + +#endif // TENSORFLOW_C_EAGER_NPU_DEVICE_TESTUTIL_H_ diff --git a/tf_adapter_2.x/npu_device/core/npu_dp.h b/tf_adapter_2.x/npu_device/core/npu_dp.h new file mode 100644 index 0000000000000000000000000000000000000000..a0a1d92a34defc8e33f77b27865058a84073e1d1 --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_dp.h @@ -0,0 +1,126 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#ifndef TENSORFLOW_NPU_DP_H +#define TENSORFLOW_NPU_DP_H + +#include "tensorflow/c/c_api.h" +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/platform/logging.h" + +#include "absl/algorithm/container.h" +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/tfe_context_internal.h" +#include "tensorflow/c/eager/tfe_op_internal.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" +#include "tensorflow/core/framework/graph_to_functiondef.h" + +#include "npu_types.h" + +class IteratorResourceProvider { + using ConsumeFunc = std::function; + using DestroyFunc = std::function; + using DoneCallback = std::function; + + public: + tensorflow::Status Consume(int64_t nums, const DoneCallback &done) { + { + if (stopped_) { return tensorflow::errors::Internal("Iterator resource provider ", name_, " has stopped"); } + std::unique_lock lk(mu_); + if (request_stop_) { return tensorflow::errors::Internal("Iterator resource provider ", name_, " is stopping"); } + requests_.emplace(nums, done); + } + cv_.notify_one(); + return tensorflow::Status::OK(); + } + tensorflow::Status Destroy() { + { + std::unique_lock lk(mu_); + request_stop_ = true; + } + cv_.notify_one(); + while (!stopped_) {} + return destroy_func_(); + } + + IteratorResourceProvider(std::string name, ConsumeFunc cf, DestroyFunc df) + : name_(std::move(name)), consume_func_(std::move(cf)), destroy_func_(std::move(df)), request_stop_(false), + stopped_(false) { + worker_.reset( + tensorflow::Env::Default()->StartThread(tensorflow::ThreadOptions{}, name_ + "_hdc_provider", [this]() { + while (true) { + std::unique_lock lk(mu_); + cv_.wait(lk, [this]() -> bool { return !requests_.empty() || request_stop_; }); + if (request_stop_) { + stopped_.store(true); + return; + } + auto task = requests_.front(); + requests_.pop(); + lk.unlock(); + int64_t nums = task.first; + auto done = task.second; + tensorflow::Status status = tensorflow::Status::OK(); + while (nums-- > 0 && status.ok()) { status = consume_func_(); } + done(status); + } + })); + } + ~IteratorResourceProvider() { + { + std::unique_lock lk(mu_); + stopped_ = true; + } + cv_.notify_one(); + } + static tensorflow::FunctionDef GetFunctionDef(std::string channel_name, 
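IteratorResourceProvider above queues "consume N items" requests and drains them on a dedicated worker thread guarded by a mutex and condition variable. Below is a simplified standalone sketch of that producer-consumer loop; ConsumeWorker is a hypothetical class that joins the worker on Stop instead of busy-waiting, and it omits the provider's destroy callback and error statuses.

```cpp
#include <chrono>
#include <condition_variable>
#include <cstdint>
#include <functional>
#include <iostream>
#include <mutex>
#include <queue>
#include <thread>
#include <utility>

class ConsumeWorker {
 public:
  ConsumeWorker() : worker_([this]() { Loop(); }) {}
  ~ConsumeWorker() { Stop(); }

  // Enqueue a request and wake the worker.
  void Consume(int64_t nums, std::function<void(int64_t)> done) {
    {
      std::unique_lock<std::mutex> lk(mu_);
      requests_.emplace(nums, std::move(done));
    }
    cv_.notify_one();
  }

  // Signal the worker to exit and wait for it.
  void Stop() {
    {
      std::unique_lock<std::mutex> lk(mu_);
      if (stop_) return;
      stop_ = true;
    }
    cv_.notify_one();
    worker_.join();
  }

 private:
  void Loop() {
    while (true) {
      std::unique_lock<std::mutex> lk(mu_);
      cv_.wait(lk, [this]() { return stop_ || !requests_.empty(); });
      if (stop_) return;  // pending requests are dropped on stop
      auto task = std::move(requests_.front());
      requests_.pop();
      lk.unlock();
      int64_t consumed = 0;
      for (int64_t i = 0; i < task.first; ++i) { ++consumed; }  // stand-in for consume_func_()
      task.second(consumed);
    }
  }

  std::mutex mu_;
  std::condition_variable cv_;
  std::queue<std::pair<int64_t, std::function<void(int64_t)>>> requests_;
  bool stop_{false};
  std::thread worker_;
};

int main() {
  ConsumeWorker worker;
  worker.Consume(3, [](int64_t n) { std::cout << "consumed " << n << " batches\n"; });
  std::this_thread::sleep_for(std::chrono::milliseconds(50));
}
```

In the real provider the per-item work is the injected consume function and completion is reported through a Status; the sketch only keeps the queue-plus-worker shape.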
std::vector device_ids, + const TensorPartialShapes &shapes, const TensorDataTypes &types, + TF_Status *status) { + tensorflow::FunctionDef fdef; + std::unique_ptr graph = std::make_unique(tensorflow::OpRegistry::Global()); + + tensorflow::Node *arg_iterator = nullptr; + tensorflow::Node *iterator_h2d = nullptr; + + NPU_CTX_REQUIRES_OK_RETURN(status, + tensorflow::NodeBuilder("arg_iterator", "_Arg") + .Attr("index", 0) + .Attr("T", tensorflow::DT_RESOURCE) + .Finalize(graph.get(), &arg_iterator), + fdef); + + NPU_CTX_REQUIRES_OK_RETURN(status, + tensorflow::NodeBuilder("iterator_h2d", "IteratorH2D") + .Input(arg_iterator, 0) + .Attr("device_ids", device_ids) + .Attr("channel_name", channel_name) + .Finalize(graph.get(), &iterator_h2d), + fdef); + + NPU_CTX_REQUIRES_OK_RETURN(status, tensorflow::GraphToFunctionDef(*graph, "dp_provider_" + channel_name, &fdef), + fdef); + return fdef; + } + + private: + std::string name_; + ConsumeFunc consume_func_; + DestroyFunc destroy_func_; + bool request_stop_; + std::atomic_bool stopped_{false}; + std::mutex mu_; + std::condition_variable cv_; + std::queue> requests_; + std::unique_ptr worker_; +}; + +#endif //TENSORFLOW_NPU_DP_H diff --git a/tf_adapter_2.x/npu_device/core/npu_env.h b/tf_adapter_2.x/npu_device/core/npu_env.h new file mode 100644 index 0000000000000000000000000000000000000000..af976b88670ff4c55289793221f2212358f82e38 --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_env.h @@ -0,0 +1,47 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#ifndef TENSORFLOW_NPU_ENV_H +#define TENSORFLOW_NPU_ENV_H + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/util/env_var.h" + +const static bool kDumpExecutionDetail = []() -> bool { + bool dump_execute_detail = false; + tensorflow::ReadBoolFromEnvVar("NPU_DEBUG", false, &dump_execute_detail); + return dump_execute_detail; +}(); + +const static bool kDumpGraph = []() -> bool { + bool dump_graph = false; + tensorflow::ReadBoolFromEnvVar("NPU_DUMP_GRAPH", false, &dump_graph); + return dump_graph; +}(); + +const static bool kCustomKernelEnabled = []() -> bool { + bool use_custom_kernel = true; + tensorflow::ReadBoolFromEnvVar("NPU_ENABLE_CUSTOM_KERNEL", true, &use_custom_kernel); + return use_custom_kernel; +}(); + +const static int64_t kGlobalLoopSize = []() -> int64_t { + tensorflow::int64 loop_size = 1; + tensorflow::ReadInt64FromEnvVar("NPU_LOOP_SIZE", 1, &loop_size); + return loop_size; +}(); + +const static bool kPerfEnabled = []() -> bool { + bool perf_enabled = false; + tensorflow::ReadBoolFromEnvVar("NPU_ENABLE_PERF", false, &perf_enabled); + return perf_enabled; +}(); + +#endif //TENSORFLOW_NPU_ENV_H diff --git a/tf_adapter_2.x/npu_device/core/npu_hdc.cpp b/tf_adapter_2.x/npu_device/core/npu_hdc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..da8203ce95367d1a6225320fc7a18d834eb89702 --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_hdc.cpp @@ -0,0 +1,268 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. 
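npu_env.h reads its feature switches once from environment variables into load-time constants, so every later check is a plain bool test. A minimal standalone sketch of the same pattern using std::getenv instead of tensorflow::ReadBoolFromEnvVar; ReadBoolFlag is a hypothetical helper, and only the variable names NPU_DEBUG and NPU_DUMP_GRAPH are taken from the code above.

```cpp
#include <cstdlib>
#include <iostream>
#include <string>

// Parse a boolean flag from the environment once, falling back to a default
// when the variable is unset.
bool ReadBoolFlag(const char *name, bool default_value) {
  const char *raw = std::getenv(name);
  if (raw == nullptr) return default_value;
  std::string value(raw);
  return value == "1" || value == "true" || value == "TRUE";
}

// Evaluated once at static-initialization time, like the constants above.
static const bool kDumpExecutionDetail = ReadBoolFlag("NPU_DEBUG", false);
static const bool kDumpGraph = ReadBoolFlag("NPU_DUMP_GRAPH", false);

int main() {
  std::cout << "NPU_DEBUG=" << kDumpExecutionDetail << " NPU_DUMP_GRAPH=" << kDumpGraph << "\n";
}
```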
+* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#include "npu_hdc.h" +#include "npu_micros.h" + + +tensorflow::Status MappingTfDtypeToAcl(tensorflow::DataType tf_type, aclDataType &acl_type); + +tensorflow::Status MappingAclDtypeToTf(const aclDataType &acl_type, tensorflow::DataType &tf_type); + +tensorflow::Status AssembleAclTensor2Tensor(acltdtDataItem *item, std::vector &tensors, + bool call_by_channel_receive); + +tensorflow::Status AssembleAclDataset2Tensors(acltdtDataset *acl_dataset, std::vector &out_tensors, + bool call_by_channel_receive); + +tensorflow::Status AssembleTensors2AclDataset(acltdtTensorType acl_type, const std::vector &tensors, + acltdtDataset **acl_dataset); + +tensorflow::Status AssembleTensors2AclDataset(acltdtTensorType acl_type, const std::vector &tensors, + acltdtDataset *acl_dataset); + +tensorflow::Status DestroyAclDataset(acltdtDataset *acl_dataset, bool include_data_item = true); + +tensorflow::Status RecvTensorByAcl(acltdtChannelHandle *acl_handle, std::vector &tensors); + +tensorflow::Status SendTensorsByAcl(acltdtChannelHandle *acl_handle, acltdtTensorType acl_type, + const std::vector &tensors); + +tensorflow::Status MappingAclDtypeToTf(const aclDataType &acl_type, tensorflow::DataType &tf_type) { + const static std::map type_mapping = { + {ACL_FLOAT, tensorflow::DT_FLOAT}, {ACL_FLOAT16, tensorflow::DT_HALF}, {ACL_INT8, tensorflow::DT_INT8}, + {ACL_INT32, tensorflow::DT_INT32}, {ACL_UINT8, tensorflow::DT_UINT8}, {ACL_INT16, tensorflow::DT_INT16}, + {ACL_UINT16, tensorflow::DT_UINT16}, {ACL_UINT32, tensorflow::DT_UINT32}, {ACL_INT64, tensorflow::DT_INT64}, + {ACL_UINT64, tensorflow::DT_UINT64}, {ACL_DOUBLE, tensorflow::DT_DOUBLE}, {ACL_BOOL, tensorflow::DT_BOOL}, + {ACL_STRING, tensorflow::DT_STRING}}; + auto found = type_mapping.find(acl_type); + if (found == type_mapping.end()) { + return tensorflow::errors::Internal("Hdc channel receive unsupported data type", acl_type); + } + tf_type = found->second; + return tensorflow::Status::OK(); +} + +tensorflow::Status AssembleAclTensor2Tensor(acltdtDataItem *item, std::vector &tensors, + bool call_by_channel_receive) { + acltdtTensorType acl_type = acltdtGetTensorTypeFromItem(item); + if (acl_type == ACL_TENSOR_DATA_END_OF_SEQUENCE) { + LOG(INFO) << "Hdc channel received end-of-sequence for out-feed op."; + return tensorflow::Status::OK(); + } else if (acl_type == ACL_TENSOR_DATA_ABNORMAL) { + LOG(INFO) << "Hdc channel received abnormal for out-feed op."; + return tensorflow::Status::OK(); + } else if (acl_type == ACL_TENSOR_DATA_UNDEFINED) { + LOG(INFO) << "Hdc channel received undefined message type for out-feed op."; + return tensorflow::errors::Internal("Hdc channel received undefined message type for out-feed op."); + } + tensorflow::DataType tf_type; + TF_RETURN_IF_ERROR(MappingAclDtypeToTf(acltdtGetDataTypeFromItem(item), tf_type)); + size_t dim_num = acltdtGetDimNumFromItem(item); + size_t acl_data_len = acltdtGetDataSizeFromItem(item); + char *acl_data = reinterpret_cast(acltdtGetDataAddrFromItem(item)); + if (call_by_channel_receive) { acl_data = const_cast(reinterpret_cast(acl_data)->c_str()); } + if (tf_type == tensorflow::DT_STRING) { + if (dim_num != 0) { return tensorflow::errors::Internal("Hdc channel receive unsupported non-scalar string type"); } + tensorflow::Tensor tensor(tf_type, tensorflow::TensorShape({})); + tensor.scalar()() = std::string(acl_data, acl_data_len); + tensors.emplace_back(std::move(tensor)); + } else if 
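MappingAclDtypeToTf above resolves dtypes through a static lookup table and reports an error for anything it does not recognize instead of guessing a default. The sketch below repeats that table-plus-error pattern with made-up enums; AclType, TfType, and MapAclToTf are illustrative stand-ins, not the real ACL or TensorFlow enums.

```cpp
#include <iostream>
#include <map>

// Simplified stand-ins for the two dtype enums being bridged.
enum class AclType { FLOAT, FLOAT16, INT32, INT64, BOOL, UNKNOWN };
enum class TfType { DT_FLOAT, DT_HALF, DT_INT32, DT_INT64, DT_BOOL };

// One static table per direction; unsupported types surface as a failure.
bool MapAclToTf(AclType acl_type, TfType &tf_type) {
  static const std::map<AclType, TfType> kMapping = {
      {AclType::FLOAT, TfType::DT_FLOAT}, {AclType::FLOAT16, TfType::DT_HALF},
      {AclType::INT32, TfType::DT_INT32}, {AclType::INT64, TfType::DT_INT64},
      {AclType::BOOL, TfType::DT_BOOL}};
  auto found = kMapping.find(acl_type);
  if (found == kMapping.end()) return false;
  tf_type = found->second;
  return true;
}

int main() {
  TfType tf_type;
  std::cout << "mapped: " << MapAclToTf(AclType::INT32, tf_type) << "\n";    // 1
  std::cout << "mapped: " << MapAclToTf(AclType::UNKNOWN, tf_type) << "\n";  // 0
}
```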
(DataTypeCanUseMemcpy(tf_type)) { + std::vector dims; + dims.resize(dim_num); + if (acltdtGetDimsFromItem(item, dims.data(), dim_num) != ACL_ERROR_NONE) { + return tensorflow::errors::Internal("Failed get dim-size from hdc channel data"); + } + tensorflow::TensorShape tf_shape; + for (auto dim : dims) { tf_shape.AddDim(dim); } + tensorflow::Tensor tensor = tensorflow::Tensor(tf_type, tf_shape); + auto tensor_data = const_cast(tensor.tensor_data().data()); + auto tensor_size = tensor.tensor_data().size(); + if (tensor_size != acl_data_len) { + return tensorflow::errors::Internal("Hdc channel receive size mismatch tensor size acl:", acl_data_len, + " vs. tensorflow:", tensor_size); + } + memcpy(tensor_data, acl_data, tensor_size); + tensors.emplace_back(std::move(tensor)); + } else { + return tensorflow::errors::InvalidArgument("Hdc channel receive un-copyable tensorflow data type", + DataTypeString(tf_type)); + } + return tensorflow::Status::OK(); +} + +tensorflow::Status AssembleAclDataset2Tensors(acltdtDataset *acl_dataset, std::vector &out_tensors, + bool call_by_channel_receive) { + for (size_t i = 0; i < acltdtGetDatasetSize(acl_dataset); i++) { + auto acl_data = acltdtGetDataItem(acl_dataset, i); + if (acl_data == nullptr) { + return tensorflow::errors::Internal("Acl get tensor data from dataset failed when receive tensor data."); + } + TF_RETURN_IF_ERROR(AssembleAclTensor2Tensor(acl_data, out_tensors, call_by_channel_receive)); + } + return tensorflow::Status::OK(); +} + +tensorflow::Status DestroyAclDataset(acltdtDataset *acl_dataset, bool include_data_item) { + if (include_data_item) { + for (size_t i = 0; i < acltdtGetDatasetSize(acl_dataset); i++) { + if (acltdtDestroyDataItem(acltdtGetDataItem(acl_dataset, i)) != ACL_ERROR_NONE) { + return tensorflow::errors::Internal("Acl destroy tensor data failed."); + } + } + } + if (acltdtDestroyDataset(acl_dataset) != ACL_ERROR_NONE) { + return tensorflow::errors::Internal("Acl destroy tensor dataset failed."); + } + return tensorflow::Status::OK(); +} + +tensorflow::Status RecvTensorByAcl(acltdtChannelHandle *acl_handle, std::vector &tensors) { + auto acl_dataset = acltdtCreateDataset(); + if (acl_dataset == nullptr) { return tensorflow::errors::Internal("Failed create hdc channel."); } + auto acl_status = acltdtReceiveTensor(acl_handle, acl_dataset, -1 /* no timeout */); + + if (acl_status != ACL_ERROR_NONE) { + NPU_LOG_IF_ERROR(DestroyAclDataset(acl_dataset, false)); + return tensorflow::errors::Internal("Failed receive data from hdc channel, acl status:", acl_status); + } + + auto status = AssembleAclDataset2Tensors(acl_dataset, tensors, true /* call by channel receive */); + if (!status.ok()) { + NPU_LOG_IF_ERROR(DestroyAclDataset(acl_dataset, false)); + return status; + } + TF_RETURN_IF_ERROR(DestroyAclDataset(acl_dataset, false)); + return tensorflow::Status::OK(); +} + +tensorflow::Status MappingTfDtypeToAcl(const tensorflow::DataType tf_type, aclDataType &acl_type) { + const static std::map type_mapping = { + {tensorflow::DT_FLOAT, ACL_FLOAT}, {tensorflow::DT_HALF, ACL_FLOAT16}, {tensorflow::DT_INT8, ACL_INT8}, + {tensorflow::DT_INT32, ACL_INT32}, {tensorflow::DT_UINT8, ACL_UINT8}, {tensorflow::DT_INT16, ACL_INT16}, + {tensorflow::DT_UINT16, ACL_UINT16}, {tensorflow::DT_UINT32, ACL_UINT32}, {tensorflow::DT_INT64, ACL_INT64}, + {tensorflow::DT_UINT64, ACL_UINT64}, {tensorflow::DT_DOUBLE, ACL_DOUBLE}, {tensorflow::DT_BOOL, ACL_BOOL}, + {tensorflow::DT_STRING, ACL_STRING}}; + auto found = type_mapping.find(tf_type); + if (found == 
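The receive path above only memcpy's into the destination tensor after confirming that the byte count reported by the channel matches the size implied by the received dims and dtype. A standalone sketch of that guarded copy follows; CopyChecked is a hypothetical helper operating on a flat char buffer rather than a tensorflow::Tensor.

```cpp
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Copy a received flat buffer into a typed destination only when the byte
// counts agree, mirroring the size check before memcpy in the receive path.
bool CopyChecked(const std::vector<int64_t> &dims, size_t element_size,
                 const void *src, size_t src_len, std::vector<char> &dst) {
  size_t expected = element_size;
  for (int64_t d : dims) { expected *= static_cast<size_t>(d); }
  if (expected != src_len) {
    std::cerr << "size mismatch: expected " << expected << " got " << src_len << "\n";
    return false;
  }
  dst.resize(expected);
  std::memcpy(dst.data(), src, expected);
  return true;
}

int main() {
  float payload[6] = {0, 1, 2, 3, 4, 5};
  std::vector<char> dst;
  std::cout << CopyChecked({2, 3}, sizeof(float), payload, sizeof(payload), dst) << "\n";  // 1
  std::cout << CopyChecked({2, 4}, sizeof(float), payload, sizeof(payload), dst) << "\n";  // 0
}
```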
type_mapping.end()) { + return tensorflow::errors::Internal("Unsupported tensorflow data type ", DataTypeString(tf_type), " by acl."); + } + acl_type = found->second; + return tensorflow::Status::OK(); +} + +tensorflow::Status AssembleTensors2AclDataset(acltdtTensorType acl_type, const std::vector &tensors, + acltdtDataset *acl_dataset) { + if (TF_PREDICT_FALSE(acl_type != ACL_TENSOR_DATA_TENSOR)) { + acltdtDataItem *acl_data = acltdtCreateDataItem(acl_type, nullptr, 0, ACL_BOOL /* whatever */, nullptr, 0); + if (acl_data == nullptr) { + return tensorflow::errors::Internal("Acl create tensor item failed when send end-of-sequence."); + } + if (acltdtAddDataItem(acl_dataset, acl_data) != ACL_ERROR_NONE) { + if (acltdtDestroyDataItem(acl_data) != ACL_ERROR_NONE) { + LOG(ERROR) << "Acl destroy tensor data item failed when send data with type " + << (acl_type == ACL_TENSOR_DATA_END_OF_SEQUENCE ? "ACL_TENSOR_DATA_END_OF_SEQUENCE" + : "ACL_TENSOR_DATA_ABNORMAL"); + } + return tensorflow::errors::Internal("Acl add tensor data to dataset failed when send data with type ", acl_type); + } + return tensorflow::Status::OK(); + } + for (auto &tensor : tensors) { + aclDataType acl_data_type; + TF_RETURN_IF_ERROR(MappingTfDtypeToAcl(tensor.dtype(), acl_data_type)); + acltdtDataItem *acl_data = nullptr; + if (DataTypeCanUseMemcpy(tensor.dtype())) { + auto dims = tensor.shape().dim_sizes(); + acl_data = acltdtCreateDataItem( + ACL_TENSOR_DATA_TENSOR, (dims.empty() ? nullptr : reinterpret_cast(dims.data())), + dims.size(), acl_data_type, const_cast(tensor.tensor_data().data()), tensor.tensor_data().size()); + } else if (tensor.dtype() == tensorflow::DT_STRING) { + if (tensor.dims() != 0) { + return tensorflow::errors::Internal("Acl send got unexpected non-scalar string tensor with dim ", + tensor.dims()); + } + auto value = reinterpret_cast(const_cast(tensor.tensor_data().data())); + // for scalar type, *dims is nullptr and dim_num is 0 + acl_data = acltdtCreateDataItem(ACL_TENSOR_DATA_TENSOR, nullptr, 0, acl_data_type, + const_cast(value->c_str()), value->size()); + } else { + return tensorflow::errors::Internal("Acl send got unexpected data type ", DataTypeString(tensor.dtype())); + } + if (acl_data == nullptr) { + return tensorflow::errors::Internal("Acl create tensor item failed when send tensor data ", tensor.DebugString()); + } + if (acltdtAddDataItem(acl_dataset, acl_data) != ACL_ERROR_NONE) { + if (acltdtDestroyDataItem(acl_data) != ACL_ERROR_NONE) { + LOG(ERROR) << "Acl destroy tensor data item failed when send data with type ACL_TENSOR_DATA_TENSOR"; + } + return tensorflow::errors::Internal("Acl add tensor data to dataset failed when send tensor data."); + } + } + return tensorflow::Status::OK(); +} + +tensorflow::Status AssembleTensors2AclDataset(acltdtTensorType acl_type, const std::vector &tensors, + acltdtDataset **output_acl_dataset) { + auto acl_dataset = acltdtCreateDataset(); + if (acl_dataset == nullptr) { return tensorflow::errors::Internal("Acl create tensor dataset failed"); } + auto status = AssembleTensors2AclDataset(acl_type, tensors, acl_dataset); + if (!status.ok()) { + NPU_LOG_IF_ERROR(DestroyAclDataset(acl_dataset)); + return status; + } + *output_acl_dataset = acl_dataset; + return tensorflow::Status::OK(); +} + +tensorflow::Status SendTensorsByAcl(acltdtChannelHandle *acl_handle, acltdtTensorType acl_type, + const std::vector &tensors) { + acltdtDataset *acl_dataset = nullptr; + + TF_RETURN_IF_ERROR(AssembleTensors2AclDataset(acl_type, tensors, &acl_dataset)); + + auto 
acl_status = acltdtSendTensor(acl_handle, acl_dataset, -1 /*no timeout*/); + + TF_RETURN_IF_ERROR(DestroyAclDataset(acl_dataset)); + if (acl_status != ACL_ERROR_NONE) { + return tensorflow::errors::Internal("Acl send data failed, acl status:", acl_status); + } + + return tensorflow::Status::OK(); +} + +tensorflow::Status HdcChannel::Create(uint32_t device_id, const std::string& name, + std::shared_ptr *guarded_channel) { + auto channel = new (std::nothrow) HdcChannel(device_id, name); + NPU_REQUIRES(channel, + tensorflow::errors::Internal("Failed allocate memory for hdc channel ", name, " on device ", device_id)); + NPU_REQUIRES_OK(channel->Init()); + guarded_channel->reset(channel); + return tensorflow::Status::OK(); +} + +HdcChannel::~HdcChannel() { + if (acltdtDestroyChannel(handle_) != ACL_ERROR_NONE) { + LOG(ERROR) << "Failed close hdc channel " << name_; + } else { + LOG(INFO) << "Hdc channel " << name_ << " closed"; + } +} + +tensorflow::Status HdcChannel::SendTensors(const std::vector &tensors) { + return SendTensorsByAcl(handle_, ACL_TENSOR_DATA_TENSOR, tensors); +} + +tensorflow::Status HdcChannel::NotifyFinish() { return SendTensorsByAcl(handle_, ACL_TENSOR_DATA_END_OF_SEQUENCE, {}); } + +tensorflow::Status HdcChannel::NotifyAbnormal() { return SendTensorsByAcl(handle_, ACL_TENSOR_DATA_ABNORMAL, {}); } + +HdcChannel::HdcChannel(uint32_t device_id, std::string name) + : handle_(nullptr), device_id_(device_id), name_(std::move(name)) {} +tensorflow::Status HdcChannel::Init() { + handle_ = acltdtCreateChannel(device_id_, name_.c_str()); + if (handle_ == nullptr) { return tensorflow::errors::Internal("Failed create hdc channel by acl"); } + return tensorflow::Status::OK(); +} \ No newline at end of file diff --git a/tf_adapter_2.x/npu_device/core/npu_hdc.h b/tf_adapter_2.x/npu_device/core/npu_hdc.h new file mode 100644 index 0000000000000000000000000000000000000000..aad1e1418c8c40da1abb21128b5f6900a0355a52 --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_hdc.h @@ -0,0 +1,36 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#ifndef TENSORFLOW_NPU_HDC_H +#define TENSORFLOW_NPU_HDC_H + +#include + +#include "acl/acl_tdt.h" +#include "tensorflow/core/framework/tensor.h" + +#include "npu_micros.h" + +class HdcChannel { + public: + static tensorflow::Status Create(uint32_t device_id, const std::string& name, std::shared_ptr *guarded_channel); + + ~HdcChannel(); + + tensorflow::Status SendTensors(const std::vector &tensors); + + tensorflow::Status NotifyFinish(); + + tensorflow::Status NotifyAbnormal(); + + private: + HdcChannel(uint32_t device_id, std::string name); + tensorflow::Status Init(); + acltdtChannelHandle *handle_; + int32_t device_id_; + std::string name_; +}; + +#endif //TENSORFLOW_NPU_HDC_H diff --git a/tf_adapter_2.x/npu_device/core/npu_logger.cpp b/tf_adapter_2.x/npu_device/core/npu_logger.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5236e58dd000165006da4cd3089f8b0842abd6db --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_logger.cpp @@ -0,0 +1,133 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. 
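HdcChannel uses two-phase construction: a private constructor that cannot fail, an Init step that can, and a Create factory that only hands out a fully initialized object whose destructor closes the channel. A minimal standalone sketch of the same shape; Channel here is hypothetical and simulates Init instead of calling acltdtCreateChannel.

```cpp
#include <cstdint>
#include <iostream>
#include <memory>
#include <string>

class Channel {
 public:
  // Only a successfully initialized channel ever escapes this factory.
  static bool Create(uint32_t device_id, const std::string &name, std::shared_ptr<Channel> *out) {
    auto channel = std::shared_ptr<Channel>(new Channel(device_id, name));
    if (!channel->Init()) {
      std::cerr << "Failed to open channel " << name << "\n";
      return false;
    }
    *out = std::move(channel);
    return true;
  }
  ~Channel() { std::cout << "channel " << name_ << " closed\n"; }

 private:
  Channel(uint32_t device_id, std::string name) : device_id_(device_id), name_(std::move(name)) {}
  bool Init() { return !name_.empty(); }  // stand-in for the fallible open call

  uint32_t device_id_;
  std::string name_;
};

int main() {
  std::shared_ptr<Channel> channel;
  if (Channel::Create(0, "train_queue", &channel)) {
    std::cout << "channel ready\n";
  }
}  // the destructor closes the channel when the last shared_ptr goes away
```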
+* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#include "npu_logger.h" +#include "tensorflow/c/eager/c_api.h" + +#include +#include +#include +#include +#include + +#include "tensorflow/c/eager/abstract_tensor_handle.h" + +// clang-format off +#include "tensorflow/core/platform/platform.h" +// clang-format on + +#include "absl/algorithm/container.h" +#include "absl/memory/memory.h" +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/eager/tfe_context_internal.h" +#include "tensorflow/c/eager/tfe_op_internal.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" +#include "tensorflow/c/tf_tensor_internal.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/eager/context.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/device_filters.pb.h" +#include "tensorflow/core/protobuf/error_codes.pb.h" +#include "tensorflow/core/util/device_name_utils.h" +#include "tensorflow/core/common_runtime/copy_tensor.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/device_set.h" +#include "tensorflow/core/common_runtime/eager/attr_builder.h" +#include "tensorflow/core/common_runtime/eager/execute.h" +#include "tensorflow/core/common_runtime/eager/shape_inference.h" +#include "tensorflow/core/common_runtime/eager/tensor_handle.h" +#include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/common_runtime/rendezvous_mgr.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/rendezvous.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/lib/gtl/map_util.h" +#include "tensorflow/core/platform/blocking_counter.h" +#include "tensorflow/core/platform/casts.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/notification.h" +#include "tensorflow/core/platform/random.h" +#include "tensorflow/core/platform/refcount.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/profiler/lib/traceme.h" +#include "tensorflow/core/public/version.h" +#include "tensorflow/core/util/device_name_utils.h" +#include "tensorflow/core/util/env_var.h" +#include "tensorflow/core/graph/algorithm.h" +#include "tensorflow/core/framework/graph_to_functiondef.h" + +#include "npu_micros.h" +#include "npu_managed_buffer.h" +#include "npu_unwrap.h" +#include "npu_logger.h" +#include "npu_device.h" +#include "npu_utils.h" + +namespace npu { +class ProfManager { + public: + static void RecordOp(const std::string &op, std::string detail, bool is_stateful, bool is_unknown) { + Instance().RecordOpInner(op, detail, is_stateful, is_unknown); + } + + private: + static ProfManager &Instance() { + static 
ProfManager prof; + return prof; + } + void RecordOpInner(const std::string &op, std::string detail, bool is_stateful, bool is_unknown) { + std::lock_guard lk(mu_); + op_records_[op]++; + if (is_unknown) { unknown_shape_op_records_[op]++; } + if (is_stateful) { stateful_shape_op_records_[op]++; } + op_shape_records_[op].insert(detail); + } + ~ProfManager() { + std::lock_guard lk(mu_); + LOG(INFO) << "All nodes executed by acl"; + for (auto iter = op_records_.begin(); iter != op_records_.end(); iter++) { + LOG(INFO) << iter->first << ":" << iter->second; + } + + LOG(INFO) << "All stateful nodes executed by acl"; + for (auto iter = stateful_shape_op_records_.begin(); iter != stateful_shape_op_records_.end(); iter++) { + LOG(INFO) << iter->first << ":" << iter->second; + } + + LOG(INFO) << "All unknown shape nodes executed by acl"; + for (auto iter = unknown_shape_op_records_.begin(); iter != unknown_shape_op_records_.end(); iter++) { + LOG(INFO) << iter->first << ":" << iter->second; + } + + LOG(INFO) << "All nodes' shape and type detail executed by acl"; + for (auto iter = op_shape_records_.begin(); iter != op_shape_records_.end(); iter++) { + std::stringstream ss; + ss << std::endl << iter->first << ":"; + for (auto status : iter->second) { ss << std::endl << status; } + LOG(INFO) << ss.str(); + } + } + ProfManager() = default; + std::mutex mu_; + std::map op_records_ GUARDED_BY(mu_); + std::map unknown_shape_op_records_ GUARDED_BY(mu_); + std::map stateful_shape_op_records_ GUARDED_BY(mu_); + std::map> op_shape_records_ GUARDED_BY(mu_); +}; +} // namespace npu \ No newline at end of file diff --git a/tf_adapter_2.x/npu_device/core/npu_logger.h b/tf_adapter_2.x/npu_device/core/npu_logger.h new file mode 100644 index 0000000000000000000000000000000000000000..2ea5c5e5d6ee16d7fc91e161d6d4e05e93aaca31 --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_logger.h @@ -0,0 +1,55 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#ifndef TENSORFLOW_NPU_LOGGER_H +#define TENSORFLOW_NPU_LOGGER_H + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/util/env_var.h" + +#include "npu_env.h" + +#define DLOG() \ + if (kDumpExecutionDetail) LOG(INFO) + +namespace npu { +// TODO:日志适配层,需要对接slog,当前未使用,复用的tensorflow +class Logger : public std::basic_ostringstream { + public: + Logger(const char *f, int line) { *this << f << ":" << line << " "; } + ~Logger() override { std::cerr << str() << std::endl; } +}; + +class Timer : public std::basic_ostringstream { + public: + template + explicit Timer(Args... args) { + *this << tensorflow::strings::StrCat(args...) 
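ProfManager above is a Meyers singleton that accumulates per-op execution counters under a mutex and dumps them when the process exits. The sketch below keeps just that skeleton; OpCounter is an illustrative name and it records a single counter per op rather than the four tables kept above.

```cpp
#include <cstdint>
#include <iostream>
#include <map>
#include <mutex>
#include <string>

class OpCounter {
 public:
  static void Record(const std::string &op) { Instance().RecordInner(op); }

 private:
  // Function-local static: constructed on first use, destroyed at exit.
  static OpCounter &Instance() {
    static OpCounter counter;
    return counter;
  }
  void RecordInner(const std::string &op) {
    std::lock_guard<std::mutex> lk(mu_);
    counts_[op]++;
  }
  ~OpCounter() {  // dump accumulated counts when the singleton is destroyed
    std::lock_guard<std::mutex> lk(mu_);
    for (const auto &entry : counts_) {
      std::cout << entry.first << ": " << entry.second << "\n";
    }
  }
  OpCounter() = default;

  std::mutex mu_;
  std::map<std::string, uint64_t> counts_;
};

int main() {
  OpCounter::Record("MatMul");
  OpCounter::Record("MatMul");
  OpCounter::Record("Add");
}
```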
<< " cost "; + }; + void Start() { + if (TF_PREDICT_FALSE(kPerfEnabled)) { start_ = tensorflow::Env::Default()->NowMicros(); } + started_ = true; + } + void Stop() { + if (started_ && TF_PREDICT_FALSE(kPerfEnabled)) { + *this << (tensorflow::Env::Default()->NowMicros() - start_) / 1000 << " ms"; + LOG(INFO) << str(); + } + started_ = false; + } + + private: + uint64_t start_{0}; + bool started_{false}; +}; +} // namespace npu + +#endif //TENSORFLOW_NPU_DEVICE_ACL_BACKENDS_H diff --git a/tf_adapter_2.x/npu_device/core/npu_managed_buffer.cpp b/tf_adapter_2.x/npu_device/core/npu_managed_buffer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9a005607846a9102b1368ee993074c108fdc6c31 --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_managed_buffer.cpp @@ -0,0 +1,314 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#include "npu_managed_buffer.h" +#include "npu_logger.h" +#include "npu_micros.h" +#include "npu_utils.h" + +#include "acl/acl_op_compiler.h" +#include "acl/acl_rt.h" + +#include "tensorflow/core/common_runtime/dma_helper.h" + +namespace { +class NpuMemory { + public: + static tensorflow::Status Malloc(size_t size, void **memory) { + if (size == 0) { + *memory = nullptr; + return tensorflow::Status::OK(); + } + NPU_REQUIRES_ACL_OK("Malloc npu memory failed for size " + std::to_string(size), + aclrtMalloc(memory, size, ACL_MEM_MALLOC_HUGE_FIRST)); + return tensorflow::Status::OK(); + } + static void Free(void *memory, size_t size, void *arg) { aclrtFree(memory); } +}; + +class RtsStreamGuard { + public: + explicit RtsStreamGuard(aclrtStream stream) : stream_(stream) {} + ~RtsStreamGuard() { + if (stream_ != nullptr) { + aclrtDestroyStream(stream_); + stream_ = nullptr; + } + } + + private: + aclrtStream stream_; +}; + +tensorflow::Status CreateAclTensorDesc(ge::DataType dtype, ge::Format format, const std::vector &shape, + std::shared_ptr *desc) { + aclDataType acl_dtype; + aclFormat acl_format; + NPU_REQUIRES_OK(MapGeType2Acl(dtype, &acl_dtype)); + NPU_REQUIRES_OK(MapGeFormat2Acl(format, &acl_format)); + aclTensorDesc *acl_desc = aclCreateTensorDesc(acl_dtype, shape.size(), shape.data(), acl_format); + NPU_REQUIRES(acl_desc != nullptr, tensorflow::errors::Internal("Failed create acl tensor desc")); + desc->reset(acl_desc, [](aclTensorDesc *desc) { aclDestroyTensorDesc(desc); }); + return tensorflow::Status::OK(); +} + +tensorflow::Status CreateAclDataBuffer(void *data, size_t size, std::shared_ptr *buf) { + aclDataBuffer *acl_buf = aclCreateDataBuffer(data, size); + NPU_REQUIRES(acl_buf != nullptr, tensorflow::errors::Internal("Failed create acl data buffer")); + buf->reset(acl_buf, [](aclDataBuffer *buf) { aclDestroyDataBuffer(buf); }); + return tensorflow::Status::OK(); +} + +tensorflow::Status CreateTransFormatAttr(ge::Format src, ge::Format dst, std::shared_ptr *attr) { + aclopAttr *acl_attr = aclopCreateAttr(); + NPU_REQUIRES(acl_attr != nullptr, tensorflow::errors::Internal("Failed create acl op attr")); + attr->reset(acl_attr, [](aclopAttr *attr) { aclopDestroyAttr(attr); }); + + NPU_REQUIRES_ACL_OK("Acl set op attr src_format failed", + aclopSetAttrString(acl_attr, "src_format", GetFormatName(src))); + + NPU_REQUIRES_ACL_OK("Acl set op attr dst_format failed", + aclopSetAttrString(acl_attr, "dst_format", GetFormatName(dst))); + return tensorflow::Status::OK(); +} + +tensorflow::Status CreateCastDtypeAttr(ge::DataType src, 
ge::DataType dst, std::shared_ptr *attr) { + aclopAttr *acl_attr = aclopCreateAttr(); + NPU_REQUIRES(acl_attr != nullptr, tensorflow::errors::Internal("")); + attr->reset(acl_attr, [](aclopAttr *attr) { aclopDestroyAttr(attr); }); + + NPU_REQUIRES_ACL_OK("Acl set op attr dst_type failed", + aclopSetAttrInt(acl_attr, "dst_type", static_cast(dst))); + return tensorflow::Status::OK(); +} + +tensorflow::Status ScheduleCastDtypeTask(aclrtStream stream, ge::Format format, const std::vector &shape, + ge::DataType src_dt, ge::DataType dst_dt, void *src_data, void *dst_data, + size_t src_len, size_t dst_len) { + // TODO: 在一些cube格式的极端场景下,data type转换后,shape也会跟着转,这里暂时没有考虑这种场景 + std::shared_ptr input_desc; + NPU_REQUIRES_OK(CreateAclTensorDesc(src_dt, format, shape, &input_desc)); + aclTensorDesc *input_descs[] = {input_desc.get()}; + + std::shared_ptr input_data; + NPU_REQUIRES_OK(CreateAclDataBuffer(src_data, src_len, &input_data)); + aclDataBuffer *input_dbs[] = {input_data.get()}; + + std::shared_ptr output_desc; + NPU_REQUIRES_OK(CreateAclTensorDesc(dst_dt, format, shape, &output_desc)); + aclTensorDesc *output_ds[] = {output_desc.get()}; + + std::shared_ptr output_data; + NPU_REQUIRES_OK(CreateAclDataBuffer(dst_data, dst_len, &output_data)); + aclDataBuffer *output_dbs[] = {output_data.get()}; + + std::shared_ptr attr; + NPU_REQUIRES_OK(CreateCastDtypeAttr(src_dt, dst_dt, &attr)); + NPU_REQUIRES_ACL_OK("Acl compile and execute \'Cast\' op failed", + aclopCompileAndExecute("Cast", 1, input_descs, input_dbs, 1, output_ds, output_dbs, attr.get(), + ACL_ENGINE_AICORE, ACL_COMPILE_SYS, nullptr, stream)); + return tensorflow::Status::OK(); +} + +tensorflow::Status ScheduleTransFormatTask(aclrtStream stream, ge::DataType src_dt, ge::Format src_format, + const std::vector &src_shape, ge::Format dst_format, + const std::vector &dst_shape, void *src_data, void *dst_data, + size_t src_len, size_t dst_len) { + std::shared_ptr input_desc; + NPU_REQUIRES_OK(CreateAclTensorDesc(src_dt, src_format, src_shape, &input_desc)); + aclTensorDesc *input_descs[] = {input_desc.get()}; + + std::shared_ptr input_data; + NPU_REQUIRES_OK(CreateAclDataBuffer(src_data, src_len, &input_data)); + aclDataBuffer *input_dbs[] = {input_data.get()}; + + std::shared_ptr output_desc; + NPU_REQUIRES_OK(CreateAclTensorDesc(src_dt, dst_format, dst_shape, &output_desc)); + aclTensorDesc *output_ds[] = {output_desc.get()}; + + std::shared_ptr output_data; + NPU_REQUIRES_OK(CreateAclDataBuffer(dst_data, dst_len, &output_data)); + aclDataBuffer *output_dbs[] = {output_data.get()}; + + std::shared_ptr attr; + NPU_REQUIRES_OK(CreateTransFormatAttr(src_format, dst_format, &attr)); + NPU_REQUIRES_ACL_OK("Acl compile and execute \'TransData\' op failed", + aclopCompileAndExecute("TransData", 1, input_descs, input_dbs, 1, output_ds, output_dbs, + attr.get(), ACL_ENGINE_AICORE, ACL_COMPILE_SYS, nullptr, stream)); + return tensorflow::Status::OK(); +} +} // namespace + +NpuManagedBuffer::~NpuManagedBuffer() { + if (deallocator_ && size_ > 0) { deallocator_(data_, size_, deallocator_arg_); } +} + +tensorflow::Status NpuManagedBuffer::Create(ge::Format fmt, const tensorflow::TensorShape &shape, + tensorflow::DataType dtype, NpuManagedBuffer **buf) { + std::vector dims; + for (auto dim_size : shape.dim_sizes()) { dims.push_back(dim_size); } + ge::DataType ge_type; + NPU_REQUIRES_OK(MapTfType2Ge(dtype, &ge_type)); + return Create(fmt, dims, ge_type, buf); +} + +tensorflow::Status NpuManagedBuffer::Create(ge::Format format, const std::vector &dims, 
ge::DataType data_type, + NpuManagedBuffer **buf) { + return Create(format, dims, data_type, format, dims, buf); +} + +tensorflow::Status NpuManagedBuffer::Create(ge::Format format, const std::vector &shape, + ge::DataType data_type, ge::Format origin_format, + const std::vector &origin_shape, NpuManagedBuffer **buf) { + size_t total_bytes; + int dtype_size = ge::GetSizeByDataType(data_type); + NPU_REQUIRES(dtype_size > 0, + tensorflow::errors::Internal("Data type size invalid ", dtype_size, " for ge type enum ", data_type)); + total_bytes = dtype_size; + for (auto dim_size : shape) { + if (dim_size == 0) { + total_bytes = 0; + break; + } + NPU_REQUIRES(dim_size >= 0, tensorflow::errors::InvalidArgument("Dim size invalid for shape ", VecToString(shape))); + NPU_REQUIRES(total_bytes <= total_bytes * dim_size, + tensorflow::errors::InvalidArgument("Total bytes overflow for shape ", VecToString(shape))); + total_bytes *= dim_size; + } + void *data = nullptr; + NPU_REQUIRES_OK(NpuMemory::Malloc(total_bytes, &data)); + auto status = + Create(format, shape, data_type, origin_format, origin_shape, data, total_bytes, nullptr, NpuMemory::Free, buf); + if (!status.ok()) { NpuMemory::Free(data, total_bytes, nullptr); } + return status; +} + +tensorflow::Status NpuManagedBuffer::Create(ge::Format format, const std::vector &shape, + ge::DataType data_type, ge::Format origin_format, + const std::vector &origin_shape, void *addr, size_t size, + void *arg, void (*deallocator)(void *, size_t, void *), + NpuManagedBuffer **buf) { + *buf = new (std::nothrow) NpuManagedBuffer(); + if (*buf == nullptr) { return tensorflow::errors::Internal("Failed malloc host npu buffer handle"); } + (*buf)->format_ = format; + (*buf)->shape_ = shape; + (*buf)->data_type_ = data_type; + (*buf)->origin_format_ = origin_format; + (*buf)->origin_data_type_ = data_type; + (*buf)->origin_shape_ = origin_shape; + + (*buf)->data_ = addr; + (*buf)->size_ = size; + (*buf)->deallocator_arg_ = arg; + (*buf)->deallocator_ = deallocator; + + return tensorflow::Status::OK(); +} + +void NpuManagedBuffer::Destroy(NpuManagedBuffer *buf) { delete buf; } + +tensorflow::Status NpuManagedBuffer::AssembleTo(const tensorflow::Tensor *tensor) { + NPU_REQUIRES(tensor != nullptr, + tensorflow::errors::InvalidArgument("Failed assemble npu buffer to cpu as dst cpu tensor is nullptr")); + DLOG() << "Npu buffer " << DebugString() << " assemble to " << tensor->DebugString(); + tensorflow::DataType dtype; + NPU_REQUIRES_OK(MapGeType2Tf(origin_data_type_, &dtype)); + NPU_REQUIRES(dtype == tensor->dtype(), + tensorflow::errors::InvalidArgument("Data type mismatch when assemble npu buffer to cpu, npu ", + tensorflow::DataTypeString(dtype), " vs. 
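NpuManagedBuffer::Create sizes the allocation by multiplying the dimension sizes into the element size, bailing out on zero or negative dims and on overflow. Below is a standalone sketch of a slightly stricter variant of that computation that checks the product before it can wrap; TotalBytes is a hypothetical helper, not the project's code.

```cpp
#include <cstdint>
#include <iostream>
#include <limits>
#include <vector>

// Compute the byte size of a dense tensor, rejecting negative dimensions and
// detecting multiplication overflow by dividing the max before multiplying.
bool TotalBytes(const std::vector<int64_t> &dims, size_t element_size, size_t &total) {
  total = element_size;
  for (int64_t dim : dims) {
    if (dim < 0) return false;
    if (dim == 0) { total = 0; return true; }  // empty tensor needs no memory
    if (total > std::numeric_limits<size_t>::max() / static_cast<size_t>(dim)) return false;
    total *= static_cast<size_t>(dim);
  }
  return true;
}

int main() {
  size_t total = 0;
  std::cout << TotalBytes({2, 3, 4}, 4, total) << " " << total << "\n";            // 1 96
  std::cout << TotalBytes({int64_t{1} << 40, int64_t{1} << 40}, 8, total) << "\n";  // 0: overflow
}
```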
cpu ", + tensorflow::DataTypeString(tensor->dtype()))); + if (size_ == 0) { return tensorflow::Status::OK(); } + if (SameRepresentation()) { + NPU_REQUIRES_OK(DToH(const_cast(tensor->tensor_data().data()), tensor->TotalBytes())); + } else { + NpuManagedBuffer *buf; + NPU_REQUIRES_OK(Create(origin_format_, origin_shape_, origin_data_type_, &buf)); + NpuManagedBuffer::Guarder guarder(buf); + NPU_REQUIRES_OK(TransRepresentationOnNpu(buf)); + buf->DToH(const_cast(tensor->tensor_data().data()), tensor->TotalBytes()); + } + return tensorflow::Status::OK(); +} + +tensorflow::Status NpuManagedBuffer::AssembleFrom(const tensorflow::Tensor *tensor) { + NPU_REQUIRES(tensor != nullptr, + tensorflow::errors::InvalidArgument("Failed assemble npu buffer from cpu as dst cpu tensor is nullptr")); + DLOG() << "Npu buffer " << DebugString() << " assemble from " << tensor->DebugString(); + tensorflow::DataType dtype; + NPU_REQUIRES_OK(MapGeType2Tf(origin_data_type_, &dtype)); + NPU_REQUIRES(dtype == tensor->dtype(), + tensorflow::errors::InvalidArgument("Data type mismatch when assemble npu buffer from cpu, npu ", + tensorflow::DataTypeString(dtype), " vs. cpu ", + tensorflow::DataTypeString(tensor->dtype()))); + if (size_ == 0) { return tensorflow::Status::OK(); } + if (SameRepresentation()) { + NPU_REQUIRES_OK(HToD(const_cast(tensor->tensor_data().data()), tensor->TotalBytes())); + } else { + NpuManagedBuffer *buf; + NPU_REQUIRES_OK(Create(origin_format_, origin_shape_, origin_data_type_, &buf)); + NpuManagedBuffer::Guarder guarder(buf); + NPU_REQUIRES_OK(buf->HToD(const_cast(tensor->tensor_data().data()), tensor->TotalBytes())); + NPU_REQUIRES_OK(buf->TransRepresentationOnNpu(this)); + } + return tensorflow::Status::OK(); +} + +tensorflow::Status NpuManagedBuffer::TransRepresentationOnNpu(NpuManagedBuffer *dst_buff) { + DLOG() << "Trans representation on npu, format " << GetFormatName(format_) << " to " + << GetFormatName(dst_buff->format_) << ", data type " << data_type_ << " to " << dst_buff->data_type_; + NPU_REQUIRES(format_ != dst_buff->format_ || data_type_ != dst_buff->data_type_, tensorflow::errors::Internal("")); + + aclrtStream rts = nullptr; + NPU_REQUIRES_ACL_OK("Acl create stream failed", aclrtCreateStream(&rts)); + RtsStreamGuard rts_guard(rts); + if (format_ == dst_buff->format_) { + NPU_REQUIRES_OK(ScheduleCastDtypeTask(rts, format_, shape_, data_type_, dst_buff->data_type_, data_, + dst_buff->data_, size_, dst_buff->size_)); + } else if (data_type_ == dst_buff->data_type_) { + NPU_REQUIRES_OK(ScheduleTransFormatTask(rts, data_type_, format_, shape_, dst_buff->format_, dst_buff->shape_, + data_, dst_buff->data_, size_, dst_buff->size_)); + } else { + NpuManagedBuffer *buf; + NPU_REQUIRES_OK(Create(format_, shape_, dst_buff->data_type_, &buf)); + NpuManagedBuffer::Guarder guarder(buf); + NPU_REQUIRES_OK(ScheduleCastDtypeTask(rts, format_, shape_, data_type_, dst_buff->data_type_, data_, buf->data_, + size_, buf->size_)); + NPU_REQUIRES_OK(ScheduleTransFormatTask(rts, buf->data_type_, buf->format_, buf->shape_, dst_buff->format_, + dst_buff->shape_, buf->data_, dst_buff->data_, buf->size_, + dst_buff->size_)); + } + NPU_REQUIRES_ACL_OK("Acl synchronize stream failed", aclrtSynchronizeStream(rts)); + return tensorflow::Status::OK(); +} + +tensorflow::Status NpuManagedBuffer::HToD(void *host_data, size_t size) { + NPU_REQUIRES( + size <= size_, + tensorflow::errors::Internal("Failed copy host buffer to npu as size mismatch npu ", size_, " vs. 
cpu ", size)); + NPU_REQUIRES_ACL_OK("Acl rt-memcpy host to device failed", + aclrtMemcpy(data_, size_, host_data, size, ACL_MEMCPY_HOST_TO_DEVICE)); + return tensorflow::Status::OK(); +} + +tensorflow::Status NpuManagedBuffer::DToH(void *host_data, size_t size) { + NPU_REQUIRES( + size >= size_, + tensorflow::errors::Internal("Failed copy npu buffer to host as size mismatch npu ", size_, " vs. cpu ", size)); + NPU_REQUIRES_ACL_OK("Acl rt-memcpy device to host failed", + aclrtMemcpy(host_data, size, data_, size_, ACL_MEMCPY_DEVICE_TO_HOST)); + return tensorflow::Status::OK(); +} + +std::string NpuManagedBuffer::DebugString() const { + std::stringstream ss; + tensorflow::DataType origin_type; + tensorflow::DataType storage_type; + (void) MapGeType2Tf(origin_data_type_, &origin_type); + (void) MapGeType2Tf(data_type_, &storage_type); + ss << "origin " << GetFormatName(origin_format_) << " " << tensorflow::DataTypeString(origin_type) + << VecToString(origin_shape_) << ", storage " << GetFormatName(origin_format_) << " " + << tensorflow::DataTypeString(storage_type) << VecToString(shape_); + return ss.str(); +} \ No newline at end of file diff --git a/tf_adapter_2.x/npu_device/core/npu_managed_buffer.h b/tf_adapter_2.x/npu_device/core/npu_managed_buffer.h new file mode 100644 index 0000000000000000000000000000000000000000..7b140b9a83d555e605313f8dbe8ee0ee3a5a68fd --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_managed_buffer.h @@ -0,0 +1,82 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#ifndef TENSORFLOW_NPU_MANAGED_BUFFER_H +#define TENSORFLOW_NPU_MANAGED_BUFFER_H + +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/status.h" + +#include "graph/types.h" + +class NpuManagedBuffer { + public: + static void Destroy(NpuManagedBuffer *buf); + + static tensorflow::Status Create(ge::Format fmt, const tensorflow::TensorShape &shape, tensorflow::DataType dtype, + NpuManagedBuffer **buf); + static tensorflow::Status Create(ge::Format format, const std::vector &shape, ge::DataType data_type, + NpuManagedBuffer **buf); + static tensorflow::Status Create(ge::Format format, const std::vector &shape, ge::DataType data_type, + ge::Format origin_format, const std::vector &origin_shape, + NpuManagedBuffer **buf); + static tensorflow::Status Create(ge::Format format, const std::vector &shape, ge::DataType data_type, + ge::Format origin_format, const std::vector &origin_shape, void *addr, + size_t size, void *arg, void (*deallocator)(void *, size_t, void *), + NpuManagedBuffer **buf); + + // 将输入的CPU Tensor的数据填充到当前buffer管理的NPU内存上,CPU Tensor的格式和type与buffer的成员origin_data_type_和origin_format_一致 + tensorflow::Status AssembleFrom(const tensorflow::Tensor *tensor); + + // 将当前buffer管理的NPU内存上的数据填充到输入的CPU Tensor的数据地址上,CPU Tensor的格式和type与buffer的成员origin_data_type_和origin_format_一致 + tensorflow::Status AssembleTo(const tensorflow::Tensor *tensor); + + bool SameRepresentation() { return origin_format_ == format_ && origin_data_type_ == data_type_; } + + std::string DebugString() const; + + class Guarder { + public: + explicit Guarder(NpuManagedBuffer *buf) : buf_(buf) {} + ~Guarder() { 
NpuManagedBuffer::Destroy(buf_); } + + private: + NpuManagedBuffer *buf_; + }; + + private: + NpuManagedBuffer() = default; + ~NpuManagedBuffer(); + tensorflow::Status TransRepresentationOnNpu(NpuManagedBuffer *dst_buff); // 在NPU上完成从存储到原始的格式和类型转换 + tensorflow::Status HToD(void *host_data, size_t size); // 将输入的Host内存搬运到管理的NPU内存上 + tensorflow::Status DToH(void *host_data, size_t max_len); // 将管理的NPU内存上的数据搬运到输入的Host内存上 + + ge::DataType origin_data_type_{}; // 原始数据类型,即对应的CPU Tensor的数据类型 + ge::Format origin_format_{}; // 原始内存排布,即对应的CPU Tensor的维度信息,一般都是ND,可能是NCHW或者NHWC + std::vector origin_shape_; // 原始维度信息,即对应的CPU Tensor的原始维度 + ge::DataType data_type_{}; // 在NPU上的存储数据类型 + ge::Format format_{}; // 在NPU上的存储格式 + std::vector shape_; // 对应NPU上的存储格式的维度值 + + size_t size_{}; // NPU上占用的内存大小 + void *data_{}; // NPU地址指针 + void (*deallocator_)(void *, size_t, void *){}; // NP内存的释放函数,内存可能会来自于内存池或者rtMalloc + void *deallocator_arg_{}; // 地址释放时传给释放函数的参数 +}; + +// NpuManagedBuffer是Host的对象,是CPU Tensor管理的对象,是NPU内存的Host句柄,应当在析构函数中释放NPU内存 +static void NpuManagedBufferDeallocator(void *data, size_t len, void *arg) { + NpuManagedBuffer::Destroy(reinterpret_cast(data)); +} + +#endif //TENSORFLOW_NPU_TENSOR_H \ No newline at end of file diff --git a/tf_adapter_2.x/npu_device/core/npu_micros.h b/tf_adapter_2.x/npu_device/core/npu_micros.h new file mode 100644 index 0000000000000000000000000000000000000000..71f9cb42ee40807404d937d579b6c111aad4dfc1 --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_micros.h @@ -0,0 +1,108 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#ifndef TENSORFLOW_NPU_MICROS_H +#define TENSORFLOW_NPU_MICROS_H + +#define NPU_CTX_REQUIRES_OK(CTX, ...) \ + do { \ + CTX->status = (__VA_ARGS__); \ + if (TF_PREDICT_FALSE(!CTX->status.ok())) { \ + LOG(ERROR) << CTX->status.ToString(); \ + return; \ + } \ + } while (0) + +#define NPU_CTX_REQUIRES(CTX, EXP, STATUS) \ + do { \ + if (!TF_PREDICT_TRUE(EXP)) { \ + CTX->status = STATUS; \ + LOG(ERROR) << CTX->status.ToString(); \ + return; \ + } \ + } while (0) + +#define NPU_CTX_REQUIRES_OK_RETURN(CTX, EXP, RET) \ + do { \ + CTX->status = (EXP); \ + if (TF_PREDICT_FALSE(!CTX->status.ok())) { \ + LOG(ERROR) << CTX->status.ToString(); \ + return RET; \ + } \ + } while (0) + +#define NPU_CTX_REQUIRES_RETURN(CTX, EXP, STATUS, RET) \ + do { \ + if (TF_PREDICT_FALSE(!(EXP))) { \ + CTX->status = (STATUS); \ + LOG(ERROR) << CTX->status.ToString(); \ + return RET; \ + } \ + } while (0) + +#define NPU_REQUIRES_OK(...) \ + do { \ + tensorflow::Status _status = (__VA_ARGS__); \ + if (TF_PREDICT_FALSE(!_status.ok())) { \ + LOG(ERROR) << _status.ToString(); \ + return _status; \ + } \ + } while (0) + +#define NPU_REQUIRES(EXP, STATUS) \ + do { \ + if (!TF_PREDICT_TRUE((EXP))) { \ + tensorflow::Status _status = (STATUS); \ + LOG(ERROR) << _status.ToString(); \ + return _status; \ + } \ + } while (0) + +#define NPU_CTX_REQUIRES_GE_OK(CTX, PREFIX, ...) 
\ + do { \ + ge::Status _status = (__VA_ARGS__); \ + if (TF_PREDICT_FALSE(_status != ge::SUCCESS)) { \ + std::string err_msg = ge::StatusFactory::Instance()->GetErrDesc(_status); \ + if (err_msg.empty()) { err_msg = " code:" + std::to_string(_status); } \ + CTX->status = tensorflow::errors::Internal(PREFIX, ":", err_msg); \ + LOG(ERROR) << CTX->status.ToString(); \ + return; \ + } \ + } while (0) + +#define NPU_CTX_REQUIRES_GE_OK_RETURN(CTX, PREFIX, EXP, RET) \ + do { \ + ge::Status _status = (EXP); \ + if (TF_PREDICT_FALSE(_status != ge::SUCCESS)) { \ + std::string err_msg = ge::StatusFactory::Instance()->GetErrDesc(_status); \ + if (err_msg.empty()) { err_msg = " code:" + std::to_string(_status); } \ + CTX->status = tensorflow::errors::Internal(PREFIX, ":", err_msg); \ + LOG(ERROR) << CTX->status.ToString(); \ + return RET; \ + } \ + } while (0) + +#define NPU_REQUIRES_ACL_OK(PREFIX, ...) \ + do { \ + auto _status = (__VA_ARGS__); \ + if (TF_PREDICT_FALSE(_status != ACL_ERROR_NONE)) { \ + return tensorflow::errors::Internal(PREFIX, ": code:", _status); \ + } \ + } while (0) + +#define NPU_LOG_IF_ERROR(...) \ + do { \ + const ::tensorflow::Status _status = (__VA_ARGS__); \ + if (TF_PREDICT_FALSE(!_status.ok())) LOG(ERROR) << _status.ToString(); \ + } while (0) + +#define HANDLE_ALL_FORMAT() \ + HANDLE_FORMAT(Nd) \ + HANDLE_FORMAT(Nchw) \ + HANDLE_FORMAT(Nc1hwc0) \ + HANDLE_FORMAT(Fz) \ + HANDLE_FORMAT(Hz) + +#endif //TENSORFLOW_NPU_MICROS_H diff --git a/tf_adapter_2.x/npu_device/core/npu_parser.h b/tf_adapter_2.x/npu_device/core/npu_parser.h new file mode 100644 index 0000000000000000000000000000000000000000..ae036e1667e751ab77ae4f99d545246d5a65c729 --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_parser.h @@ -0,0 +1,123 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. 
+* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#ifndef TENSORFLOW_NPU_PARSER_H +#define TENSORFLOW_NPU_PARSER_H + +#include + +#include "npu_types.h" +#include "npu_unwrap.h" +#include "npu_utils.h" + +#include "graph/types.h" + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/tf_status.h" + +namespace { +const std::string kInputDesc = "input_tensor_desc"; +const std::string kOutputDesc = "output_tensor_desc"; +const std::string kFormat = "serialize_format"; +const std::string kType = "serialize_datatype"; +const std::string kShape = "serialize_shape"; +const std::string kSubGraph = "SubGraph"; +} // namespace + +template +static tensorflow::AttrValue BuildDescAttr(T shapes, TensorDataTypes types) { + tensorflow::AttrValue desc_attr; + for (size_t i = 0; i < types.size(); i++) { + auto desc = desc_attr.mutable_list()->add_func(); + desc->set_name(std::to_string(i)); + + tensorflow::AttrValue shape_value; + for (int j = 0; j < shapes[i].dims(); j++) { shape_value.mutable_list()->add_i(shapes[i].dim_size(j)); } + desc->mutable_attr()->insert({kShape, shape_value}); + + tensorflow::AttrValue type_value; + type_value.set_i(static_cast(types[i])); + desc->mutable_attr()->insert({kType, type_value}); + + tensorflow::AttrValue format_value; + format_value.set_i(static_cast(ge::Format::FORMAT_NHWC)); + desc->mutable_attr()->insert({kFormat, format_value}); + } + return desc_attr; +} + +static void AssembleDesc(TensorPartialShapes shapes, TensorDataTypes types, const std::string &name, + tensorflow::NodeDef *ndef) { + tensorflow::AddNodeAttr(name, BuildDescAttr(std::move(shapes), std::move(types)), ndef); +} + +static void AssembleDesc(TensorShapes shapes, TensorDataTypes types, const std::string &name, + tensorflow::NodeDef *ndef) { + tensorflow::AddNodeAttr(name, BuildDescAttr(std::move(shapes), std::move(types)), ndef); +} + +static void AssembleInputDesc(TensorPartialShapes shapes, TensorDataTypes types, tensorflow::NodeDef *ndef) { + AssembleDesc(std::move(shapes), std::move(types), kInputDesc, ndef); +} + +static void AssembleOutputDesc(TensorPartialShapes shapes, TensorDataTypes types, tensorflow::NodeDef *ndef) { + AssembleDesc(std::move(shapes), std::move(types), kOutputDesc, ndef); +} + +static void AssembleInputDesc(TensorShapes shapes, TensorDataTypes types, tensorflow::NodeDef *ndef) { + AssembleDesc(std::move(shapes), std::move(types), kInputDesc, ndef); +} + +static void AssembleOutputDesc(TensorShapes shapes, TensorDataTypes types, tensorflow::NodeDef *ndef) { + AssembleDesc(std::move(shapes), std::move(types), kOutputDesc, ndef); +} + +static void AssembleInputDesc(TensorShapes shapes, TensorDataTypes types, tensorflow::Node *n) { + n->AddAttr(kInputDesc, BuildDescAttr(std::move(shapes), std::move(types))); +} + +static void AssembleOutputDesc(TensorShapes shapes, TensorDataTypes types, tensorflow::Node *n) { + n->AddAttr(kOutputDesc, BuildDescAttr(std::move(shapes), std::move(types))); +} + +static void AssembleInputDesc(TensorPartialShapes shapes, TensorDataTypes types, tensorflow::Node *n) { + n->AddAttr(kInputDesc, BuildDescAttr(std::move(shapes), std::move(types))); +} + +static void AssembleOutputDesc(TensorPartialShapes shapes, TensorDataTypes types, tensorflow::Node *n) { + n->AddAttr(kOutputDesc, BuildDescAttr(std::move(shapes), std::move(types))); +} + +static void AssembleOpDef(const tensorflow::OpRegistrationData *op_data, 
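+                          // Serializes the registered OpDef into the node's "op_def" attr so the
+                          // generated graph carries its own op definitions for the downstream parser.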
tensorflow::Node *n) { + std::string serialized_op_def; + op_data->op_def.SerializeToString(&serialized_op_def); + n->AddAttr("op_def", serialized_op_def); +} + +static void AssembleOpDef(tensorflow::Node *n) { + const tensorflow::OpRegistrationData *op_reg_data; + tensorflow::OpRegistry::Global()->LookUp(n->type_string(), &op_reg_data); + std::string serialized_op_def; + op_reg_data->op_def.SerializeToString(&serialized_op_def); + n->AddAttr("op_def", serialized_op_def); +} + +static void AssembleOpDef(const tensorflow::OpRegistrationData *op_data, tensorflow::NodeDef *ndef) { + std::string serialized_op_def; + op_data->op_def.SerializeToString(&serialized_op_def); + tensorflow::AddNodeAttr("op_def", serialized_op_def, ndef); +} + +static void AssembleOpDef(tensorflow::NodeDef *ndef) { + const tensorflow::OpRegistrationData *op_reg_data; + tensorflow::OpRegistry::Global()->LookUp(ndef->op(), &op_reg_data); + std::string serialized_op_def; + op_reg_data->op_def.SerializeToString(&serialized_op_def); + tensorflow::AddNodeAttr("op_def", serialized_op_def, ndef); +} + +#endif //TENSORFLOW_NPU_PARSER_H diff --git a/tf_adapter_2.x/npu_device/core/npu_types.h b/tf_adapter_2.x/npu_device/core/npu_types.h new file mode 100644 index 0000000000000000000000000000000000000000..1f13cb212b7c5fb243d1f5bbca2fd618a4b8500a --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_types.h @@ -0,0 +1,24 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#ifndef TENSORFLOW_NPU_TYPES_H +#define TENSORFLOW_NPU_TYPES_H + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/c_api_internal.h" + +using TensorPartialShapes = tensorflow::gtl::InlinedVector; +using TensorShapes = tensorflow::gtl::InlinedVector; +using TensorDataTypes = tensorflow::gtl::InlinedVector; + +using VecTensorPartialShapes = tensorflow::gtl::InlinedVector; +using VecTensorShapes = tensorflow::gtl::InlinedVector; +using VecTensorDataTypes = tensorflow::gtl::InlinedVector; + +const static tensorflow::TensorShape kScalarShape; + +#endif //TENSORFLOW_NPU_TYPES_H diff --git a/tf_adapter_2.x/npu_device/core/npu_unwrap.h b/tf_adapter_2.x/npu_device/core/npu_unwrap.h new file mode 100644 index 0000000000000000000000000000000000000000..41e7fed80f6916e15a809fb5def1c64e352d2151 --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_unwrap.h @@ -0,0 +1,87 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. 
+* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#ifndef TENSORFLOW_NPU_UNWRAP_H +#define TENSORFLOW_NPU_UNWRAP_H + +#include "absl/algorithm/container.h" +#include "absl/memory/memory.h" +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/c_api_internal.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/eager/tfe_context_internal.h" +#include "tensorflow/c/eager/tfe_op_internal.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" +#include "tensorflow/c/tf_tensor_internal.h" +#include "tensorflow/core/common_runtime/copy_tensor.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/device_set.h" +#include "tensorflow/core/common_runtime/eager/attr_builder.h" +#include "tensorflow/core/common_runtime/eager/context.h" +#include "tensorflow/core/common_runtime/eager/execute.h" +#include "tensorflow/core/common_runtime/eager/shape_inference.h" +#include "tensorflow/core/common_runtime/eager/tensor_handle.h" +#include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/common_runtime/rendezvous_mgr.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/rendezvous.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/lib/gtl/map_util.h" +#include "tensorflow/core/platform/blocking_counter.h" +#include "tensorflow/core/platform/casts.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/notification.h" +#include "tensorflow/core/platform/random.h" +#include "tensorflow/core/platform/refcount.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/profiler/lib/traceme.h" +#include "tensorflow/core/protobuf/device_filters.pb.h" +#include "tensorflow/core/protobuf/error_codes.pb.h" +#include "tensorflow/core/public/version.h" +#include "tensorflow/core/util/device_name_utils.h" +#include "tensorflow/core/util/env_var.h" + +#include "npu_managed_buffer.h" + +namespace npu { +template +static NpuManagedBuffer *Unwrap(const tensorflow::Tensor *tensor) { + return reinterpret_cast(const_cast(tensor->tensor_data().data())); +} + +static tensorflow::EagerContext *UnwrapCtx(TFE_Context *context) { + return tensorflow::ContextFromInterface(tensorflow::unwrap(context)); +} + +static tensorflow::TensorHandle *UnwrapHandle(TFE_TensorHandle *tensor_handle) { + return tensorflow::TensorHandleFromInterface(tensorflow::unwrap(tensor_handle)); +} + +static tensorflow::EagerOperation *UnwrapOp(TFE_Op *op) { + return reinterpret_cast(tensorflow::unwrap(op)); +} + +static tensorflow::Status UnwrapTensor(TFE_TensorHandle *tensor_handle, const tensorflow::Tensor **tensor) { + 
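+  // Resolve the eager C-API handle to the tensorflow::Tensor it wraps; returns a
+  // non-OK status when the handle has no locally materialized tensor.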
return UnwrapHandle(tensor_handle)->Tensor(tensor); +} + +} // namespace npu + +#endif //TENSORFLOW_NPU_UNWRAP_H diff --git a/tf_adapter_2.x/npu_device/core/npu_utils.cpp b/tf_adapter_2.x/npu_device/core/npu_utils.cpp new file mode 100644 index 0000000000000000000000000000000000000000..92ecc3150fd3eb0f9fd4783ca53825d11a260ec4 --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_utils.cpp @@ -0,0 +1,6 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#include "npu_utils.h" diff --git a/tf_adapter_2.x/npu_device/core/npu_utils.h b/tf_adapter_2.x/npu_device/core/npu_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..0a44d77577566c8c8f0dcfd157b1c83bb24dfbc1 --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_utils.h @@ -0,0 +1,222 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#ifndef TENSORFLOW_NPU_UTILS_H +#define TENSORFLOW_NPU_UTILS_H + +#include "tensorflow/c/eager/c_api.h" + +#include +#include +#include +#include +#include + +#include "tensorflow/c/eager/abstract_tensor_handle.h" + +// clang-format off +#include "tensorflow/core/platform/platform.h" +// clang-format on + +#include "absl/algorithm/container.h" +#include "absl/memory/memory.h" +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/eager/tfe_context_internal.h" +#include "tensorflow/c/eager/tfe_op_internal.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" +#include "tensorflow/c/tf_tensor_internal.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/eager/context.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/device_filters.pb.h" +#include "tensorflow/core/protobuf/error_codes.pb.h" +#include "tensorflow/core/util/device_name_utils.h" +#include "tensorflow/core/common_runtime/copy_tensor.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/device_set.h" +#include "tensorflow/core/common_runtime/eager/attr_builder.h" +#include "tensorflow/core/common_runtime/eager/execute.h" +#include "tensorflow/core/common_runtime/eager/shape_inference.h" +#include "tensorflow/core/common_runtime/eager/tensor_handle.h" +#include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/common_runtime/rendezvous_mgr.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/rendezvous.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/lib/gtl/map_util.h" +#include "tensorflow/core/platform/blocking_counter.h" +#include "tensorflow/core/platform/casts.h" +#include "tensorflow/core/platform/env.h" +#include 
"tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/notification.h" +#include "tensorflow/core/platform/random.h" +#include "tensorflow/core/platform/refcount.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/profiler/lib/traceme.h" +#include "tensorflow/core/public/version.h" +#include "tensorflow/core/util/device_name_utils.h" +#include "tensorflow/core/util/env_var.h" +#include "tensorflow/core/graph/algorithm.h" +#include "tensorflow/core/framework/graph_to_functiondef.h" + +#include "npu_env.h" +#include "npu_micros.h" +#include "npu_unwrap.h" + +#include "acl/acl_base.h" +#include "graph/types.h" + +static bool IsNpuTensorHandle(tensorflow::TensorHandle *handle) { + tensorflow::Status status; + tensorflow::DeviceNameUtils::ParsedName parsed_name; + return tensorflow::DeviceNameUtils::ParseFullName(handle->DeviceName(&status), &parsed_name) + && parsed_name.type == "NPU"; +} + +static bool IsCpuTensorHandle(tensorflow::TensorHandle *handle) { + tensorflow::Status status; + tensorflow::DeviceNameUtils::ParsedName parsed_name; + return tensorflow::DeviceNameUtils::ParseFullName(handle->DeviceName(&status), &parsed_name) + && parsed_name.type == "CPU"; +} + +static tensorflow::Status MapGeType2Tf(ge::DataType ge_type, tensorflow::DataType *tf_type) { + static std::map kGeType2Tf = { + {ge::DT_FLOAT, tensorflow::DT_FLOAT}, {ge::DT_DOUBLE, tensorflow::DT_DOUBLE}, + {ge::DT_INT32, tensorflow::DT_INT32}, {ge::DT_UINT8, tensorflow::DT_UINT8}, + {ge::DT_INT16, tensorflow::DT_INT16}, {ge::DT_INT8, tensorflow::DT_INT8}, + {ge::DT_STRING, tensorflow::DT_STRING}, {ge::DT_COMPLEX64, tensorflow::DT_COMPLEX64}, + {ge::DT_INT64, tensorflow::DT_INT64}, {ge::DT_BOOL, tensorflow::DT_BOOL}, + {ge::DT_QINT8, tensorflow::DT_QINT8}, {ge::DT_QUINT8, tensorflow::DT_QUINT8}, + {ge::DT_QINT32, tensorflow::DT_QINT32}, {ge::DT_QINT16, tensorflow::DT_QINT16}, + {ge::DT_QUINT16, tensorflow::DT_QUINT16}, {ge::DT_UINT16, tensorflow::DT_UINT16}, + {ge::DT_COMPLEX128, tensorflow::DT_COMPLEX128}, {ge::DT_RESOURCE, tensorflow::DT_RESOURCE}, + {ge::DT_VARIANT, tensorflow::DT_VARIANT}, {ge::DT_UINT32, tensorflow::DT_UINT32}, + {ge::DT_UINT64, tensorflow::DT_UINT64}, {ge::DT_STRING_REF, tensorflow::DT_STRING_REF}, + {ge::DT_FLOAT16, tensorflow::DT_HALF}, + }; + if (kGeType2Tf.find(ge_type) == kGeType2Tf.end()) { + return tensorflow::errors::InvalidArgument("Unsupport ge data type enmu value ", ge_type, " by tf"); + } + *tf_type = kGeType2Tf[ge_type]; + return tensorflow::Status::OK(); +} + +static tensorflow::Status MapTfType2Ge(tensorflow::DataType tf_type, ge::DataType *ge_type) { + static std::map kTfType2Ge = { + {tensorflow::DT_FLOAT, ge::DT_FLOAT}, {tensorflow::DT_DOUBLE, ge::DT_DOUBLE}, + {tensorflow::DT_INT32, ge::DT_INT32}, {tensorflow::DT_UINT8, ge::DT_UINT8}, + {tensorflow::DT_INT16, ge::DT_INT16}, {tensorflow::DT_INT8, ge::DT_INT8}, + {tensorflow::DT_STRING, ge::DT_STRING}, {tensorflow::DT_COMPLEX64, ge::DT_COMPLEX64}, + {tensorflow::DT_INT64, ge::DT_INT64}, {tensorflow::DT_BOOL, ge::DT_BOOL}, + {tensorflow::DT_QINT8, ge::DT_QINT8}, {tensorflow::DT_QUINT8, ge::DT_QUINT8}, + {tensorflow::DT_QINT32, ge::DT_QINT32}, {tensorflow::DT_QINT16, ge::DT_QINT16}, + {tensorflow::DT_QUINT16, ge::DT_QUINT16}, {tensorflow::DT_UINT16, ge::DT_UINT16}, + {tensorflow::DT_COMPLEX128, ge::DT_COMPLEX128}, {tensorflow::DT_RESOURCE, ge::DT_RESOURCE}, + {tensorflow::DT_VARIANT, ge::DT_VARIANT}, {tensorflow::DT_UINT32, 
ge::DT_UINT32}, + {tensorflow::DT_UINT64, ge::DT_UINT64}, {tensorflow::DT_STRING_REF, ge::DT_STRING_REF}, + {tensorflow::DT_HALF, ge::DT_FLOAT16}, + }; + if (kTfType2Ge.find(tf_type) == kTfType2Ge.end()) { + return tensorflow::errors::InvalidArgument("Unsupport tf data type enmu value ", ge_type, " by ge"); + } + *ge_type = kTfType2Ge[tf_type]; + return tensorflow::Status::OK(); +} + +static tensorflow::Status MapGeType2Acl(ge::DataType ge_type, aclDataType *acl_type) { + static std::map kGeType2Acl = { + {ge::DT_FLOAT, ACL_FLOAT}, {ge::DT_DOUBLE, ACL_DOUBLE}, {ge::DT_INT32, ACL_INT32}, + {ge::DT_UINT8, ACL_UINT8}, {ge::DT_INT16, ACL_INT16}, {ge::DT_INT8, ACL_INT8}, + {ge::DT_STRING, ACL_STRING}, {ge::DT_INT64, ACL_INT64}, {ge::DT_BOOL, ACL_BOOL}, + {ge::DT_UINT16, ACL_UINT16}, {ge::DT_UINT32, ACL_UINT32}, {ge::DT_UINT64, ACL_UINT64}, + {ge::DT_FLOAT16, ACL_FLOAT16}, + }; + if (kGeType2Acl.find(ge_type) == kGeType2Acl.end()) { + return tensorflow::errors::InvalidArgument("Unsupport ge data type enmu value ", ge_type, " by acl"); + } + *acl_type = kGeType2Acl[ge_type]; + return tensorflow::Status::OK(); +} + +static tensorflow::Status MapGeFormat2Acl(ge::Format ge_format, aclFormat *acl_format) { + static std::map kGeFormat2Acl = {{ge::Format::FORMAT_NCHW, ACL_FORMAT_NCHW}, + {ge::Format::FORMAT_NHWC, ACL_FORMAT_NHWC}, + {ge::Format::FORMAT_ND, ACL_FORMAT_ND}, + {ge::Format::FORMAT_NC1HWC0, ACL_FORMAT_NC1HWC0}, + {ge::Format::FORMAT_FRACTAL_Z, ACL_FORMAT_FRACTAL_Z}, + {ge::Format::FORMAT_NC1HWC0_C04, ACL_FORMAT_NC1HWC0_C04}, + {ge::Format::FORMAT_NDHWC, ACL_FORMAT_NDHWC}, + {ge::Format::FORMAT_FRACTAL_NZ, ACL_FORMAT_FRACTAL_NZ}, + {ge::Format::FORMAT_NCDHW, ACL_FORMAT_NCDHW}, + {ge::Format::FORMAT_NDC1HWC0, ACL_FORMAT_NDC1HWC0}, + {ge::Format::FORMAT_FRACTAL_Z_3D, ACL_FRACTAL_Z_3D}}; + if (kGeFormat2Acl.find(ge_format) == kGeFormat2Acl.end()) { + return tensorflow::errors::InvalidArgument("Unsupport ge format enmu value ", ge_format, " by acl"); + } + *acl_format = kGeFormat2Acl[ge_format]; + return tensorflow::Status::OK(); +} + +// specify the template in utils.cpp if need +template +std::string ToString(T v) { + return std::to_string(v); +} + +template +std::string VecToString(std::vector vec) { + if (vec.empty()) { return "[]"; } + std::string s = "["; + for (size_t i = 0; i < vec.size(); ++i) { + s += ToString(vec[i]); + if (i != vec.size() - 1) { s += ","; } + } + return s + "]"; +} + +// TODO:在GE处理中,变量名称作为唯一标识,对于shared_name是"_"开头的变量,由于tensorflow禁止变量名以"_"开头,所以无法直接将shared_name +// 作为Node的name,对于GE,则没有这个限制,因而,这个函数需要能够屏蔽这种差异。 +static std::string WrapResourceName(const std::string &name) { + if (kCustomKernelEnabled) { return name; } + return "cpu_" + name; +} + +static tensorflow::Status LoadGraphDefProto(const std::string &file, tensorflow::GraphDef *def) { + tensorflow::Status status = tensorflow::Env::Default()->FileExists(file); + if (!status.ok()) { return status; } + if (tensorflow::Env::Default()->IsDirectory(file).ok()) { + return tensorflow::errors::InvalidArgument(file, " is directory"); + } + if (tensorflow::str_util::EndsWith(file, ".pb")) { + ReadBinaryProto(tensorflow::Env::Default(), file, def); + } else if (tensorflow::str_util::EndsWith(file, ".pbtxt")) { + ReadTextProto(tensorflow::Env::Default(), file, def); + } else { + return tensorflow::errors::InvalidArgument(file, " must ends with .pb or .pbtxt"); + } + return tensorflow::Status::OK(); +} + +struct ResourceCompare { + bool operator()(const tensorflow::ResourceHandle &left, const tensorflow::ResourceHandle &right) 
const { + return left.name() < right.name() || left.container() < right.container() || left.device() < right.device(); + } +}; + +#endif //TENSORFLOW_NPU_UTILS_H diff --git a/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5106b4bcc31f4b2d40717812678c81b91c794b5d --- /dev/null +++ b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp @@ -0,0 +1,111 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#include + +#include "Python.h" +#include "pybind11/chrono.h" +#include "pybind11/complex.h" +#include "pybind11/functional.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_experimental.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/c_api_internal.h" +#include "tensorflow/c/eager/dlpack.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/c/tf_status_helper.h" +#include "tensorflow/compiler/jit/flags.h" +#include "tensorflow/python/eager/pywrap_tensor_conversion.h" +#include "tensorflow/python/eager/pywrap_tfe.h" +#include "tensorflow/python/lib/core/py_exception_registry.h" +#include "tensorflow/python/lib/core/pybind11_lib.h" +#include "tensorflow/python/lib/core/pybind11_status.h" +#include "tensorflow/python/lib/core/safe_ptr.h" +#include "tensorflow/python/util/util.h" + +#include "framework/omg/omg_inner_types.h" +#include "framework/omg/parser/parser_api.h" +#include "ge/ge_api.h" + +#include "npu_device_register.h" + +namespace py = pybind11; + +namespace { +TFE_Context *InputTFE_Context(const py::handle &ctx) { + return static_cast(PyCapsule_GetPointer(ctx.ptr(), nullptr)); +} +std::atomic_bool graph_engine_started{false}; +const std::string kTrain = "1"; +const std::string kOpen = "1"; +} // namespace + +PYBIND11_MODULE(_npu_device_backends, m) { + m.def("Open", + [](const py::handle &context, const char *device_name, int device_index, + std::map global_options, + std::map session_options) -> std::string { + pybind11::gil_scoped_release release; + if (!graph_engine_started.exchange(true)) { + // 只允许在train模式下工作 + global_options[ge::OPTION_GRAPH_RUN_MODE] = kTrain; + global_options[ge::OPTION_EXEC_DEVICE_ID] = std::to_string(device_index); + if (global_options.find(ge::PRECISION_MODE) == global_options.end()) { + global_options[ge::PRECISION_MODE] = "allow_mix_precision"; + } + LOG(INFO) << "Start graph engine with options:"; + for (const auto &option : global_options) { LOG(INFO) << " " << option.first << ":" << option.second; } + auto ge_status = ge::GEInitialize(global_options); + if (ge_status != ge::SUCCESS) { + return "Failed start graph engine:" + ge::StatusFactory::Instance()->GetErrDesc(ge_status); + } + LOG(INFO) << "Start graph engine succeed"; + ge_status = ge::ParserInitialize(global_options); + if (ge_status != ge::SUCCESS) { + return "Failed start tensorflow model parser:" + ge::StatusFactory::Instance()->GetErrDesc(ge_status); + } + LOG(INFO) << "Start tensorflow model parser succeed"; + } + + std::string full_name = tensorflow::strings::StrCat(device_name, ":", device_index); + tensorflow::DeviceNameUtils::ParsedName parsed_name; + if (!tensorflow::DeviceNameUtils::ParseFullName(full_name, &parsed_name)) { + return "Invalid npu device name " + full_name; + } + + LOG(INFO) << 
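+          // GE and the TF model parser are initialized at most once per process (guarded by
+          // graph_engine_started); each Open call below only registers one more NPU device
+          // instance with the current eager context.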
"Create device instance " << full_name << " with options:"; + for (const auto &option : session_options) { LOG(INFO) << " " << option.first << ":" << option.second; } + auto status = CreateDevice(InputTFE_Context(context), full_name.c_str(), device_index, session_options); + pybind11::gil_scoped_acquire acquire; + return status; + }); + + m.def("Close", []() { + pybind11::gil_scoped_release release; + ReleaseDeviceResource(); + if (graph_engine_started.exchange(false)) { + auto ge_status = ge::ParserFinalize(); + if (ge_status != ge::SUCCESS) { + LOG(ERROR) << "Failed stop tensorflow model parser:" << ge::StatusFactory::Instance()->GetErrDesc(ge_status); + } else { + LOG(INFO) << "Stop tensorflow model parser succeed"; + } + ge_status = ge::GEFinalize(); + if (ge_status != ge::SUCCESS) { + LOG(ERROR) << "Failed stop graph engine:" << ge::StatusFactory::Instance()->GetErrDesc(ge_status); + } else { + LOG(INFO) << "Stop graph engine succeed"; + } + } + pybind11::gil_scoped_acquire acquire; + }); + + m.def("StupidRepeat", [](const char *device_name, int times) { + for (int i = 0; i < times; i++) { LOG(INFO) << device_name; } + }); +}; diff --git a/tf_adapter_2.x/npu_device/kernels/anonymous_iterator.cpp b/tf_adapter_2.x/npu_device/kernels/anonymous_iterator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..26e86ca4baf9b4a081dbb5c44be33128cd061c87 --- /dev/null +++ b/tf_adapter_2.x/npu_device/kernels/anonymous_iterator.cpp @@ -0,0 +1,50 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#include +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/platform/logging.h" + +#include "absl/algorithm/container.h" +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/tfe_context_internal.h" +#include "tensorflow/c/eager/tfe_op_internal.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" + +#include "npu_custom_kernel.h" + +static auto kernel = [](TFE_Context *context, NpuDevice *dev, const char *op_name, const TFE_OpAttrs *attributes, + int num_inputs, TFE_TensorHandle **inputs, int num_outputs, TFE_TensorHandle **outputs, + TF_Status *status) { + for (int i = 0; i < num_outputs; ++i) { + TFE_TensorHandle *retval = outputs[i]; + if (npu::UnwrapHandle(retval)->DataType() == tensorflow::DT_RESOURCE) { + const tensorflow::Tensor *tensor; + NPU_CTX_REQUIRES_OK(status, npu::UnwrapTensor(retval, &tensor)); + std::vector vec_shapes; + TensorPartialShapes shapes; + TensorDataTypes types; + tensorflow::NodeDef ndef; + tensorflow::unwrap(attributes)->FillAttrValueMap(ndef.mutable_attr()); + NPU_CTX_REQUIRES_OK(status, tensorflow::GetNodeAttr(ndef, "output_shapes", &vec_shapes)); + NPU_CTX_REQUIRES_OK(status, tensorflow::GetNodeAttr(ndef, "output_types", &types)); + for (const auto &shape : vec_shapes) { shapes.push_back(shape); } + auto resource = tensor->scalar()(); + DLOG() << "Record mirrored host resource " << resource.DebugString(); + dev->RecordIteratorMirror(resource, shapes, types); + } + } +}; + +NPU_REGISTER_FALLBACK_HOOK("AnonymousIteratorV2", kernel); +NPU_REGISTER_FALLBACK_HOOK("AnonymousIterator", kernel); +NPU_REGISTER_FALLBACK_HOOK("AnonymousMultiDeviceIterator", 
kernel); \ No newline at end of file diff --git a/tf_adapter_2.x/npu_device/kernels/iterator_h2d.cpp b/tf_adapter_2.x/npu_device/kernels/iterator_h2d.cpp new file mode 100644 index 0000000000000000000000000000000000000000..87a5b875c9e477d37ab54a7ac6600cbae4a16fca --- /dev/null +++ b/tf_adapter_2.x/npu_device/kernels/iterator_h2d.cpp @@ -0,0 +1,70 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#include "absl/algorithm/container.h" +#include "absl/memory/memory.h" +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/c_api_internal.h" +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/kernels/data/iterator_ops.h" +#include "tensorflow/core/util/env_var.h" + +#include "npu_hdc.h" + +using namespace tensorflow; + +class IteratorH2D : public OpKernel { + public: + explicit IteratorH2D(OpKernelConstruction *ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("channel_name", &channel_name_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("device_ids", &device_ids_)); + } + + void Compute(OpKernelContext *ctx) override { + if (!initialized_.exchange(true)) { + std::stringstream ss; + for (auto device_id : device_ids_) { ss << device_id << " "; } + channels_.resize(device_ids_.size()); + for (size_t i = 0; i < device_ids_.size(); i++) { + OP_REQUIRES_OK(ctx, HdcChannel::Create(device_ids_[i], channel_name_, &channels_[i])); + } + LOG(INFO) << "Hdc channel for iterator resource " << channel_name_ << " to device [" + << ss.str().substr(0, ss.str().size() - 1) << "] created"; + } + + data::IteratorResource *iterator; + OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator)); + core::ScopedUnref unref_iterator(iterator); + std::vector components; + bool end_of_sequence = false; + + Status status = iterator->GetNext(ctx, &components, &end_of_sequence); + + if (!status.ok()) { + for (auto channel : channels_) { OP_REQUIRES_OK(ctx, channel->NotifyAbnormal()); } + ctx->SetStatus(status); + return; + } else if (end_of_sequence) { + for (auto channel : channels_) { OP_REQUIRES_OK(ctx, channel->NotifyFinish()); } + ctx->SetStatus(errors::OutOfRange("Iterator resource ", channel_name_, " reach end of sequence")); + return; + } + + for (auto channel : channels_) { OP_REQUIRES_OK(ctx, channel->SendTensors(components)); } + } + + private: + std::string channel_name_; + std::vector device_ids_; + std::vector> channels_; + std::atomic_bool initialized_{false}; +}; + +REGISTER_KERNEL_BUILDER(Name("IteratorH2D").Device(DEVICE_CPU).Priority(3), IteratorH2D); diff --git a/tf_adapter_2.x/npu_device/kernels/make_iterator.cpp b/tf_adapter_2.x/npu_device/kernels/make_iterator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8eab3de312a1d69a002d81bdd6ee853b8a276eed --- /dev/null +++ b/tf_adapter_2.x/npu_device/kernels/make_iterator.cpp @@ -0,0 +1,119 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. 
+* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#include +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/platform/logging.h" + +#include "absl/algorithm/container.h" +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/tfe_context_internal.h" +#include "tensorflow/c/eager/tfe_op_internal.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" + +#include "npu_custom_kernel.h" +#include "npu_utils.h" + +class MakeIteratorGraphBuilder { + public: + static tensorflow::GraphDef GetGraph(std::string container_name, std::string shared_name, TensorPartialShapes shapes, + TensorDataTypes types, TF_Status *status) { + tensorflow::GraphDef gdef; + + tensorflow::Graph graph(tensorflow::OpRegistry::Global()); + tensorflow::Node *device_queue; + tensorflow::Node *make_iterator; + tensorflow::Node *iterator_v2; + NPU_CTX_REQUIRES_OK_RETURN(status, + tensorflow::NodeBuilder("DeviceQueue_" + shared_name, "DeviceQueueDataset") + .Attr("channel_name", shared_name) + .Attr("output_types", types) + .Attr("output_shapes", shapes) + .Attr("_iterator_name", shared_name) + .Finalize(&graph, &device_queue), + gdef); + NPU_CTX_REQUIRES_OK_RETURN(status, + tensorflow::NodeBuilder(WrapResourceName(shared_name), "IteratorV2") + .Attr("container", container_name) + .Attr("shared_name", shared_name) + .Attr("output_types", types) + .Attr("output_shapes", shapes) + .Finalize(&graph, &iterator_v2), + gdef); + NPU_CTX_REQUIRES_OK_RETURN(status, + tensorflow::NodeBuilder("InitMakeIterator_" + shared_name, "MakeIterator") + .Attr("_kernel", "dp") + .Attr("_iterator_name", shared_name) + .Input(device_queue, 0) + .Input(iterator_v2, 0) + .Finalize(&graph, &make_iterator), + gdef); + + // TODO:Tensorflow model parser bug,如果名字不是dpop开头的,则会被remove掉 + std::string func_name = "dpop_init_func_" + shared_name; + tensorflow::FunctionDefLibrary fdef_lib; + tensorflow::FunctionDef *fdef = fdef_lib.add_function(); + tensorflow::GraphToFunctionDef(graph, func_name, fdef); + + tensorflow::Graph dpop_graph(tensorflow::OpRegistry::Global()); + + tensorflow::AttrValue function_attr; + function_attr.mutable_func()->set_name(func_name); + + tensorflow::Node *dpop_node; + NPU_CTX_REQUIRES_OK_RETURN(status, + tensorflow::NodeBuilder(func_name, "DPOP") + .Input(std::vector{}) + .Attr("Tin", tensorflow::DataTypeVector{}) + .Attr("Tout", tensorflow::DataTypeVector{}) + .Attr("function", function_attr) + .Finalize(&dpop_graph, &dpop_node), + gdef); + AssembleOpDef(dpop_node); + dpop_node->AddAttr("func_def", fdef_lib.SerializeAsString()); + tensorflow::FixupSourceAndSinkEdges(&dpop_graph); + dpop_graph.ToGraphDef(&gdef); + return gdef; + } +}; + +static auto kernel = [](TFE_Context *context, NpuDevice *dev, const char *op_name, const TFE_OpAttrs *attributes, + int num_inputs, TFE_TensorHandle **inputs, int num_outputs, TFE_TensorHandle **outputs, + TF_Status *status) { + for (int j = 0; j < num_inputs; ++j) { + TFE_TensorHandle *input = inputs[j]; + if (npu::UnwrapHandle(input)->DataType() == tensorflow::DT_RESOURCE) { + const tensorflow::Tensor *tensor; + NPU_CTX_REQUIRES_OK(status, npu::UnwrapTensor(input, &tensor)); + auto handle = tensor->scalar()(); + TensorPartialShapes shapes; + TensorDataTypes types; + 
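+      // Shapes/types for this resource were recorded by the AnonymousIterator* fallback
+      // hooks when it was created on the host; they are looked up here so the generated
+      // init graph, roughly
+      //   DeviceQueue_<name> (DeviceQueueDataset) --> MakeIterator <-- IteratorV2(shared_name=<name>)
+      // wrapped into a "dpop_init_func_<name>" DPOP node, is built with matching signatures.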
NPU_CTX_REQUIRES_OK(status, dev->GetMirroredIteratorShapesAndTypes(handle, shapes, types)); + auto dp_init_graph = MakeIteratorGraphBuilder::GetGraph(handle.container(), handle.name(), shapes, types, status); + if (TF_GetCode(status) != TF_OK) return; + if (kDumpExecutionDetail && kDumpGraph) { + std::string file_name = "dp_init_" + handle.name() + ".pbtxt"; + LOG(INFO) << "NPU Dump mirrored resource init graph to: " << file_name; + WriteTextProto(tensorflow::Env::Default(), file_name, dp_init_graph); + } + dev->RunGeGraphPin2CpuAnonymous(context, "dp_init_" + handle.name(), dp_init_graph, num_inputs, inputs, 0, + nullptr, status); + if (TF_GetCode(status) != TF_OK) return; + // TODO:针对推荐网络,Provider需要支持1对N的传输,默认只向资源所处的Device发送 + dev->CreateIteratorProvider(context, tensor, {dev->device_id}, status); + if (TF_GetCode(status) != TF_OK) return; + } + } +}; + +NPU_REGISTER_FALLBACK_HOOK("MakeIterator", kernel); +NPU_REGISTER_FALLBACK_HOOK("MultiDeviceIteratorInit", kernel); \ No newline at end of file diff --git a/tf_adapter_2.x/npu_device/kernels/read_variable_op.cpp b/tf_adapter_2.x/npu_device/kernels/read_variable_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6329f8a311e5e9b2290061e2825985b56c52cbb2 --- /dev/null +++ b/tf_adapter_2.x/npu_device/kernels/read_variable_op.cpp @@ -0,0 +1,104 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#include +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/platform/logging.h" + +#include "absl/algorithm/container.h" +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/tfe_context_internal.h" +#include "tensorflow/c/eager/tfe_op_internal.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" + +#include "npu_custom_kernel.h" +#include "npu_utils.h" + +class ReadVariableGraphBuilder { + public: + static tensorflow::GraphDef GetGraph(const tensorflow::ResourceHandle resource, TF_Status *status) { + const std::string &container_name = resource.container(); + const std::string &shared_name = resource.name(); + + TensorDataTypes handle_dtyes; + TensorPartialShapes handle_shapes; + const auto &dtypes_and_shapes = resource.dtypes_and_shapes(); + + for (auto &dtype_and_shape : dtypes_and_shapes) { + handle_dtyes.push_back(dtype_and_shape.dtype); + handle_shapes.push_back(dtype_and_shape.shape); + } + + tensorflow::GraphDef gdef; + + tensorflow::Graph graph(tensorflow::OpRegistry::Global()); + tensorflow::Node *variable; + tensorflow::Node *read_variable; + tensorflow::Node *retval; + + NPU_CTX_REQUIRES_OK_RETURN(status, + tensorflow::NodeBuilder(WrapResourceName(shared_name), "VarHandleOp") + .Attr("container", container_name) + .Attr("shared_name", shared_name) + .Attr("dtype", handle_dtyes.front()) + .Attr("shape", handle_shapes.front()) + .Finalize(&graph, &variable), + gdef); + + NPU_CTX_REQUIRES_OK_RETURN(status, + tensorflow::NodeBuilder("Read_" + shared_name, "ReadVariableOp") + .Input(variable, 0) + .Attr("dtype", handle_dtyes.front()) + .Finalize(&graph, &read_variable), + gdef); + + NPU_CTX_REQUIRES_OK_RETURN(status, + tensorflow::NodeBuilder("Read_" + shared_name + "_Retval", "_Retval") + .Input(read_variable, 0) + 
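+                               // index 0 marks the read result as the single graph output that
+                               // RunGeGraphPin2CpuAnonymous maps back to outputs[0].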
.Attr("index", 0) + .Finalize(&graph, &retval), + gdef); + + AssembleOpDef(variable); + AssembleOpDef(read_variable); + + AssembleOutputDesc(TensorShapes({kScalarShape}), {tensorflow::DT_RESOURCE}, variable); + AssembleInputDesc(TensorShapes({kScalarShape}), {tensorflow::DT_RESOURCE}, read_variable); + AssembleOutputDesc(handle_shapes, handle_dtyes, read_variable); + + graph.ToGraphDef(&gdef); + return gdef; + } +}; + +static auto kernel = [](TFE_Context *context, NpuDevice *dev, const npu::OpSpec *spec, + const TensorShapes &output_shapes, const tensorflow::NodeDef &parser_ndef, int num_inputs, + TFE_TensorHandle **inputs, int num_outputs, TFE_TensorHandle **outputs, TF_Status *status) { + const tensorflow::Tensor *handle = nullptr; + NPU_CTX_REQUIRES_OK(status, npu::UnwrapTensor(inputs[0], &handle)); + auto resource = handle->scalar()(); + NPU_CTX_REQUIRES(status, resource.dtypes_and_shapes().size() == 1, + tensorflow::errors::Internal(resource.DebugString(), " type and shape size invalid ", + resource.dtypes_and_shapes().size(), " expect 1")); + auto var_read_graph = ReadVariableGraphBuilder::GetGraph(resource, status); + if (TF_GetCode(status) != TF_OK) { return; } + std::string graph_name = "ReadVariableOp_" + resource.name(); + if (kDumpExecutionDetail && kDumpGraph) { + std::string file_name = graph_name + ".pbtxt"; + WriteTextProto(tensorflow::Env::Default(), file_name, var_read_graph); + LOG(INFO) << "NPU Dump variable resource init graph to: " << file_name; + } + + dev->RunGeGraphPin2CpuAnonymous(context, graph_name, var_read_graph, 0, nullptr, num_outputs, outputs, status); +}; + +NPU_REGISTER_CUSTOM_KERNEL("ReadVariableOp", kernel); diff --git a/tf_adapter_2.x/npu_device/kernels/resource_variable_op.cpp b/tf_adapter_2.x/npu_device/kernels/resource_variable_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..66d0afe1c507e7069b9d81ddb701e060fff0293e --- /dev/null +++ b/tf_adapter_2.x/npu_device/kernels/resource_variable_op.cpp @@ -0,0 +1,138 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. 
+* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#include +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/platform/logging.h" + +#include "absl/algorithm/container.h" +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/tfe_context_internal.h" +#include "tensorflow/c/eager/tfe_op_internal.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" + +#include "npu_custom_kernel.h" +#include "npu_utils.h" + +class AssignVariableGraphBuilder { + public: + static tensorflow::GraphDef GetGraph(const std::string &op_name, const std::string &container_name, + const std::string &shared_name, const tensorflow::Tensor &tensor, + TF_Status *status) { + tensorflow::GraphDef gdef; + + tensorflow::Graph graph(tensorflow::OpRegistry::Global()); + tensorflow::Node *variable; + tensorflow::Node *value; + tensorflow::Node *assign_variable; + + NPU_CTX_REQUIRES_OK_RETURN(status, + tensorflow::NodeBuilder(WrapResourceName(shared_name), "VarHandleOp") + .Attr("container", container_name) + .Attr("shared_name", shared_name) + .Attr("dtype", tensor.dtype()) + .Attr("shape", tensor.shape()) + .Finalize(&graph, &variable), + gdef); + NPU_CTX_REQUIRES_OK_RETURN(status, + tensorflow::NodeBuilder(op_name + "_Value_" + shared_name, "Const") + .Attr("value", tensor) + .Attr("dtype", tensor.dtype()) + .Finalize(&graph, &value), + gdef); + NPU_CTX_REQUIRES_OK_RETURN(status, + tensorflow::NodeBuilder(op_name + "_" + shared_name, op_name) + .Input(variable, 0) + .Input(value, 0) + .Attr("dtype", tensor.dtype()) + .Finalize(&graph, &assign_variable), + gdef); + + AssembleOpDef(variable); + AssembleOpDef(value); + AssembleOpDef(assign_variable); + + AssembleOutputDesc(TensorShapes({kScalarShape}), {tensorflow::DT_RESOURCE}, variable); + AssembleOutputDesc(TensorShapes({tensor.shape()}), {tensor.dtype()}, value); + AssembleInputDesc(TensorShapes({kScalarShape, tensor.shape()}), {tensorflow::DT_RESOURCE, tensor.dtype()}, + assign_variable); + + graph.ToGraphDef(&gdef); + return gdef; + } +}; + +namespace { + +void VariableOpBaseKernel(const std::string &op_name, TFE_Context *context, NpuDevice *dev, const npu::OpSpec *spec, + const TensorShapes &output_shapes, const tensorflow::NodeDef &parser_ndef, int num_inputs, + TFE_TensorHandle **inputs, int num_outputs, TFE_TensorHandle **outputs, TF_Status *status) { + const tensorflow::Tensor *handle = nullptr; + const tensorflow::Tensor *value = nullptr; + + std::vector copied_tensor_handles; + TFE_TensorHandle *value_handle = inputs[1]; + if (IsNpuTensorHandle(npu::UnwrapHandle(inputs[1]))) { + value_handle = dev->CopyTensorD2H(context, inputs[1], status); + if (TF_GetCode(status) != TF_OK) return; + copied_tensor_handles.emplace_back(value_handle); + } + + NPU_CTX_REQUIRES_OK(status, npu::UnwrapTensor(inputs[0], &handle)); + auto resource = handle->scalar()(); + NPU_CTX_REQUIRES_OK(status, npu::UnwrapTensor(value_handle, &value)); + DLOG() << "Start run " << op_name << " for resource " << resource.DebugString() << " with value " + << value->DebugString(); + auto var_init_graph = + AssignVariableGraphBuilder::GetGraph(op_name, resource.container(), resource.name(), *value, status); + if (TF_GetCode(status) != TF_OK) { return; } + std::string graph_name = 
op_name + "_" + resource.name(); + if (kDumpExecutionDetail && kDumpGraph) { + std::string file_name = graph_name + ".pbtxt"; + WriteTextProto(tensorflow::Env::Default(), file_name, var_init_graph); + LOG(INFO) << "NPU Dump variable resource init graph to: " << file_name; + } + + for (auto copied_tensor_handle : copied_tensor_handles) { TFE_DeleteTensorHandle(copied_tensor_handle); } + dev->RunGeGraphPin2CpuAnonymous(context, graph_name, var_init_graph, num_inputs, inputs, num_outputs, outputs, + status); +} + +} // namespace + +static auto kernel_assign = [](TFE_Context *context, NpuDevice *dev, const npu::OpSpec *spec, + const TensorShapes &output_shapes, const tensorflow::NodeDef &parser_ndef, + int num_inputs, TFE_TensorHandle **inputs, int num_outputs, TFE_TensorHandle **outputs, + TF_Status *status) { + VariableOpBaseKernel("AssignVariableOp", context, dev, spec, output_shapes, parser_ndef, num_inputs, inputs, + num_outputs, outputs, status); +}; + +static auto kernel_assign_add = [](TFE_Context *context, NpuDevice *dev, const npu::OpSpec *spec, + const TensorShapes &output_shapes, const tensorflow::NodeDef &parser_ndef, + int num_inputs, TFE_TensorHandle **inputs, int num_outputs, + TFE_TensorHandle **outputs, TF_Status *status) { + VariableOpBaseKernel("AssignAddVariableOp", context, dev, spec, output_shapes, parser_ndef, num_inputs, inputs, + num_outputs, outputs, status); +}; + +static auto kernel_assign_sub = [](TFE_Context *context, NpuDevice *dev, const npu::OpSpec *spec, + const TensorShapes &output_shapes, const tensorflow::NodeDef &parser_ndef, + int num_inputs, TFE_TensorHandle **inputs, int num_outputs, + TFE_TensorHandle **outputs, TF_Status *status) { + VariableOpBaseKernel("AssignSubVariableOp", context, dev, spec, output_shapes, parser_ndef, num_inputs, inputs, + num_outputs, outputs, status); +}; + +NPU_REGISTER_CUSTOM_KERNEL("AssignVariableOp", kernel_assign); +NPU_REGISTER_CUSTOM_KERNEL("AssignAddVariableOp", kernel_assign_add); +NPU_REGISTER_CUSTOM_KERNEL("AssignSubVariableOp", kernel_assign_sub); diff --git a/tf_adapter_2.x/npu_device/kernels/send_h2d.cpp b/tf_adapter_2.x/npu_device/kernels/send_h2d.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d960505d4665ced46735f7f3799a0bce346a4de0 --- /dev/null +++ b/tf_adapter_2.x/npu_device/kernels/send_h2d.cpp @@ -0,0 +1,54 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. 
+* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#include "absl/algorithm/container.h" +#include "absl/memory/memory.h" +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/c_api_internal.h" +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/util/env_var.h" + +#include "npu_hdc.h" + +using namespace tensorflow; + +class SendH2D : public OpKernel { + public: + explicit SendH2D(OpKernelConstruction *ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("channel_name", &channel_name_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("device_ids", &device_ids_)); + } + + void Compute(OpKernelContext *ctx) override { + if (!initialized_.exchange(true)) { + std::stringstream ss; + for (auto device_id : device_ids_) { ss << device_id << " "; } + channels_.resize(device_ids_.size()); + for (size_t i = 0; i < device_ids_.size(); i++) { + OP_REQUIRES_OK(ctx, HdcChannel::Create(device_ids_[i], channel_name_, &channels_[i])); + } + LOG(INFO) << "Hdc channel for iterator resource " << channel_name_ << " to device [" + << ss.str().substr(0, ss.str().size() - 1) << "] created"; + } + OpInputList inputs; + OP_REQUIRES_OK(ctx, ctx->input_list("inputs", &inputs)); + std::vector tensors; + for (int64 i = 0; i < inputs.size(); i++) { tensors.push_back(inputs[i]); } + for (auto channel : channels_) { OP_REQUIRES_OK(ctx, channel->SendTensors(tensors)); } + } + + private: + std::string channel_name_; + std::vector device_ids_; + std::vector> channels_; + std::atomic_bool initialized_{false}; +}; + +REGISTER_KERNEL_BUILDER(Name("SendH2D").Device(DEVICE_CPU).Priority(3), SendH2D); diff --git a/tf_adapter_2.x/npu_device/kernels/var_is_initialized_op.cpp b/tf_adapter_2.x/npu_device/kernels/var_is_initialized_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f1f290f12c133bcd1e6843af45750c5f6d06fe43 --- /dev/null +++ b/tf_adapter_2.x/npu_device/kernels/var_is_initialized_op.cpp @@ -0,0 +1,35 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. 
+* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#include +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/platform/logging.h" + +#include "absl/algorithm/container.h" +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/tfe_context_internal.h" +#include "tensorflow/c/eager/tfe_op_internal.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" + +#include "npu_custom_kernel.h" +#include "npu_utils.h" + +static auto kernel = [](TFE_Context *context, NpuDevice *dev, const npu::OpSpec *spec, + const TensorShapes &output_shapes, const tensorflow::NodeDef &parser_ndef, int num_inputs, + TFE_TensorHandle **inputs, int num_outputs, TFE_TensorHandle **outputs, TF_Status *status) { + // TODO:这里需要先判断下是否已经初始化 + tensorflow::Tensor tensor(tensorflow::DT_BOOL, {}); + tensor.scalar()() = true; + outputs[0] = tensorflow::wrap(tensorflow::TensorHandle::CreateLocalHandle(tensor)); +}; + +NPU_REGISTER_CUSTOM_KERNEL("VarIsInitializedOp", kernel); diff --git a/tf_adapter_2.x/npu_device/ops/custom_op.cpp b/tf_adapter_2.x/npu_device/ops/custom_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..30529ba99470a5abd43fe3187894d81181a9c3a6 --- /dev/null +++ b/tf_adapter_2.x/npu_device/ops/custom_op.cpp @@ -0,0 +1,63 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +* Description: Common depends and micro defines for and only for data preprocess module +*/ + +#include "absl/algorithm/container.h" +#include "absl/memory/memory.h" +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/c_api_internal.h" +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/util/env_var.h" + +using namespace tensorflow; + +class FakeOp : public AsyncOpKernel { + public: + explicit FakeOp(OpKernelConstruction *context) : AsyncOpKernel(context) {} + + void ComputeAsync(OpKernelContext *context, DoneCallback done) override { + OP_REQUIRES_OK_ASYNC( + context, errors::Internal(context->op_kernel().name(), " registered as fake op and should never run on cpu"), + done); + } +}; + +REGISTER_OP("DPOP") + .Input("inputs: Tin") + .Output("outputs: Tout") + .Attr("Tin: list(type) >= 0") + .Attr("Tout: list(type) >= 0") + .Attr("function: func") + .Attr("data_format: { 'NHWC', 'NCHW'} = 'NHWC'") + .SetIsStateful(); + +REGISTER_OP("DeviceQueueDataset") + .Output("handle: variant") + .Attr("channel_name: string") + .Attr("output_types: list(type) >= 1") + .Attr("output_shapes: list(shape) >= 1") + .SetIsStateful() + .SetShapeFn(tensorflow::shape_inference::ScalarShape); + +REGISTER_OP("SendH2D") + .Input("inputs: Tin") + .Attr("channel_name: string") + .Attr("device_ids: list(int)") + .Attr("Tin: list(type) = [DT_FLOAT, DT_HALF, DT_INT8, DT_INT32, DT_UINT8, DT_INT16, DT_UINT16, DT_UINT32, " + "DT_INT64, DT_UINT64, DT_DOUBLE, DT_BOOL, DT_STRING]") + .SetIsStateful(); + +REGISTER_OP("IteratorH2D") + .Input("input: resource") + .Attr("channel_name: string") + .Attr("device_ids: list(int)") + 
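+    // Stateful: the CPU kernel lazily opens HDC channels and pushes tensors to devices as a
+    // side effect, so the op must not be de-duplicated or constant-folded.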
.SetIsStateful(); + +REGISTER_KERNEL_BUILDER(Name("DPOP").Device(DEVICE_CPU).Priority(3), FakeOp); +REGISTER_KERNEL_BUILDER(Name("DeviceQueueDataset").Device(DEVICE_CPU).Priority(3), FakeOp); diff --git a/tf_adapter_2.x/npu_device/python/MANIFEST.in b/tf_adapter_2.x/npu_device/python/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..b6beccfcab7d89ccff6d6201017f412909323c51 --- /dev/null +++ b/tf_adapter_2.x/npu_device/python/MANIFEST.in @@ -0,0 +1,2 @@ +recursive-include * *.py +recursive-include * *.so \ No newline at end of file diff --git a/tf_adapter_2.x/npu_device/python/npu_device/__init__.py b/tf_adapter_2.x/npu_device/python/npu_device/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d4f4b7201094fd85e40e8a10f893786d67d80ab6 --- /dev/null +++ b/tf_adapter_2.x/npu_device/python/npu_device/__init__.py @@ -0,0 +1 @@ +from npu_device.npu_device_register import * \ No newline at end of file diff --git a/tf_adapter_2.x/npu_device/python/npu_device/npu_device_register.py b/tf_adapter_2.x/npu_device/python/npu_device/npu_device_register.py new file mode 100644 index 0000000000000000000000000000000000000000..8e27dc2c145404faa57f9fb2cc4297bf8919d5c7 --- /dev/null +++ b/tf_adapter_2.x/npu_device/python/npu_device/npu_device_register.py @@ -0,0 +1,77 @@ +# Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. +# Description: Common depends and micro defines for and only for data preprocess module + +import tensorflow as tf +from tensorflow.python.eager import context +from sys import version_info as _swig_python_version_info + +if _swig_python_version_info < (2, 7, 0): + raise RuntimeError("Python 2.7 or later required") + +NPU = "/job:localhost/replica:0/task:0/device:NPU" + +# Import the low-level C/C++ module +if __package__ or "." in __name__: + from . 
import _npu_device_backends +else: + import _npu_device_backends + + +def stupid_repeat(word, times): + return _npu_device_backends.StupidRepeat(word, times) + + +def open(ctx=None, device_index=0, global_options={}, session_options={}): + if ctx is None: + ctx = context.context() + ctx.ensure_initialized() + error_message = _npu_device_backends.Open(ctx._handle, NPU, device_index, global_options, session_options) + if len(error_message): + raise RuntimeError("Failed open npu device " + str(device_index) + ":" + error_message) + return NpuDeviceHandle(ctx, device_index) + + +def close(): + _npu_device_backends.Close() + + +import atexit + +atexit.register(close) +from tensorflow.python.util import tf_contextlib + + +class NpuDeviceHandle(object): + def __init__(self, ctx, device_index): + self._ctx = ctx + self._device_name = NPU + ":" + str(device_index) + + def name(self): + return self._device_name + + def scope(self): + @tf_contextlib.contextmanager + def _scope(): + with self._ctx.device(self._device_name): + yield + + return _scope() + + def as_default(self): + from tensorflow.python.framework import device as pydev + from tensorflow.python.framework import ops + + @tf_contextlib.contextmanager + def combined(): + try: + with context.device(self._device_name): + yield + except ImportError: # ImportError: sys.meta_path is None, Python is likely shutting down + yield + + def _f(*args, **kwargs): + return combined() + + ops.device = _f + self._ctx._set_device(self._device_name, pydev.DeviceSpec.from_string(self._device_name)) + return self diff --git a/tf_adapter_2.x/npu_device/python/setup.py b/tf_adapter_2.x/npu_device/python/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..4fb8a9c4b3e03c1999f5668ef1e8c077ca4fd564 --- /dev/null +++ b/tf_adapter_2.x/npu_device/python/setup.py @@ -0,0 +1,11 @@ +from setuptools import setup, Extension +from setuptools import find_packages + +setup(name='npu_device', + version='0.1', + description='This is a demo package', + long_description='This is a demo package', + packages=find_packages(), + include_package_data=True, + ext_modules=[], + zip_safe=False)
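
For reference, a minimal usage sketch of the Python wrapper packaged above, assuming the generated npu_device wheel is installed and an Ascend runtime is reachable; the option dictionaries are left at their defaults rather than guessing concrete GE keys, and the computation is illustrative only:

```python
import tensorflow as tf
import npu_device  # re-exports open/close from npu_device_register

# Initializes GE once and registers /job:localhost/replica:0/task:0/device:NPU:0
# with the current eager context; raises RuntimeError if the backend fails to open.
npu = npu_device.open(device_index=0)

with npu.scope():            # alternatively: npu.as_default()
    x = tf.ones([2, 2])
    y = tf.matmul(x, x)      # eager ops inside the scope are placed on the NPU device
    print(y)

# npu_device.close() is registered via atexit, so explicit shutdown is optional.
```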