From 121c4553bf751d873d02dad35a5d9791059140b2 Mon Sep 17 00:00:00 2001 From: zhu_tianyu Date: Sun, 9 Oct 2022 15:01:07 +0800 Subject: [PATCH] =?UTF-8?q?=E8=87=AA=E5=AE=9A=E4=B9=89=E7=AE=97=E5=AD=90?= =?UTF-8?q?=E5=8C=85=E7=9B=AE=E5=BD=95=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../4_op_dev/1_custom_op/CMakeLists.txt | 10 +- .../4_op_dev/1_custom_op/README.md | 138 +-- .../4_op_dev/1_custom_op/README_CN.md | 127 +-- .../4_op_dev/1_custom_op/build.sh | 42 +- .../4_op_dev/1_custom_op/cmake/config.cmake | 27 +- .../cmake/util/gen_impl_and_mrege_json.sh | 15 +- .../cmake/util/makeself/makeself-header.sh | 24 +- .../cmake/util/merge_aicpu_info_json.sh | 7 +- .../1_custom_op/cpukernel/CMakeLists.txt | 18 +- .../4_op_dev/1_custom_op/doc/Add_CN.md | 2 +- .../4_op_dev/1_custom_op/doc/Add_EN.md | 2 +- .../4_op_dev/1_custom_op/doc/Conv2d_CN.md | 2 +- .../4_op_dev/1_custom_op/doc/Conv2d_EN.md | 2 +- .../4_op_dev/1_custom_op/doc/LeakyRelu_CN.md | 2 +- .../4_op_dev/1_custom_op/doc/LeakyRelu_EN.md | 2 +- .../4_op_dev/1_custom_op/doc/Matmul_CN.md | 2 +- .../4_op_dev/1_custom_op/doc/Matmul_EN.md | 2 +- .../4_op_dev/1_custom_op/doc/Permute_CN.md | 2 +- .../4_op_dev/1_custom_op/doc/Permute_EN.md | 2 +- .../1_custom_op/doc/ScatterNdAdd_CN.md | 2 +- .../4_op_dev/1_custom_op/doc/Upsample_CN.md | 2 +- .../4_op_dev/1_custom_op/doc/Upsample_EN.md | 2 +- .../scripts/ide_customops_install.sh | 46 +- .../4_op_dev/1_custom_op/scripts/install.sh | 79 +- .../1_custom_op/scripts/testcase_300.sh | 2 +- .../4_op_dev/1_custom_op/scripts/upgrade.sh | 46 +- .../4_op_dev/1_custom_op/tbe/impl/__init__.py | 0 .../4_op_dev/1_custom_op/tbe/impl/add_dsl.py | 117 +++ .../1_custom_op/tbe/impl/conv2d_tik.py | 171 ++++ .../1_custom_op/tbe/impl/leaky_relu_demo.py | 112 +++ .../1_custom_op/tbe/impl/matmul_tik.py | 210 +++++ .../1_custom_op/tbe/impl/permute_tik.py | 587 ++++++++++++ .../1_custom_op/tbe/impl/scatter_nd_add.py | 860 ++++++++++++++++++ .../1_custom_op/tbe/impl/upsample_tik.py | 369 ++++++++ 34 files changed, 2798 insertions(+), 235 deletions(-) create mode 100644 cplusplus/level1_single_api/4_op_dev/1_custom_op/tbe/impl/__init__.py create mode 100644 cplusplus/level1_single_api/4_op_dev/1_custom_op/tbe/impl/add_dsl.py create mode 100644 cplusplus/level1_single_api/4_op_dev/1_custom_op/tbe/impl/conv2d_tik.py create mode 100644 cplusplus/level1_single_api/4_op_dev/1_custom_op/tbe/impl/leaky_relu_demo.py create mode 100644 cplusplus/level1_single_api/4_op_dev/1_custom_op/tbe/impl/matmul_tik.py create mode 100644 cplusplus/level1_single_api/4_op_dev/1_custom_op/tbe/impl/permute_tik.py create mode 100644 cplusplus/level1_single_api/4_op_dev/1_custom_op/tbe/impl/scatter_nd_add.py create mode 100644 cplusplus/level1_single_api/4_op_dev/1_custom_op/tbe/impl/upsample_tik.py diff --git a/cplusplus/level1_single_api/4_op_dev/1_custom_op/CMakeLists.txt b/cplusplus/level1_single_api/4_op_dev/1_custom_op/CMakeLists.txt index 329daf2ba..5e3886569 100644 --- a/cplusplus/level1_single_api/4_op_dev/1_custom_op/CMakeLists.txt +++ b/cplusplus/level1_single_api/4_op_dev/1_custom_op/CMakeLists.txt @@ -83,21 +83,21 @@ add_custom_target(${RUN_TARGET} ALL DEPENDS ${ALL_MODULES}) if (NOT "x$ENV{COMPILE_KERNEL_TARGET}" STREQUAL "xcpu") add_custom_command(TARGET ${RUN_TARGET} PRE_BUILD - COMMAND mkdir -p ./makepkg/packages/op_impl/${PROJECT_DIR}/ai_core/tbe/custom_impl - COMMAND mkdir -p ./makepkg/packages/op_impl/${PROJECT_DIR}/vector_core/tbe/custom_impl + COMMAND mkdir -p 
./makepkg/packages/${PROJECT_DIR}/op_impl/ai_core/tbe/impl + COMMAND mkdir -p ./makepkg/packages/${PROJECT_DIR}/op_impl/vector_core/tbe/impl ) endif() if (NOT "x$ENV{COMPILE_KERNEL_TARGET}" STREQUAL "xtbe") add_custom_command(TARGET ${RUN_TARGET} PRE_BUILD - COMMAND mkdir -p ./makepkg/packages/op_impl/${PROJECT_DIR}/cpu/aicpu_kernel/custom_impl + COMMAND mkdir -p ./makepkg/packages/${PROJECT_DIR}/op_impl/cpu/aicpu_kernel/impl ) endif() add_custom_command(TARGET ${RUN_TARGET} PRE_BUILD - COMMAND mkdir -p ./makepkg/packages/fusion_rules/${PROJECT_DIR}/ + COMMAND mkdir -p ./makepkg/packages/${PROJECT_DIR}/fusion_rules/ COMMAND bash ${CMAKE_SOURCE_DIR}/cmake/util/gen_impl_and_mrege_json.sh ${CMAKE_SOURCE_DIR} ${CMAKE_BINARY_DIR} COMMAND cp ${CMAKE_SOURCE_DIR}/scripts/install.sh ./makepkg/ COMMAND cp ${CMAKE_SOURCE_DIR}/scripts/upgrade.sh ./makepkg/ @@ -108,7 +108,7 @@ add_custom_command(TARGET ${RUN_TARGET} if(EXISTS ${CMAKE_SOURCE_DIR}/custom.proto) add_custom_command(TARGET ${RUN_TARGET} PRE_BUILD - COMMAND cp ${CMAKE_SOURCE_DIR}/custom.proto ./makepkg/packages + COMMAND cp ${CMAKE_SOURCE_DIR}/custom.proto ./makepkg/packages/${PROJECT_DIR} ) endif() diff --git a/cplusplus/level1_single_api/4_op_dev/1_custom_op/README.md b/cplusplus/level1_single_api/4_op_dev/1_custom_op/README.md index bad1b1bea..fe81f01e4 100644 --- a/cplusplus/level1_single_api/4_op_dev/1_custom_op/README.md +++ b/cplusplus/level1_single_api/4_op_dev/1_custom_op/README.md @@ -46,7 +46,7 @@ The directory of a Caffe or TensorFlow custom operator sample project is organiz │ ├── CMakeLists.txt // CMakeList.txt of the operator IR definition file, called by CMakeList.txt of the operator project ├── tbe │ ├── CMakeLists.txt -│ ├── custom_impl // Directory of operator implementation files +│ ├── impl // Directory of operator implementation files │ │ ├── xx.py │ │ ├── __init__.py // Python package identification file │ ├── op_info_cfg // Directory of operator information library files @@ -54,7 +54,10 @@ The directory of a Caffe or TensorFlow custom operator sample project is organiz │ ├── ${SoC Version} // Ascend AI Processor model │ ├── xx.ini │ ├── testcase -│ ├── tf_test // Directory of the TensorFlow-based operator test file. The code in this directory can run only on the Ascend 910 AI processor. +│ ├── tf1.15_test // Directory of the TensorFlow 1.15-based operator test file. The code in this directory can run only on the Ascend 910 AI processor. +│ ├── op_name // Code for verifying a single-operator on network +│ ├── tf_xx.py +│ ├── tf2.6_test // Directory of the TensorFlow 2.6-based operator test file. The code in this directory can run only on the Ascend 910 AI processor. │ ├── op_name // Code for verifying a single-operator on network │ ├── tf_xx.py ├── cmake @@ -96,7 +99,6 @@ The directory of a Caffe or TensorFlow custom operator sample project is organiz ## Environment Requirements -- OS and architecture: CentOS x86\_64, CentOS AArch64, Ubuntu 18.04 x86\_64, Ubuntu 18.04 aarch64, EulerOS x86, EulerOS AArch64 - Python version and dependency library: Python 3.7.*x* (3.7.0 to 3.7.11) and Python 3.8.*x* (3.8.0 to 3.8.11). - Ascend AI Software Stack deployed @@ -119,15 +121,6 @@ The directory of a Caffe or TensorFlow custom operator sample project is organiz ``` -- Operator building requires Python installation. The following takes Python 3.7.5 as an example. Run the following commands as a running user to set the environment variables related to Python 3.7.5: - - ``` - # Set tje Python3.7.5 library path. 
-   export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib:$LD_LIBRARY_PATH
-   # If multiple Python 3 versions exist in the user environment, specify Python 3.7.5.
-   export PATH=/usr/local/python3.7.5/bin:$PATH
-   ```
-
-   Replace the Python 3.7.5 installation path as required. You can also write the preceding commands to the ~/.bashrc file and run the source ~/.bashrc command to make the modification take effect immediately.

## Operator Project Building

@@ -204,37 +197,35 @@ The directory of a Caffe or TensorFlow custom operator sample project is organiz

   **export AICPU_KERNEL_TARGET=xxx**

-   - **AICPU\_SOC\_VERSION** : Ascend AI Processor version. Set it to the folder name of the corresponding product in **opp/op\_impl/built-in/aicpu/aicpu\_kernel/lib** under the AI CPU installation directory, that is, the name of the folder where **libcpu\_kernels\_context.a** and **libcpu\_kernels\_v1.0.1.so** are located.
-
+   - **AICPU\_SOC\_VERSION**: Ascend AI Processor version. Set it to the folder name of the corresponding product in **opp/built-in/op_impl/aicpu/aicpu_kernel/lib** under the AI CPU installation directory, that is, the name of the folder where the AI CPU library files are located.
+
+   - **vendor_name**: Name of the vendor to which the custom operator belongs. The value can be customized by developers. The default value is **customize**.

3.  Build the operator project.

    -   To compile only the TBE operator, run the following commands in the operator project directory.

        **chmod +x build.sh**

        **./build.sh -t**

    -   To compile only the AI CPU operator, run the following commands in the operator project directory.

        **chmod +x build.sh**

        **./build.sh -c**

    -   If you need to compile both the TBE and AI CPU operators, run the following commands in the operator project directory.

        **chmod +x build.sh**

        **./build.sh**

    After successful build, an OPP runfile **custom\_opp\__\__.run** is generated in the **build\_out** directory. A typical end-to-end invocation is sketched below.

    **Note**: If you need to rebuild the project, run the **./build.sh clean** command to clear the build outputs.
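
    For example, a TBE-only build against a default toolkit installation might look like the following. This is a minimal sketch: the installation path, vendor name, and SoC version are illustrative and must match your environment.

    ```bash
    # Illustrative path; replace it with your actual CANN installation.
    export ASCEND_TENSOR_COMPILER_INCLUDE=/usr/local/Ascend/ascend-toolkit/latest/include

    # Optionally edit build.sh first, e.g. set vendor_name=mycompany and
    # uncomment "export AICPU_SOC_VERSION=Ascend910" for your target processor.
    chmod +x build.sh
    ./build.sh -t                  # TBE only; use -c for AI CPU only, or no flag for both

    ls build_out/custom_opp_*.run  # the generated OPP runfile
    ```
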
The default value is "customize". + - If a user-defined operator with the same "vendor_name" already exists in the operator library, a prompt message similar to the following will appear (take updating the framework as an example): + ``` + [ops_custom]upgrade framework + caffe onnx tensorflow [INFO]: has old version in /usr/local/Ascend/latest/opp_x86_64-linux/vendors/customize/framework: + - Overlay Installation, please enter:[o] + - Replace directory installation, please enter:[r] + - Do not install, please enter:[n] + ``` + - Enter "o" to overwrite the installation, that is, if the file in the installation package has the same name as the existing file, replace the original file with the file in the installation package; If the installation package does not contain an existing file, the existing file is retained. + + - Enter "r" to represent a new installation, that is, delete all files under the installation path, and then use the installation package to install a new installation. + + - Enter "n" to exit the installation. + + Note: If there is an installation mode selection of "op proto", "op impl", "custom. proto" and other files in the future, please enter the corresponding characters according to the prompt information. + + The directory structure after deployment is as follows: ``` ├── opp // OPP directory - │ ├── op_impl - │ ├── built-in - │ ├── custom - │ ├── ai_core - │ ├── tbe + │ ├── vendors // Directory of user-defined operators + │ ├── config.ini // Custom operator priority profile + │ ├── vendor_name1 // Store the user-defined operator deployed by the corresponding vendor. The name is the value of "vendor_name" configured during the compilation of the customized operator installation package. If not configured, the default value is "customize". + │ ├── op_impl + │ ├── ai_core + │ ├── tbe + │ ├── config + │ ├── ${soc_version} // Ascend AI Processor model + │ ├── aic-${sos_version}-ops-info.json // Custom TBE operator info library file + │ ├── impl // Custom TBE operator implementation code + │ ├── xx.py + │ ├── cpu // Directory of AI CPU custom operator implementation file and information library. + │ ├── aicpu_kernel + │ ├── impl + │ ├── libcust_aicpu_kernels.so //Custom AI CPU operator implementation library file │ ├── config - ${soc_version} // Ascend AI Processor model - │ ├── aic-${sos_version}-ops-info.json // Custom TBE operator info library file - │ ├── custom_impl // Custom TBE operator implementation code - │ ├── xx.py - │ ├── vector_core // Reserved directory, which can be ignored - │ ├── cpu // Directory of AI CPU custom operator implementation file and information library. - │ ├── aicpu_kernel - │ ├── custom_impl - │ ├── libcust_aicpu_kernels.so //Custom AI CPU operator implementation library file - │ ├── config - │ ├── cust_aicpu_kernel.json //Custom AI CPU operator information library file - │ ├── framework - │ ├── built-in - │ ├── custom - │ ├── caffe // Directory of the plug-in library of the Caffe custom operator - │ ├── libcust_caffe_parsers.so // Operator plug-in library file, including the parsing functions of custom operator plug-in - │ ├── custom.proto // Original definition file of the custom operator. This file is read during the operator building to obtain the operator original definition. 

## Operator ST Verification

@@ -318,7 +334,7 @@ To execute a single-operator network test file, perform the following operations:

1.  Go to the directory where the **xx.py** file is located.
2. 
Run the following command to execute the single-operator network test code: - **python3.7.5 _xx.py_** + **python3 _xx.py_** TBE operators: Add and ScatterNdAdd diff --git a/cplusplus/level1_single_api/4_op_dev/1_custom_op/README_CN.md b/cplusplus/level1_single_api/4_op_dev/1_custom_op/README_CN.md index 4342bc343..6a40487f7 100644 --- a/cplusplus/level1_single_api/4_op_dev/1_custom_op/README_CN.md +++ b/cplusplus/level1_single_api/4_op_dev/1_custom_op/README_CN.md @@ -54,7 +54,10 @@ Caffe与TensorFlow共存的自定义算子样例工程的目录结构如下所 │ ├── ${Soc Version} //昇腾AI处理器类型 │ ├── xx.ini │ ├── testcase -│ ├── tf_test //基于TensorFlow的算子测试文件目录,此目录下代码仅支持在昇腾910 AI处理器上运行。 +│ ├── tf1.15_test //基于TensorFlow 1.15的算子测试文件目录,此目录下代码仅支持在昇腾910 AI处理器上运行。 +│ ├── op_name //单算子网络测试代码 +│ ├── tf_xx.py +│ ├── tf2.6_test //基于TensorFlow 2.6的算子测试文件目录,此目录下代码仅支持在昇腾910 AI处理器上运行。 │ ├── op_name //单算子网络测试代码 │ ├── tf_xx.py ├── cmake @@ -98,8 +101,6 @@ Caffe与TensorFlow共存的自定义算子样例工程的目录结构如下所 ## 环境要求 - -- 操作系统及架构:CentOS x86_64、CentOS aarch64、Ubuntu 18.04 x86_64、Ubuntu 18.04 aarch64、EulerOS x86、EulerOS aarch64 - python及依赖的库:Python3.7.x(3.7.0 ~ 3.7.11)、Python3.8.x(3.8.0 ~ 3.8.11) - 已完成昇腾AI软件栈的部署。 @@ -116,20 +117,10 @@ Caffe与TensorFlow共存的自定义算子样例工程的目录结构如下所 # 安装nnae包时配置 . ${HOME}/Ascend/nnae/set_env.sh - # 安装fwkplugin包时配置 - . /${HOME}/Ascend/fwkplugin/set_env.sh + # 安装tfplugin包时配置 + . /${HOME}/Ascend/tfplugin/set_env.sh ``` - -- 算子编译依赖Python,以Python3.7.5为例,请以运行用户执行如下命令设置Python3.7.5的相关环境变量。 - - ``` - #用于设置python3.7.5库文件路径 - export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib:$LD_LIBRARY_PATH - #如果用户环境存在多个python3版本,则指定使用python3.7.5版本 - export PATH=/usr/local/python3.7.5/bin:$PATH - ``` - Python3.7.5安装路径请根据实际情况进行替换,您也可以将以上命令写入~/.bashrc文件中,然后执行source ~/.bashrc命令使其立即生效。 ## 算子工程编译 @@ -168,7 +159,7 @@ Caffe与TensorFlow共存的自定义算子样例工程的目录结构如下所 ``` 须知: - Parameter的类型(粗斜体部分)建议保持唯一,不与内置caffe.proto(“compiler/include/proto/caffe.proto”)定义重复。 + Parameter的类型(粗斜体部分)建议保持唯一,不与内置caffe.proto(“atc/include/proto/caffe.proto”)定义重复。 样例代码的custom.proto文件中已包含样例中样例中的自定义Caffe算子的定义,若有其他自定义算子,请基于此文件追加。 ``` @@ -207,7 +198,9 @@ Caffe与TensorFlow共存的自定义算子样例工程的目录结构如下所 ``` - - AICPU\_SOC\_VERSION:昇腾AI处理器的类型,请配置为AI CPU组件安装路径中“opp/op_impl/built-in/aicpu/aicpu_kernel/lib”路径下的文件夹名称,即“libcpu_kernels_context.a”与“libcpu_kernels_v1.0.1.so”所在文件夹的名称。 + - AICPU\_SOC\_VERSION:请选择实际硬件平台对应的昇腾AI处理器类型,请配置为CANN软件安装后文件存储路径的“opp/built-in/op_impl/aicpu/aicpu_kernel/lib”路径下代表昇腾AI处理器类型的文件夹名称,即AI CPU相关库文件所在文件夹的名称。 + + - vendor_name:标识自定义算子所属厂商的名称,开发者可自定义,默认值为“customize”。 3. 
执行算子工程编译。 @@ -233,11 +226,7 @@ Caffe与TensorFlow共存的自定义算子样例工程的目录结构如下所 编译成功后,会在当前目录下创建build\_out目录,并在build\_out目录下生成自定义算子安装包**custom\_opp\__\__.run**。 - **说明:** - - - 若重新进行工程编译,请先执行./build.sh clean命令进行编译文件的清理。 - - 若您开发的自定义算子既包含TBE算子,又包含AI CPU算子,请选择同时编译,生成一个自定义算子安装包。因为当前版本,仅支持安装一个自定义算子安装包,后面安装的自定义算子包会覆盖之前安装的算子包。 - + **说明:若重新进行工程编译,请先执行./build.sh clean命令进行编译文件的清理。** ## 算子部署 @@ -248,48 +237,68 @@ Caffe与TensorFlow共存的自定义算子样例工程的目录结构如下所 **./custom\_opp\__\__.run** - 命令执行成功后,会将编译生成的自定义算子相关文件部署到opp对应目录下的custom路径下,部署后目录结构示例如下所示: + - 安装成功后,会将编译生成的自定义算子相关文件部署到opp/vendors/__目录下,其中 __为算子工程编译时build.sh脚本中字段“vendor_name”的取值,默认为“customize”。 + - 若算子库中已存在相同“vendor_name”的自定义算子,会出现类似如下提示信息(以更新framework为例): + ``` + [ops_custom]upgrade framework + caffe onnx tensorflow [INFO]: has old version in /usr/local/Ascend/latest/opp_x86_64-linux/vendors/customize/framework: + - Overlay Installation, please enter:[o] + - Replace directory installation, please enter:[r] + - Do not install, please enter:[n] + ``` + - 输入“o”,代表覆盖安装,即若安装包中文件与已存在文件名称相同,使用安装包中文件替换原文件;若安装包中不包含已存在文件,则已存在文件保留。 + - 输入“r”,代表全新安装,即删除安装路径下的所有文件,然后使用安装包全新安装。 + - 输入“n”,代表退出安装。 + + 说明:后续若存在“op proto”、“op impl”、“custom.proto”等安装模式的选择,请分别根据提示信息输入相应的字符。 + + 部署后目录结构示例如下所示: ``` ├── opp //算子库目录 - │ ├── op_impl - │ ├── built-in - │ ├── custom - │ ├── ai_core - │ ├── tbe + │ ├── vendors //自定义算子所在目录 + │ ├── config.ini //自定义算子优先级配置文件 + │ ├── vendor_name1 // 存储厂商部署的自定义算子,此名字为编译自定义算子安装包时配置的vendor_name,若未配置,默认值未customize + │ ├── op_impl + │ ├── ai_core + │ ├── tbe + │ ├── config + │ ├── ${soc_version} //昇腾AI处理器类型 + │ ├── aic-${soc_version}-ops-info.json //TBE自定义算子信息库 + │ ├── impl //TBE自定义算子实现代码文件 + │ ├── xx.py + │ ├── cpu //AI CPU自定义算子实现库及算子信息库所在目录 + │ ├── aicpu_kernel + │ ├── impl + │ ├── libcust_aicpu_kernels.so //AI CPU算子实现库文件 │ ├── config - │ ├── ${soc_version} //昇腾AI处理器类型 - │ ├── aic-${soc_version}-ops-info.json //TBE自定义算子信息库 - │ ├── custom_impl //TBE自定义算子实现代码文件 - │ ├── xx.py - │ ├── vector_core //此目录预留,无需关注 - │ ├── cpu //AI CPU自定义算子实现库及算子信息库所在目录 - │ ├── aicpu_kernel - │ ├── custom_impl - │ ├── libcust_aicpu_kernels.so //AI CPU算子实现库文件 - │ ├── config - │ ├── cust_aicpu_kernel.json //AI CPU算子信息库 - │ ├── framework - │ ├── built-in - │ ├── custom - │ ├── caffe //存放Caffe框架的自定义算子插件库 - │ ├── libcust_caffe_parsers.so //算子插件库文件,包含了自定义算子的插件解析函数 - │ ├── custom.proto //自定义算子的原始定义,算子编译过程中会读取此文件自动解析算子原始定义 - │ ├── onnx //存放ONNX框架的自定义算子插件库 - │ ├── libcust_onnx_parsers.so //算子插件库文件,包含了自定义算子的插件解析函数 - │ ├── tensorflow //存放TensorFlow框架的自定义算子插件库及npu对相关自定义算子支持度的配置文件 - │ ├── libcust_tf_parsers.so //算子插件库文件 - │ ├── libcust_tf_scope_fusion.so //scope融合规则定义库文件 - │ ├── npu_supported_ops.json //Ascend 910场景下使用的文件 - │ ├── op_proto - │ ├── built-in - │ ├── custom - │ ├── libcust_op_proto.so //自定义算子原型库文件 - + │ ├── cust_aicpu_kernel.json //AI CPU算子信息库 + │ ├── vector_core //此目录预留,无需关注 + │ ├── framework + │ ├── caffe //存放Caffe框架的自定义算子插件库 + │ ├── libcust_caffe_parsers.so //算子插件库文件,包含了自定义算子的插件解析函数 + │ ├── custom.proto //自定义算子的原始定义,算子编译过程中会读取此文件自动解析算子原始定义 + │ ├── onnx //存放ONNX框架的自定义算子插件库 + │ ├── libcust_onnx_parsers.so //算子插件库文件,包含了自定义算子的插件解析函数 + │ ├── tensorflow //存放TensorFlow框架的自定义算子插件库及npu对相关自定义算子支持度的配置文件 + │ ├── libcust_tf_parsers.so //算子插件库文件 + │ ├── libcust_tf_scope_fusion.so //scope融合规则定义库文件 + │ ├── npu_supported_ops.json //Ascend 910场景下使用的文件 + │ ├── op_proto + │ ├── libcust_op_proto.so //自定义算子原型库文件 + │ ├── vendor_name2 //存储厂商vendor_name2部署的自定义算子 ``` - 注:其他目录与文件,自定义算子部署无需关注。 + 注:其他目录与文件,开发者无需关注。 +3. 
配置自定义算子优先级。 + 当“opp/vendors”目录下存在多个厂商的自定义算子时,您可通过配置“opp/vendors”目录下的“config.ini”文件,配置自定义算子的优先级,若不同的“ _vendor_name_ ”目录下存在相同的OpType的自定义算子,则以优先级高的“ _vendor_name_ ”目录下的算子为准。 + “config.ini”文件的配置示例如下: + ``` + load_priority=vendor_name1,vendor_name2,vendor_name3 + ``` + - "load_priority":优先级配置序列的关键字,不允许修改。 + - “vendor_name1,vendor_name2,vendor_name3”:自定义算子厂商的优先级序列,按照优先级从高到低的顺序进行排列。 ## 算子ST验证 @@ -322,7 +331,7 @@ TBE算子:Add、ScatterNdAdd,单算子网络验证文件可参见“tbe/test 1. 进入xx.py文件所在目录。 2. 执行如下命令执行单算子网络测试代码。 - **python3.7.5 _xx.py_** + **python3 _xx.py_** TBE算子:Add与ScatterNdAdd @@ -416,7 +425,7 @@ TBE算子:Add、ScatterNdAdd,单算子网络验证文件可参见“tbe/test 其中,soc\_version:昇腾AI处理器的型号,请根据实际情况替换。 - 可从ATC安装路径下的“compiler/data/platform\_config”目录下查看支持的昇腾AI处理器的类型,对应“\*.ini”文件的名字即为{soc\_version\}。 + 可从ATC安装路径下的“atc/data/platform\_config”目录下查看支持的昇腾AI处理器的类型,对应“\*.ini”文件的名字即为{soc\_version\}。 3. 结果验证。 1. 在INFO日志中可以看到pass的设置情况: diff --git a/cplusplus/level1_single_api/4_op_dev/1_custom_op/build.sh b/cplusplus/level1_single_api/4_op_dev/1_custom_op/build.sh index 74aafba1c..a76e7fd89 100755 --- a/cplusplus/level1_single_api/4_op_dev/1_custom_op/build.sh +++ b/cplusplus/level1_single_api/4_op_dev/1_custom_op/build.sh @@ -2,20 +2,10 @@ # Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved. ###### Environment variable settings, need to set according to your own device ###### -# ASCEND_OPP_PATH: The installation path of the OPP package, where "/usr/local/Ascend/ascend-toolkit/latest/opp" is the -# default installation path. If user defines the installation path, please modify it. -# Uncomment and modify it when you specified installation path of OPP. -# export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp - -# ASCEND_AICPU_PATH: The installation path of the AICPU package, where "/usr/local/Ascend/ascend-toolkit/latest" is the -# default installation path. If user defines the installation path, please modify it. -# Uncomment and modify it when you specified installation path of AICPU. -# export ASCEND_AICPU_PATH=/usr/local/Ascend/ascend-toolkit/latest - -# ASCEND_TENSOR_COMPILER_INCLUDE: The path of the header file of the ATC package, where "/usr/local/Ascend/ascend-toolkit/latest/compiler/include" is the -# default installation path. If user defines the installation path, please modify it. -# Uncomment and modify it when you specified installation path of ATC. -# export ASCEND_TENSOR_COMPILER_INCLUDE=/usr/local/Ascend/ascend-toolkit/latest/compiler/include +# ASCEND_TENSOR_COMPILER_INCLUDE: The path of the header file of the Compiler package, where "/usr/local/Ascend/ascend-toolkit/latest/include" is +# the default installation path. If user defines the installation path, please modify it. +# Uncomment and modify it when you specified installation path of Compiler. +# export ASCEND_TENSOR_COMPILER_INCLUDE=/usr/local/Ascend/ascend-toolkit/latest/include # TOOLCHAIN_DIR: The path of the cross compilation tool, where "/usr/local/Ascend/ascend-toolkit/latest/toolkit/toolchain/hcc" is the # default installation path. If user defines the installation path, please modify it. @@ -41,8 +31,17 @@ export COMPILE_KERNEL_TARGET=cpu_tbe # The version of soc. # export AICPU_SOC_VERSION=Ascend910 +# Indicates the name of the supplier to which the operator belongs. This field can be customized. +vendor_name=customize + ###### The following logic can be used without modification ###### +# If vendor_name is not specified. Use the default value customize. +if [[ "$vendor_name" = "" ]] || [[ ! 
"vendor_name" ]]; then + vendor_name=customize +fi +export OPP_CUSTOM_VENDOR=$vendor_name + # parse input parameters clean=n compile_component=$COMPILE_KERNEL_TARGET @@ -115,6 +114,21 @@ if [ "x$clean" == "xy" ] 2>/dev/null; then log "[INFO] Clean successfully." exit 0 fi +#vendor_name output script/install.sh +vendor_name_output() { + scripts_file=$PWD/$1 + found_vendor_name_field=$(grep -e "vendor_name=" "$scripts_file") + found_vendor_name=$(grep -e "vendor_name=" "$scripts_file" | cut --only-delimited -d"=" -f2-) + if [[ $found_vendor_name_field = "" ]]; then + sed -i "1 a vendor_name=$vendor_name" $scripts_file + elif [ $found_vendor_name_field != "" ] && [ $found_vendor_name != $vendor_name ]; then + sed -i "s/$found_vendor_name_field/vendor_name=$vendor_name/g" $scripts_file + fi +} + +vendor_name_output scripts/install.sh +vendor_name_output scripts/upgrade.sh +vendor_name_output scripts/ide_customops_install.sh # specify compile target if [ "x$compile_component" == "xcpu" ] 2>/dev/null; then diff --git a/cplusplus/level1_single_api/4_op_dev/1_custom_op/cmake/config.cmake b/cplusplus/level1_single_api/4_op_dev/1_custom_op/cmake/config.cmake index 002d4df9b..d19cc0fe1 100644 --- a/cplusplus/level1_single_api/4_op_dev/1_custom_op/cmake/config.cmake +++ b/cplusplus/level1_single_api/4_op_dev/1_custom_op/cmake/config.cmake @@ -2,9 +2,10 @@ # set compile option -std=c++11 set(CMAKE_CXX_STANDARD 11) + # set compile option -fPIC set(CMAKE_POSITION_INDEPENDENT_CODE ON) - +set(OPP_CUSTOM_VENDOR "$ENV{OPP_CUSTOM_VENDOR}") set(TOP_DIR ${CMAKE_SOURCE_DIR}/../..) if (NOT DEFINED ASCEND_TENSOR_COMPILER_INCLUDE) @@ -39,39 +40,39 @@ endif () set(RUN_TARGET "custom_opp_${SYSTEM_INFO}.run") message( STATUS "RUN_TARGET=${RUN_TARGET}") -set(PROJECT_DIR custom) +set(PROJECT_DIR "vendors/${OPP_CUSTOM_VENDOR}") set(OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/makepkg) message(STATUS "OUT_DIR=${OUT_DIR}") set(TF_PLUGIN_TARGET "cust_tf_parsers") -set(TF_PLUGIN_TARGET_OUT_DIR ${OUT_DIR}/packages/framework/${PROJECT_DIR}/tensorflow/) +set(TF_PLUGIN_TARGET_OUT_DIR ${OUT_DIR}/packages/${PROJECT_DIR}/framework/tensorflow/) set(ONNX_PLUGIN_TARGET "cust_onnx_parsers") -set(ONNX_PLUGIN_TARGET_OUT_DIR ${OUT_DIR}/packages/framework/${PROJECT_DIR}/onnx/) +set(ONNX_PLUGIN_TARGET_OUT_DIR ${OUT_DIR}/packages/${PROJECT_DIR}/framework/onnx/) set(TF_SCOPE_FUSION_PASS_TARGET "cust_tf_scope_fusion") -set(TF_SCOPE_FUSION_PASS_TARGET_OUT_DIR ${OUT_DIR}/packages/framework/${PROJECT_DIR}/tensorflow/) +set(TF_SCOPE_FUSION_PASS_TARGET_OUT_DIR ${OUT_DIR}/packages/${PROJECT_DIR}/framework/tensorflow/) set(CAFFE_PARSER_TARGET "_caffe_parser") set(CAFFE_PLUGIN_TARGET "cust_caffe_parsers") -set(CAFFE_PLUGIN_TARGET_OUT_DIR ${OUT_DIR}/packages/framework/${PROJECT_DIR}/caffe/) +set(CAFFE_PLUGIN_TARGET_OUT_DIR ${OUT_DIR}/packages/${PROJECT_DIR}/framework/caffe/) set(OP_PROTO_TARGET "cust_op_proto") -set(OP_PROTO_TARGET_OUT_DIR ${OUT_DIR}/packages/op_proto/${PROJECT_DIR}/) +set(OP_PROTO_TARGET_OUT_DIR ${OUT_DIR}/packages/${PROJECT_DIR}/op_proto/) set(AIC_FUSION_PASS_TARGET "cust_aic_fusion_pass") -set(AIC_FUSION_PASS_TARGET_OUT_DIR ${OUT_DIR}/packages/fusion_pass/${PROJECT_DIR}/ai_core) +set(AIC_FUSION_PASS_TARGET_OUT_DIR ${OUT_DIR}/packages/${PROJECT_DIR}/fusion_pass/ai_core) set(AIV_FUSION_PASS_TARGET "cust_aiv_fusion_pass") -set(AIV_FUSION_PASS_TARGET_OUT_DIR ${OUT_DIR}/packages/fusion_pass/${PROJECT_DIR}/vector_core) +set(AIV_FUSION_PASS_TARGET_OUT_DIR ${OUT_DIR}/packages/${PROJECT_DIR}/fusion_pass/vector_core) -set(AIC_OP_INFO_CFG_OUT_DIR 
diff --git a/cplusplus/level1_single_api/4_op_dev/1_custom_op/cmake/config.cmake b/cplusplus/level1_single_api/4_op_dev/1_custom_op/cmake/config.cmake
index 002d4df9b..d19cc0fe1 100644
--- a/cplusplus/level1_single_api/4_op_dev/1_custom_op/cmake/config.cmake
+++ b/cplusplus/level1_single_api/4_op_dev/1_custom_op/cmake/config.cmake
@@ -2,9 +2,10 @@
 # set compile option -std=c++11
 set(CMAKE_CXX_STANDARD 11)
+
 # set compile option -fPIC
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-
+set(OPP_CUSTOM_VENDOR "$ENV{OPP_CUSTOM_VENDOR}")
 set(TOP_DIR ${CMAKE_SOURCE_DIR}/../..)

 if (NOT DEFINED ASCEND_TENSOR_COMPILER_INCLUDE)
@@ -39,39 +40,39 @@ endif ()
 set(RUN_TARGET "custom_opp_${SYSTEM_INFO}.run")
 message( STATUS "RUN_TARGET=${RUN_TARGET}")

-set(PROJECT_DIR custom)
+set(PROJECT_DIR "vendors/${OPP_CUSTOM_VENDOR}")
 set(OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/makepkg)
 message(STATUS "OUT_DIR=${OUT_DIR}")

 set(TF_PLUGIN_TARGET "cust_tf_parsers")
-set(TF_PLUGIN_TARGET_OUT_DIR ${OUT_DIR}/packages/framework/${PROJECT_DIR}/tensorflow/)
+set(TF_PLUGIN_TARGET_OUT_DIR ${OUT_DIR}/packages/${PROJECT_DIR}/framework/tensorflow/)

 set(ONNX_PLUGIN_TARGET "cust_onnx_parsers")
-set(ONNX_PLUGIN_TARGET_OUT_DIR ${OUT_DIR}/packages/framework/${PROJECT_DIR}/onnx/)
+set(ONNX_PLUGIN_TARGET_OUT_DIR ${OUT_DIR}/packages/${PROJECT_DIR}/framework/onnx/)

 set(TF_SCOPE_FUSION_PASS_TARGET "cust_tf_scope_fusion")
-set(TF_SCOPE_FUSION_PASS_TARGET_OUT_DIR ${OUT_DIR}/packages/framework/${PROJECT_DIR}/tensorflow/)
+set(TF_SCOPE_FUSION_PASS_TARGET_OUT_DIR ${OUT_DIR}/packages/${PROJECT_DIR}/framework/tensorflow/)

 set(CAFFE_PARSER_TARGET "_caffe_parser")
 set(CAFFE_PLUGIN_TARGET "cust_caffe_parsers")
-set(CAFFE_PLUGIN_TARGET_OUT_DIR ${OUT_DIR}/packages/framework/${PROJECT_DIR}/caffe/)
+set(CAFFE_PLUGIN_TARGET_OUT_DIR ${OUT_DIR}/packages/${PROJECT_DIR}/framework/caffe/)

 set(OP_PROTO_TARGET "cust_op_proto")
-set(OP_PROTO_TARGET_OUT_DIR ${OUT_DIR}/packages/op_proto/${PROJECT_DIR}/)
+set(OP_PROTO_TARGET_OUT_DIR ${OUT_DIR}/packages/${PROJECT_DIR}/op_proto/)

 set(AIC_FUSION_PASS_TARGET "cust_aic_fusion_pass")
-set(AIC_FUSION_PASS_TARGET_OUT_DIR ${OUT_DIR}/packages/fusion_pass/${PROJECT_DIR}/ai_core)
+set(AIC_FUSION_PASS_TARGET_OUT_DIR ${OUT_DIR}/packages/${PROJECT_DIR}/fusion_pass/ai_core)

 set(AIV_FUSION_PASS_TARGET "cust_aiv_fusion_pass")
-set(AIV_FUSION_PASS_TARGET_OUT_DIR ${OUT_DIR}/packages/fusion_pass/${PROJECT_DIR}/vector_core)
+set(AIV_FUSION_PASS_TARGET_OUT_DIR ${OUT_DIR}/packages/${PROJECT_DIR}/fusion_pass/vector_core)

-set(AIC_OP_INFO_CFG_OUT_DIR ${OUT_DIR}/packages/op_impl/${PROJECT_DIR}/ai_core/tbe/config)
-set(AIV_OP_INFO_CFG_OUT_DIR ${OUT_DIR}/packages/op_impl/${PROJECT_DIR}/vector_core/tbe/config/)
+set(AIC_OP_INFO_CFG_OUT_DIR ${OUT_DIR}/packages/${PROJECT_DIR}/op_impl/ai_core/tbe/config)
+set(AIV_OP_INFO_CFG_OUT_DIR ${OUT_DIR}/packages/${PROJECT_DIR}/op_impl/vector_core/tbe/config/)

 set(AICPU_CONFIG_JSON_TARGET "aicpu_config_json")
-set(AICPU_OP_INFO_CFG_OUT_DIR ${OUT_DIR}/packages/op_impl/${PROJECT_DIR}/cpu/config)
-set(AICPU_OP_IMPL_OUT_DIR ${OUT_DIR}/packages/op_impl/${PROJECT_DIR}/cpu/aicpu_kernel/custom_impl/)
+set(AICPU_OP_INFO_CFG_OUT_DIR ${OUT_DIR}/packages/${PROJECT_DIR}/op_impl/cpu/config)
+set(AICPU_OP_IMPL_OUT_DIR ${OUT_DIR}/packages/${PROJECT_DIR}/op_impl/cpu/aicpu_kernel/impl/)

 set(INI_2_JSON_PY "${CMAKE_SOURCE_DIR}/cmake/util/parse_ini_to_json.py")
 set(AICPU_INI_2_JSON_PY "${CMAKE_SOURCE_DIR}/cmake/util/aicpu_parser_ini.py")
-f "${aicpu_filter_file}" ]]; then - cp $aicore_filter_file ${build_path}/makepkg/packages/framework/custom/tensorflow + cp $aicore_filter_file ${build_path}/makepkg/packages/vendors/$vendor_name/framework/tegnsorflow fi if [[ -f "${aicore_filter_file}" ]] && [[ -f "${aicpu_filter_file}" ]]; then chmod u+w ${aicpu_filter_file} python3.7.5 ${project_path}/cmake/util/insert_op_info.py ${aicore_filter_file} ${aicpu_filter_file} chmod u-w ${aicpu_filter_file} - cp $aicpu_filter_file ${build_path}/makepkg/packages/framework/custom/tensorflow + cp $aicpu_filter_file ${build_path}/makepkg/packages/vendors/$vendor_name/framework/tensorflow fi diff --git a/cplusplus/level1_single_api/4_op_dev/1_custom_op/cmake/util/makeself/makeself-header.sh b/cplusplus/level1_single_api/4_op_dev/1_custom_op/cmake/util/makeself/makeself-header.sh index cd6b9f67e..9585198e4 100755 --- a/cplusplus/level1_single_api/4_op_dev/1_custom_op/cmake/util/makeself/makeself-header.sh +++ b/cplusplus/level1_single_api/4_op_dev/1_custom_op/cmake/util/makeself/makeself-header.sh @@ -8,6 +8,7 @@ if test "$KEEP_UMASK" = n; then umask 077 fi +OPP_CUSTOM_VENDOR="$OPP_CUSTOM_VENDOR" CRCsum="$CRCsum" MD5="$MD5sum" SHA="$SHAsum" @@ -249,14 +250,23 @@ MS_Check() MS_Uninstall() { - rm -rf \${ASCEND_OPP_PATH}/op_impl/custom/* - rm -rf \${ASCEND_OPP_PATH}/framework/custom/* - rm -rf \${ASCEND_OPP_PATH}/op_proto/custom/* - if [ "`ls -A \${ASCEND_OPP_PATH}/op_impl/custom`" = "" ] && [ "`ls -A \${ASCEND_OPP_PATH}/framework/custom`" = "" ] && [ "`ls -A \${ASCEND_OPP_PATH}/op_proto/custom`" = "" ];then - echo "uninstall SUCCESS." + if test x"\$OPP_CUSTOM_VENDOR" = xcustomize; then + rm -rf \${ASCEND_OPP_PATH}/vendors/$OPP_CUSTOM_VENDOR/op_impl + rm -rf \${ASCEND_OPP_PATH}/vendors/$OPP_CUSTOM_VENDOR/framework + rm -rf \${ASCEND_OPP_PATH}/vendors/$OPP_CUSTOM_VENDOR/op_proto + if [ ! -d "\${ASCEND_OPP_PATH}/vendors/$OPP_CUSTOM_VENDOR/op_impl" ] && [ ! -d "\${ASCEND_OPP_PATH}/vendors/$OPP_CUSTOM_VENDOR/framework" ] && [ ! -d "\${ASCEND_OPP_PATH}/vendors/$OPP_CUSTOM_VENDOR/op_proto" ];then + echo "uninstall SUCCESS." + else + echo "uninstall FAIL." + fi else - echo "uninstall FAIL." - fi + rm -rf \${ASCEND_OPP_PATH}/vendors/$OPP_CUSTOM_VENDOR + if [ ! -d "\${ASCEND_OPP_PATH}/vendors/$OPP_CUSTOM_VENDOR" ];then + echo "uninstall SUCCESS." + else + echo "uninstall FAIL." + fi + fi } UnTAR() diff --git a/cplusplus/level1_single_api/4_op_dev/1_custom_op/cmake/util/merge_aicpu_info_json.sh b/cplusplus/level1_single_api/4_op_dev/1_custom_op/cmake/util/merge_aicpu_info_json.sh index fb8111fb0..7b74dad94 100755 --- a/cplusplus/level1_single_api/4_op_dev/1_custom_op/cmake/util/merge_aicpu_info_json.sh +++ b/cplusplus/level1_single_api/4_op_dev/1_custom_op/cmake/util/merge_aicpu_info_json.sh @@ -2,6 +2,7 @@ project_path=$1 build_path=$2 +vendor_name=$OPP_CUSTOM_VENDOR echo $@ if [[ ! -d "$project_path" ]]; then echo "[ERROR] No projcet path is provided" @@ -17,9 +18,9 @@ if [[ ! 
-d "$ASCEND_OPP_PATH" ]]; then echo "[ERROR] No opp install path is provided" exit 1 fi -custom_exist_info_json=$ASCEND_OPP_PATH/op_impl/custom/cpu/config/cust_aicpu_kernel.json -custom_new_info_json=$build_path/makepkg/packages/op_impl/custom/cpu/config/cust_aicpu_kernel.json -temp_info_json=$build_path/makepkg/packages/op_impl/custom/cpu/config/temp_cust_aicpu_kernel.json +custom_exist_info_json=$ASCEND_OPP_PATH/vendors/$vendor_name/op_impl/cpu/config/cust_aicpu_kernel.json +custom_new_info_json=$build_path/makepkg/packages/vendors/$vendor_name/op_impl/cpu/config/cust_aicpu_kernel.json +temp_info_json=$build_path/makepkg/packages/vendors/$vendor_name/op_impl/cpu/config/temp_cust_aicpu_kernel.json if [[ -f "$custom_exist_info_json" ]] && [[ -f "$custom_new_info_json" ]]; then cp -f $custom_exist_info_json $temp_info_json diff --git a/cplusplus/level1_single_api/4_op_dev/1_custom_op/cpukernel/CMakeLists.txt b/cplusplus/level1_single_api/4_op_dev/1_custom_op/cpukernel/CMakeLists.txt index 3f6879739..b0397a9fd 100644 --- a/cplusplus/level1_single_api/4_op_dev/1_custom_op/cpukernel/CMakeLists.txt +++ b/cplusplus/level1_single_api/4_op_dev/1_custom_op/cpukernel/CMakeLists.txt @@ -116,7 +116,7 @@ if("x${ASCEND_AICPU_PATH}" STREQUAL "x") message(FATAL_ERROR "ENV ASCEND_AICPU_PATH is not set") endif() -include_directories(${ASCEND_OPP_PATH}/op_impl/built-in/aicpu/aicpu_kernel/inc) +include_directories(${ASCEND_OPP_PATH}/built-in/op_impl/aicpu/aicpu_kernel/inc) # travers subdirectory SUBDIRLIST(${CMAKE_CURRENT_SOURCE_DIR}/impl/third_party) @@ -133,10 +133,10 @@ if((AICPU_SOC_VERSION STREQUAL Ascend910B) OR (AICPU_SOC_VERSION STREQUAL Ascend set(AICPU_SOC_VERSION Ascend) endif() -if(EXISTS "${ASCEND_AICPU_PATH}/opp/op_impl/built-in/aicpu/aicpu_kernel/lib/${AICPU_SOC_VERSION}/libascend_protobuf.a") +if(EXISTS "${ASCEND_AICPU_PATH}/opp/built-in/op_impl/aicpu/aicpu_kernel/lib/${AICPU_SOC_VERSION}/libascend_protobuf.a") target_link_libraries(${AICPU_KERNEL_TARGET} PRIVATE -Wl,--whole-archive - ${ASCEND_AICPU_PATH}/opp/op_impl/built-in/aicpu/aicpu_kernel/lib/${AICPU_SOC_VERSION}/libascend_protobuf.a + ${ASCEND_AICPU_PATH}/opp/built-in/op_impl/aicpu/aicpu_kernel/lib/${AICPU_SOC_VERSION}/libascend_protobuf.a -Wl,--no-whole-archive -s -Wl,-Bsymbolic @@ -146,23 +146,23 @@ else() message(FATAL_ERROR "Can not find libascend_protobuf.a in environment. 
Please check whether the path of libascend_protobuf.a is correct or not") endif() -if(EXISTS "${ASCEND_AICPU_PATH}/opp/op_impl/built-in/aicpu/aicpu_kernel/lib/${AICPU_SOC_VERSION}/libcpu_kernels_context.a") +if(EXISTS "${ASCEND_AICPU_PATH}/opp/built-in/op_impl/aicpu/aicpu_kernel/lib/${AICPU_SOC_VERSION}/libcpu_kernels_context.a") target_link_libraries(${AICPU_KERNEL_TARGET} PRIVATE -Wl,--whole-archive - ${ASCEND_AICPU_PATH}/opp/op_impl/built-in/aicpu/aicpu_kernel/lib/${AICPU_SOC_VERSION}/libcpu_kernels_context.a + ${ASCEND_AICPU_PATH}/opp/built-in/op_impl/aicpu/aicpu_kernel/lib/${AICPU_SOC_VERSION}/libcpu_kernels_context.a -Wl,--no-whole-archive ) else() - if(EXISTS "${ASCEND_AICPU_PATH}/opp/op_impl/built-in/aicpu/aicpu_kernel/lib/libcpu_kernels_context.a") + if(EXISTS "${ASCEND_AICPU_PATH}/opp/built-in/op_impl/aicpu/aicpu_kernel/lib/libcpu_kernels_context.a") target_link_libraries(${AICPU_KERNEL_TARGET} PRIVATE -Wl,--whole-archive - ${ASCEND_AICPU_PATH}/opp/op_impl/built-in/aicpu/aicpu_kernel/lib/libcpu_kernels_context.a + ${ASCEND_AICPU_PATH}/opp/built-in/op_impl/aicpu/aicpu_kernel/lib/libcpu_kernels_context.a -Wl,--no-whole-archive ) - elseif(EXISTS "${ASCEND_OPP_PATH}/op_impl/built-in/aicpu/aicpu_kernel/lib/device/libcpu_kernels_context.so") + elseif(EXISTS "${ASCEND_OPP_PATH}/built-in/op_impl/aicpu/aicpu_kernel/lib/device/libcpu_kernels_context.so") target_link_libraries(${AICPU_KERNEL_TARGET} PRIVATE -Wl,--whole-archive - ${ASCEND_OPP_PATH}/op_impl/built-in/aicpu/aicpu_kernel/lib/device/libcpu_kernels_context.so + ${ASCEND_OPP_PATH}/built-in/op_impl/aicpu/aicpu_kernel/lib/device/libcpu_kernels_context.so -Wl,--no-whole-archive ) else() diff --git a/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Add_CN.md b/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Add_CN.md index 9ee897835..d909e0dc2 100644 --- a/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Add_CN.md +++ b/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Add_CN.md @@ -54,7 +54,7 @@ Add算子实现了两个数据相加,返回相加结果的功能,如下所 - 算子实现 - Add算子仅支持float16, float32, int32三种数据类型,所以需要对算子的输入数据进行校验;由于Add算子允许两个输入数据的shape不同,但算子计算接口**te.lang.cce.vadd**要求两输入shape相同,因此需要对算子两个输入的shape进行广播并对其进行校验,算子实现代码可参见[add_dsl.py](../tbe/custom_impl/add_dsl.py)。 + Add算子仅支持float16, float32, int32三种数据类型,所以需要对算子的输入数据进行校验;由于Add算子允许两个输入数据的shape不同,但算子计算接口**te.lang.cce.vadd**要求两输入shape相同,因此需要对算子两个输入的shape进行广播并对其进行校验,算子实现代码可参见[add_dsl.py](../tbe/impl/add_dsl.py)。 - 算子原型定义 diff --git a/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Add_EN.md b/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Add_EN.md index bc3a01791..9ce6064a1 100644 --- a/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Add_EN.md +++ b/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Add_EN.md @@ -54,7 +54,7 @@ The Add operator returns the sum of its operands, as shown in the following figu - Operator Implementation - The Add operator supports only three data types: float16, float32, and int32. Therefore, the input data type needs to be verified. The two inputs may have different shapes. This scenario is supported by the Add operator, but not supported by the operator compute API **te.lang.cce.vadd\(\)**. As a result, the two input shapes need to be broadcast and verified. For details about the operator implementation code, see [add_dsl.py](../tbe/custom_impl/add_dsl.py). + The Add operator supports only three data types: float16, float32, and int32. Therefore, the input data type needs to be verified. The two inputs may have different shapes. 
This scenario is supported by the Add operator, but not supported by the operator compute API **te.lang.cce.vadd\(\)**. As a result, the two input shapes need to be broadcast and verified. For details about the operator implementation code, see [add_dsl.py](../tbe/impl/add_dsl.py). - Operator Prototype Definition diff --git a/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Conv2d_CN.md b/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Conv2d_CN.md index 8480a2608..29724e4bf 100644 --- a/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Conv2d_CN.md +++ b/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Conv2d_CN.md @@ -62,7 +62,7 @@ 3. 调用conv2d\(\)实现二维卷积计算。 4. 调用fixpipe\(\)实现计算结果数据的搬运。 - 完整的实现代码请参见[conv2d\_tik.py](../tbe/custom_impl/conv2d_tik.py)。 + 完整的实现代码请参见[conv2d\_tik.py](../tbe/impl/conv2d_tik.py)。 - 算子原型定义 diff --git a/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Conv2d_EN.md b/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Conv2d_EN.md index 2e971cd77..680983d8a 100644 --- a/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Conv2d_EN.md +++ b/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Conv2d_EN.md @@ -62,7 +62,7 @@ Implemented by using TIK, the Conv2d operator performs the convolution 2-D opera 3. Call **conv2d\(\)** to implement the 2-D convolution operation. 4. Call **fixpipe\(\)** to move the compute result data. - For details about the complete implementation code, see [conv2d\_tik.py](../tbe/custom_impl/conv2d_tik.py). + For details about the complete implementation code, see [conv2d\_tik.py](../tbe/impl/conv2d_tik.py). - Operator Prototype Definition diff --git a/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/LeakyRelu_CN.md b/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/LeakyRelu_CN.md index 18580f5af..d0e7aa93a 100644 --- a/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/LeakyRelu_CN.md +++ b/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/LeakyRelu_CN.md @@ -64,7 +64,7 @@ LeakyRelu算子的数学表达式如下所示: - 算子实现 - 算子计算函数的实现逻辑如下所示,完整的代码实现请参见[leaky\_relu\_demo.py](../tbe/custom_impl/leaky_relu_demo.py)。 + 算子计算函数的实现逻辑如下所示,完整的代码实现请参见[leaky\_relu\_demo.py](../tbe/impl/leaky_relu_demo.py)。 1. 当negative\_slope为“0”时,输出y取输入x与0之间的较大值。 - 如果输入数据的类型为float16与int8,可直接调用te.lang.cce.vrelu\(x\)接口进行计算。 diff --git a/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/LeakyRelu_EN.md b/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/LeakyRelu_EN.md index c22b50903..47b1d13c0 100644 --- a/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/LeakyRelu_EN.md +++ b/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/LeakyRelu_EN.md @@ -64,7 +64,7 @@ The mathematical expression of the LeakyRelu operator is as follows: - Operator Implementation - The implementation logic of the operator calculation function is as follows. For details about the complete code implementation, see [leaky\_relu\_demo.py](../tbe/custom_impl/leaky_relu_demo.py). + The implementation logic of the operator calculation function is as follows. For details about the complete code implementation, see [leaky\_relu\_demo.py](../tbe/impl/leaky_relu_demo.py). 1. When **negative\_slope** is **0**, the output **y** is the larger value between the input **x** and 0. - If the input data type is float16 or int8, the **te.lang.cce.vrelu\(x\)** API can be called for compute process. 
diff --git a/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Matmul_CN.md b/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Matmul_CN.md index a20c16d59..61641d8c8 100644 --- a/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Matmul_CN.md +++ b/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Matmul_CN.md @@ -58,7 +58,7 @@ - 算子实现 - MatmulTik算子的实现代码请参见[matmul\_tik.py](../tbe/custom_impl/matmul_tik.py)。 + MatmulTik算子的实现代码请参见[matmul\_tik.py](../tbe/impl/matmul_tik.py)。 - 算子原型定义 diff --git a/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Matmul_EN.md b/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Matmul_EN.md index 37307a7e0..a0d9c669b 100644 --- a/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Matmul_EN.md +++ b/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Matmul_EN.md @@ -52,7 +52,7 @@ Implemented by using TIK, the Matmul operator performs matrix multiplication on - Operator Implementation - For details about the implementation code of the MatmulTik operator, see [matmul\_tik.py](../tbe/custom_impl/matmul_tik.py). + For details about the implementation code of the MatmulTik operator, see [matmul\_tik.py](../tbe/impl/matmul_tik.py). - Operator Prototype Definition diff --git a/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Permute_CN.md b/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Permute_CN.md index 14ea3f2ea..332b64679 100644 --- a/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Permute_CN.md +++ b/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Permute_CN.md @@ -53,7 +53,7 @@ - 算子实现 - PermuteTik算子的实现代码请参见[permute\_tik.py](../tbe/custom_impl/permute_tik.py),计算函数的实现逻辑如下所示: + PermuteTik算子的实现代码请参见[permute\_tik.py](../tbe/impl/permute_tik.py),计算函数的实现逻辑如下所示: 1. 定义Permute类,并在初始化函数中初始化后续计算用到的参数。核心计算主要是计算每个输入的shape的大小,申请Global Memory大小。通过tbe\_platform.cce\_conf.get\_soc\_spec\(tbe\_platform.cce\_conf.UB\_SIZE\)接口获取到UB的实际物理空间。后续的步骤中,我们还会使用这些数据来计算data\_move、vec\_trans\_scatter等接口的参数。设置独立的tiling模块,将其与算子计算逻辑分离可以很好的做到算子的shape泛化。对于不同的shape,我们可以在不改变计算逻辑的情况下,只改变tiling参数来优化搬运和计算的次数,来做到泛化和高性能。 diff --git a/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Permute_EN.md b/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Permute_EN.md index e107e2e4a..692bf7c0a 100644 --- a/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Permute_EN.md +++ b/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Permute_EN.md @@ -53,7 +53,7 @@ Implemented by using TIK, the Permute operator is used to permute the dimension - Operator Implementation - For details about the implementation code of the PermuteTik operator, see [permute\_tik.py](../tbe/custom_impl/permute_tik.py). The implementation logic of the calculation function is as follows: + For details about the implementation code of the PermuteTik operator, see [permute\_tik.py](../tbe/impl/permute_tik.py). The implementation logic of the calculation function is as follows: 1. Define the **Permute** class and initialize the parameters used for subsequent computation in the initialization function. The key is to compute the size of each input shape and memory size to be allocated in the Global Memory. Obtain the actual physical space of the UB by calling **tbe\_platform.cce\_conf.get\_soc\_spec\(tbe\_platform.cce\_conf.UB\_SIZE\)**. In subsequent steps, the calculated tiling parameters \(including the shape size and size of the UB\) will be used to configure the parameters of APIs such as **data\_move** and **vec\_trans\_scatter**. 
Separate the tiling logic from the operator compute logic to implement shape generalization of the operator. In this way, data with different shapes can be computed in the same compute logic. You only need to change the tiling parameters to optimize the number of movement and compute process times, thereby achieving generalization and high performance. diff --git a/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/ScatterNdAdd_CN.md b/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/ScatterNdAdd_CN.md index 346e060b6..d868d626d 100644 --- a/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/ScatterNdAdd_CN.md +++ b/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/ScatterNdAdd_CN.md @@ -120,7 +120,7 @@ ScatterNdAdd算子通过对输入数据中的单个值或切片应用稀疏算 - 算子实现 - ScatterNdAdd的算子实现的关键点是进行算子schedule策略的实现,包含tiling参数的计算、多核实现等,完整的实现代码请参见[scatter\_nd\_add.py](../tbe/custom_impl/scatter_nd_add.py)。 + ScatterNdAdd的算子实现的关键点是进行算子schedule策略的实现,包含tiling参数的计算、多核实现等,完整的实现代码请参见[scatter\_nd\_add.py](../tbe/impl/scatter_nd_add.py)。 1. 定义Scatter类,并在初始化函数中进行tiling参数计算。 diff --git a/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Upsample_CN.md b/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Upsample_CN.md index d60f478c7..12a7c7e3b 100644 --- a/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Upsample_CN.md +++ b/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Upsample_CN.md @@ -50,7 +50,7 @@ - 算子实现 - UpsampleTik算子的实现代码请参见[upsample\_tik.py](../tbe/custom_impl/upsample_tik.py)。 + UpsampleTik算子的实现代码请参见[upsample\_tik.py](../tbe/impl/upsample_tik.py)。 - 算子原型定义 diff --git a/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Upsample_EN.md b/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Upsample_EN.md index 99f72c13c..da80ae6e7 100644 --- a/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Upsample_EN.md +++ b/cplusplus/level1_single_api/4_op_dev/1_custom_op/doc/Upsample_EN.md @@ -50,7 +50,7 @@ Implemented by using TIK, the Upsample operator is used to scale up the feature - Operator Implementation - For details about the implementation code of the UpsampleTik operator, see [upsample\_tik.py](../tbe/custom_impl/upsample_tik.py). + For details about the implementation code of the UpsampleTik operator, see [upsample\_tik.py](../tbe/impl/upsample_tik.py). - Operator Prototype Definition diff --git a/cplusplus/level1_single_api/4_op_dev/1_custom_op/scripts/ide_customops_install.sh b/cplusplus/level1_single_api/4_op_dev/1_custom_op/scripts/ide_customops_install.sh index 7f1352479..c32d3b1aa 100644 --- a/cplusplus/level1_single_api/4_op_dev/1_custom_op/scripts/ide_customops_install.sh +++ b/cplusplus/level1_single_api/4_op_dev/1_custom_op/scripts/ide_customops_install.sh @@ -4,6 +4,7 @@ targetdir=/usr/local/Ascend/opp target_custom=0 sourcedir=$PWD/packages +vendordir=vendors/$vendor_name log() { cur_date=`date +"%Y-%m-%d %H:%M:%S"` @@ -29,24 +30,49 @@ fi upgrade() { - if [ ! -d ${sourcedir}/$1 ]; then + if [ ! -d ${sourcedir}/$vendordir/$1 ]; then log "[INFO] no need to upgrade ops $1 files" return 0 fi - if [ ! -d ${targetdir}/$1 ];then - log "[INFO] create ${targetdir}/$1." - mkdir -p ${targetdir}/$1 + if [ ! -d ${targetdir}/$vendordir/$1 ];then + log "[INFO] create ${targetdir}/$vendordir/$1." + mkdir -p ${targetdir}/$vendordir/$1 if [ $? 
-ne 0 ];then
-            log "[ERROR] create ${targetdir}/$1 failed"
+            log "[ERROR] create ${targetdir}/$vendordir/$1 failed"
             return 1
         fi
     else
+        vendor_installed_dir=$(ls "$targetdir/vendors" 2> /dev/null)
+        for i in $vendor_installed_dir; do
+            vendor_installed_file=$(ls "$targetdir/vendors/$i" 2> /dev/null)
+            if [ "$i" = "$vendor_name" ] && [ "$vendor_installed_file" != "" ]; then
+                echo "[INFO]: $vendor_name custom opp package has been installed on the path $targetdir/vendors:
+- Overlay Installation, please enter:[o]
+- Replace directory installation, please enter:[r]
+- Do not install, please enter:[n]
+>>>"
+                while true
+                do
+                    read orn
+                    if [ "$orn" = o ]; then
+                        break
+                    elif [ "$orn" = r ]; then
+                        [ -n "$vendor_installed_file" ] && rm -rf "$targetdir/vendors/$vendor_name"/*
+                        break
+                    elif [ "$orn" = n ]; then
+                        return 0
+                    else
+                        echo "[WARNING]: Input error, please input [o] or [r] or [n] to choose!"
+                    fi
+                done
+            fi
+        done
         log "[INFO] replace old ops $1 files ......"
     fi

     log "copy new ops $1 files ......"
-    cp -rf ${sourcedir}/$1/* $targetdir/$1/
+    cp -rf ${sourcedir}/$vendordir/$1/* $targetdir/$vendordir/$1/
     if [ $? -ne 0 ];then
         log "[ERROR] copy new $1 files failed"
         return 1
     fi
@@ -74,6 +100,14 @@
     exit 1
 fi

+config_file=${targetdir}/vendors/config.ini
+found_vendors="$(grep -w "load_priority" "$config_file" | cut --only-delimited -d"=" -f2-)"
+found_vendor=$(echo $found_vendors | sed "s/$vendor_name//g" | tr ',' ' ')
+vendor=$(echo $found_vendor | tr -s ' ' ',')
+if [ "$vendor" != "" ]; then
+    sed -i "/load_priority=$found_vendors/s@load_priority=$found_vendors@load_priority=$vendor_name,$vendor@g" "$config_file"
+fi
+
 changemode()
 {
     if [ -d ${targetdir} ];then
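The installer scripts below offer the same overlay/replace/skip choice as ide_customops_install.sh above. In essence, the two installation modes reduce to the following file operations (the paths and the default vendor are illustrative):

```bash
# Overlay ("o"): package files overwrite same-named files in the target;
# files that exist only in the target are kept.
cp -rf packages/vendors/customize/op_impl/* /usr/local/Ascend/opp/vendors/customize/op_impl/

# Replace ("r"): the target is emptied first, so afterwards it contains
# exactly the package contents.
rm -rf /usr/local/Ascend/opp/vendors/customize/op_impl/*
cp -rf packages/vendors/customize/op_impl/* /usr/local/Ascend/opp/vendors/customize/op_impl/
```
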
[y/n] " - - while true + echo "[INFO]: has old version in ${targetdir}/$vendordir/$1: +- Overlay Installation , please enter:[o] +- Replace directory installation , please enter: [r] +- Do not install , please enter:[n] +>>>" + while true do - read yn - if [ "$yn" = n ]; then + read orn + if [ "$orn" = n ]; then return 0 - elif [ "$yn" = y ]; then + elif [ "$orn" = o ]; then break; + elif [ "$orn" = r ]; then + [ -n "${targetdir}/$vendordir/$1/" ] && rm -rf "${targetdir}/$vendordir/$1"/* + break else echo "[ERROR] input error, please input again!" fi done fi fi - log "[INFO] replace old ops $1 files ......" + log "[INFO] replace or cover ops $1 files .g....." fi log "copy new ops $1 files ......" - if [ -d ${targetdir}/$1/custom/ ]; then - chmod -R +w "$targetdir/$1/custom/" >/dev/null 2>&1 + if [ -d ${targetdir}/$vendordir/$1/ ]; then + chmod -R +w "$targetdir/$vendordir/$1/" >/dev/null 2>&1 fi - cp -rf ${sourcedir}/$1/* $targetdir/$1/ + cp -rf ${sourcedir}/$vendordir/$1/* $targetdir/$vendordir/$1/ if [ $? -ne 0 ];then log "[ERROR] copy new $1 files failed" return 1 @@ -92,22 +101,22 @@ upgrade() } upgrade_proto() { - if [ ! -f ${sourcedir}/custom.proto ]; then + if [ ! -f ${sourcedir}/$vendordir/custom.proto ]; then log "[INFO] no need to upgrade custom.proto files" return 0 fi - if [ ! -d ${targetdir}/framework/custom/caffe ];then - log "[INFO] create ${targetdir}/framework/custom/caffe." - mkdir -p ${targetdir}/framework/custom/caffe + if [ ! -d ${targetdir}/$vendordir/framework/caffe ];then + log "[INFO] create ${targetdir}/$vendordir/framework/caffe." + mkdir -p ${targetdir}/$vendordir/framework/caffe if [ $? -ne 0 ];then - log "[ERROR] create ${targetdir}/framework/custom/caffe failed" + log "[ERROR] create ${targetdir}/$vendordir/framework/caffe failed" return 1 fi else - if [ -f ${targetdir}/framework/custom/caffe/custom.proto ]; then + if [ -f ${targetdir}/$vendordir/framework/caffe/custom.proto ]; then # 有老版本,判断是否要覆盖式安装 if test $QUIET = "n"; then - echo "[INFO] ${targetdir}/framework/custom/caffe has old version"\ + echo "[INFO] ${targetdir}/$vendordir/framework/caffe has old version"\ "custom.proto file. Do you want to replace? [y/n] " while true @@ -122,11 +131,11 @@ upgrade_proto() fi done fi - fi log "[INFO] replace old caffe.proto files ......" + fi fi - chmod -R +w "$targetdir/framework/custom/caffe/" >/dev/null 2>&1 - cp -rf ${sourcedir}/custom.proto ${targetdir}/framework/custom/caffe/ + chmod -R +w "$targetdir/$vendordir/framework/caffe/" >/dev/null 2>&1 + cp -rf ${sourcedir}/$vendordir/custom.proto ${targetdir}/$vendordir/framework/caffe/ if [ $? -ne 0 ];then log "[ERROR] copy new custom.proto failed" return 1 @@ -161,6 +170,14 @@ if [ $? -ne 0 ];then exit 1 fi +config_file=${targetdir}/vendors/config.ini +found_vendors="$(grep -w "load_priority" "$config_file" | cut --only-delimited -d"=" -f2-)" +found_vendor=$(echo $found_vendors | sed "s/$vendor_name//g" | tr ',' ' ') +vendor=$(echo $found_vendor | tr -s ' ' ',') +if [ "$vendor" != "" ]; then + sed -i "/load_priority=$found_vendors/s@load_priority=$found_vendors@load_priority=$vendor_name,$vendor@g" "$config_file" +fi + changemode() { if [ -d ${targetdir} ];then @@ -180,8 +197,8 @@ if [ $? 
-ne 0 ];then
     exit 1
 fi
 if [ `id -u` == 0 ]; then
-    if [ -d ${targetdir}/op_impl/custom/cpu/aicpu_kernel/custom_impl/ ]; then
-        chmod -R 440 ${targetdir}/op_impl/custom/cpu/aicpu_kernel/custom_impl/* >/dev/null 2>&1
+    if [ -d ${targetdir}/$vendordir/op_impl/cpu/aicpu_kernel/impl/ ]; then
+        chmod -R 440 ${targetdir}/$vendordir/op_impl/cpu/aicpu_kernel/impl/* >/dev/null 2>&1
     fi
     if [ -f ${targetdir}/ascend_install.info ]; then
         chmod -R 440 ${targetdir}/ascend_install.info
diff --git a/cplusplus/level1_single_api/4_op_dev/1_custom_op/scripts/testcase_300.sh b/cplusplus/level1_single_api/4_op_dev/1_custom_op/scripts/testcase_300.sh
index 6ce3cefe0..70fd94adc 100644
--- a/cplusplus/level1_single_api/4_op_dev/1_custom_op/scripts/testcase_300.sh
+++ b/cplusplus/level1_single_api/4_op_dev/1_custom_op/scripts/testcase_300.sh
@@ -88,4 +88,4 @@ function main() {
 
     return ${success}
 }
-main
\ No newline at end of file
+main
diff --git a/cplusplus/level1_single_api/4_op_dev/1_custom_op/scripts/upgrade.sh b/cplusplus/level1_single_api/4_op_dev/1_custom_op/scripts/upgrade.sh
index 941a61b59..d2f8d1e1f 100644
--- a/cplusplus/level1_single_api/4_op_dev/1_custom_op/scripts/upgrade.sh
+++ b/cplusplus/level1_single_api/4_op_dev/1_custom_op/scripts/upgrade.sh
@@ -4,6 +4,7 @@
 targetdir=/usr/local/Ascend/opp
 target_custom=0
 
 sourcedir=$PWD/packages
+vendordir=vendors/$vendor_name
 
 log() {
     cur_date=`date +"%Y-%m-%d %H:%M:%S"`
@@ -24,24 +25,49 @@ fi
 
 upgrade()
 {
-    if [ ! -d ${sourcedir}/$1 ]; then
+    if [ ! -d ${sourcedir}/$vendordir/$1 ]; then
         log "[INFO] no need to upgrade ops $1 files"
         return 0
     fi
 
-    if [ ! -d ${targetdir}/$1 ];then
-        log "[INFO] create ${targetdir}/$1."
-        mkdir -p ${targetdir}/$1
+    if [ ! -d ${targetdir}/$vendordir/$1 ];then
+        log "[INFO] create ${targetdir}/$vendordir/$1."
+        mkdir -p ${targetdir}/$vendordir/$1
         if [ $? -ne 0 ];then
-            log "[ERROR] create ${targetdir}/$1 failed"
+            log "[ERROR] create ${targetdir}/$vendordir/$1 failed"
             return 1
         fi
     else
+        vendor_installed_dir=$(ls "$targetdir/vendors" 2> /dev/null)
+        for i in $vendor_installed_dir;do
+            vendor_installed_file=$(ls "$targetdir/vendors/$i" 2> /dev/null)
+            if [ "$i" = "$vendor_name" ] && [ "$vendor_installed_file" != "" ]; then
+                echo "[INFO]: $vendor_name custom opp package has been installed on the path $targetdir/vendors/$vendor_name:
+- Overlay installation, please enter: [o]
+- Replace directory installation, please enter: [r]
+- Do not install, please enter: [n]
+>>>"
+                while true
+                do
+                    read orn
+                    if [ "$orn" = o ]; then
+                        break
+                    elif [ "$orn" = r ]; then
+                        [ -n "$vendor_installed_file" ] && rm -rf "$targetdir/vendors/$vendor_name"/*
+                        break
+                    elif [ "$orn" = n ]; then
+                        return 0
+                    else
+                        echo "[WARNING]: Input error, please input [o] or [r] or [n] to choose!"
+                    fi
+                done
+            fi
+        done
         log "[INFO] replace old ops $1 files ......"
     fi
 
     log "copy new ops $1 files ......"
-    cp -rf ${sourcedir}/$1/* $targetdir/$1/
+    cp -rf ${sourcedir}/$vendordir/$1/* $targetdir/$vendordir/$1/
     if [ $? -ne 0 ];then
         log "[ERROR] copy new $1 files failed"
         return 1
@@ -69,6 +95,14 @@ if [ $? 
-ne 0 ];then exit 1 fi +config_file=${targetdir}/vendors/config.ini +found_vendors="$(grep -w "load_priority" "$config_file" | cut --only-delimited -d"=" -f2-)" +found_vendor=$(echo $found_vendors | sed "s/$vendor_name//g" | tr ',' ' ') +vendor=$(echo $found_vendor | tr -s ' ' ',') +if [ "$vendor" != "" ]; then + sed -i "/load_priority=$found_vendors/s@load_priority=$found_vendors@load_priority=$vendor_name,$vendor@g" "$config_file" +fi + changemode() { if [ -d ${targetdir} ];then diff --git a/cplusplus/level1_single_api/4_op_dev/1_custom_op/tbe/impl/__init__.py b/cplusplus/level1_single_api/4_op_dev/1_custom_op/tbe/impl/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/cplusplus/level1_single_api/4_op_dev/1_custom_op/tbe/impl/add_dsl.py b/cplusplus/level1_single_api/4_op_dev/1_custom_op/tbe/impl/add_dsl.py new file mode 100644 index 000000000..8f28d0dc3 --- /dev/null +++ b/cplusplus/level1_single_api/4_op_dev/1_custom_op/tbe/impl/add_dsl.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- +""" +Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +This program is free software; you can redistribute it and/or modify +it under the terms of the Apache License Version 2.0.You may not use this file +except in compliance with the License. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +Apache License for more details at +http://www.apache.org/licenses/LICENSE-2.0 + +add +""" +from __future__ import absolute_import + +import tbe.dsl as tbe +from functools import reduce +from tbe import tvm +from tbe.common.register import register_op_compute +from tbe.common.utils import para_check +from tbe.common.utils import shape_util + +# General limitation of the reduce size for input shape: 2**31 +SHAPE_SIZE_LIMIT = 2147483648 + + +# pylint: disable=locally-disabled,too-many-arguments,unused-argument +@register_op_compute("Add", op_mode="dynamic", support_fusion=True) +def add_compute(input_x, input_y, output_z, kernel_name="add"): + """ + calculating data's add, c = a + b + + Parameters + ---------- + input_x: TVM tensor + the placeholder of first input data + input_y: TVM tensor + the placeholder of second input data + output_data: dict + shape and dtype of output, should be broadcast shape and type as input + kernel_name: str + cce kernel name, default value is add + + Returns + ------- + res : output of the data's add + """ + shape_x = shape_util.shape_to_list(input_x.shape) + shape_y = shape_util.shape_to_list(input_y.shape) + + shape_x, shape_y, shape_max = shape_util.broadcast_shapes(shape_x, shape_y, + param_name_input1="input_x", + param_name_input2="input_y") + shape_size = reduce(lambda x, y: x * y, shape_max[:]) + if shape_size > SHAPE_SIZE_LIMIT: + raise RuntimeError("the shape is too large to calculate") + + input_x = tbe.broadcast(input_x, shape_max) + input_y = tbe.broadcast(input_y, shape_max) + res = tbe.vadd(input_x, input_y) + + return res + + +@para_check.check_op_params(para_check.REQUIRED_INPUT, para_check.REQUIRED_INPUT, + para_check.REQUIRED_OUTPUT, para_check.KERNEL_NAME) +def add_dsl(input_x, input_y, output_z, kernel_name="add_dsl"): + """ + algorithm: add + calculating data's add, c = a + b + + Parameters + ---------- + input_x : dict + shape and dtype of first input, only support float16, float32, int32 + input_y : dict + shape and dtype of second input, only support 
float16, float32, int32 + output_z: dict + shape and dtype of output, should be broadcast shape and type as input + kernel_name : str + cce kernel name, default value is add + + Returns + ------- + None + """ + shape_x = input_x.get("shape") + shape_y = input_y.get("shape") + + check_tuple = ("float16", "float32", "int32") + input_data_type = input_x.get("dtype").lower() + para_check.check_dtype(input_data_type, check_tuple, param_name="input_x") + + shape_x, shape_y, shape_max = shape_util.broadcast_shapes(shape_x, shape_y, + param_name_input1="input_x", + param_name_input2="input_y") + + if shape_x[-1] == 1 and shape_y[-1] == 1 and shape_max[-1] == 1: + shape_x = shape_x if len(shape_x) == 1 else shape_x[:-1] + shape_y = shape_y if len(shape_y) == 1 else shape_y[:-1] + shape_max = shape_max if len(shape_max) == 1 else shape_max[:-1] + + data_x = tvm.placeholder(shape_x, name="data_1", dtype=input_data_type) + data_y = tvm.placeholder(shape_y, name="data_2", dtype=input_data_type) + + res = add_compute(data_x, data_y, output_z, kernel_name) + + with tvm.target.cce(): + schedule = tbe.auto_schedule(res) + + config = {"name": kernel_name, + "tensor_list": (data_x, data_y, res)} + tbe.build(schedule, config) diff --git a/cplusplus/level1_single_api/4_op_dev/1_custom_op/tbe/impl/conv2d_tik.py b/cplusplus/level1_single_api/4_op_dev/1_custom_op/tbe/impl/conv2d_tik.py new file mode 100644 index 000000000..3ee99eba5 --- /dev/null +++ b/cplusplus/level1_single_api/4_op_dev/1_custom_op/tbe/impl/conv2d_tik.py @@ -0,0 +1,171 @@ +""" +Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +This program is free software; you can redistribute it and/or modify +it under the terms of the Apache License Version 2.0.You may not use this file +except in compliance with the License. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +Apache License for more details at +http://www.apache.org/licenses/LICENSE-2.0 + +conv2d_tik +""" +from __future__ import absolute_import +import numpy as np +from tbe import tik +from tbe.common.platform import get_soc_spec + +DTYPE_SIZE = { + 'bool': 1, + 'uint8': 1, + 'int8': 1, + 'uint16': 2, + 'int16': 2, + 'int24': 3, + 'uint32': 4, + 'int32': 4, + 'float16': 2, + 'float32': 4, + 'int48': 6, + 'int64': 8, + 'uint64': 8, + 'float64':8 +} + + +def conv2d_tik_compute(params): + """ + conv2d tik compute + @param params: conv2d data + @return: tik instance + """ + tik_instance = tik.Tik() + + # get shape of feature map and weight + n, c1, h, w, c0 = params["fm_shape"] + c1, kh, kw, cout, c0 = params["weight_shape"] + + # get value of stride, dilation, pad + stride_h, stride_w = params["stride_list"] + dilation_h, dilation_w = params["dilation_list"] + pad_top, pad_bot, pad_left, pad_right = params["pad_list"] + + # calculate height and width + kh_dilation = (kh - 1) * dilation_h + 1 + kw_dilation = (kw - 1) * dilation_w + 1 + ho = int(np.ceil((h + pad_top + pad_bot - kh_dilation + 1) / stride_h)) + wo = int(np.ceil((w + pad_right + pad_left - kw_dilation + 1) / stride_w)) + round_howo = ((ho * wo + 16 - 1) // 16) * 16 + + fm_gm = tik_instance.Tensor(params['fm_dtype'], (n, c1, h, w, c0), + name='fm_gm', scope=tik.scope_gm) + weight_gm = tik_instance.Tensor(params['weight_type'], + (c1, kh, kw, cout, c0), name='weight_gm', + scope=tik.scope_gm) + dst_gm = tik_instance.Tensor(params['dst_gm_type'], + [n, cout // 16, ho, wo, 16], + name='dst_gm', scope=tik.scope_gm) + + core_num = params['core_num'] + pre_core_cout = cout // core_num + cout_iter_num = pre_core_cout // params["cout_split_factor"] + Cin_blocks = c1 + + with tik_instance.for_range(0, core_num, block_num=core_num) as cout_o: + with tik_instance.for_range(0, cout_iter_num, thread_num=1) as cout_i: + weight_L1 = tik_instance.Tensor( + params['weight_type'], (Cin_blocks, kh, kw, + params["cout_split_factor"], c0), + name='weight_l1', scope=tik.scope_cbuf) + tik_instance.data_move( + weight_L1, + weight_gm.flatten()[cout_o * pre_core_cout * c0 + + params["cout_split_factor"] * cout_i * c0], + 0, Cin_blocks * kh * kw, + params["cout_split_factor"], + (cout - params["cout_split_factor"]), 0) + + with tik_instance.for_range(0, n, thread_num=2) as n_index: + feature_map_l1 = tik_instance.Tensor(params['fm_dtype'], + (c1, h, w, c0), + name='feature_map_l1', + scope=tik.scope_cbuf) + tik_instance.data_move(feature_map_l1, + fm_gm[n_index, :, :, :, :], + 0, 1, c1 * h * w, 0, 0) + dst_l0c = tik_instance.Tensor( + params['dst_l0c_type'], [params["cout_split_factor"] // 16, + round_howo, 16], + name='dst_l0c', scope=tik.scope_cbuf_out) + + tik_instance.conv2d(dst_l0c, feature_map_l1, + weight_L1, (c1, h, w, c0), + (Cin_blocks, kh, kw, + params["cout_split_factor"], c0), + params['stride_list'], + [pad_left, pad_right, pad_top, pad_bot], + params['dilation_list'], + params['pad_value']) + + tik_instance.fixpipe( + dst_gm[n_index, (cout_o * pre_core_cout + params["cout_split_factor"] * cout_i) // + (32 // DTYPE_SIZE[params['dst_gm_type']]), 0, 0, 0], + dst_l0c, params["cout_split_factor"] // 16, + ho * wo * 16 * DTYPE_SIZE[params['dst_l0c_type']] // 32, 0, 0, + extend_params={"bias": None, + "quantize_params": params["quantize_params"]}) + + tik_instance.BuildCCE(kernel_name=params["kernel_name"], inputs=[fm_gm, weight_gm], outputs=[dst_gm]) + + return tik_instance + + +def conv2d_tik(inputs, weights, outputs, strides, pads, dilations, 
kernel_name="conv2d_tik"): + in_dtype = inputs.get("dtype") + w_dtype = weights.get("dtype") + res_dtype = outputs.get("dtype") + in_shape = inputs.get("shape") + w_shape = weights.get("ori_shape") + + if len(strides) != 4: + raise RuntimeError("strides shape should be 4d.") + if len(dilations) != 4: + raise RuntimeError("dilations shape should be 4d.") + if len(pads) != 4: + raise RuntimeError("pads shape should be 4d.") + if in_dtype != "float16" or w_dtype != "float16" or res_dtype != "float16": + raise RuntimeError("dtype shape should be float16.") + if weights.get("ori_format") != "NCHW": + raise RuntimeError("format should be NCHW.") + + if get_soc_spec("SOC_VERSION") in ["SD3403", "OPTG", "Hi3796CV300CS", "TsnsC"]: + loc_dtype = "float16" + quantize_params = None + else: + loc_dtype = "float32" + quantize_params = {"mode": "fp322fp16", "mode_param": None} + + stride_list = [strides[2], strides[3]] + dilation_list = [dilations[2], dilations[3]] + w5hd_shape = [w_shape[1] // 16, w_shape[2], w_shape[3], w_shape[0], 16] + + params = { + "fm_shape": in_shape, + "weight_shape": w5hd_shape, + "fm_dtype": in_dtype, + "weight_type": w_dtype, + "dst_l0c_type": loc_dtype, + "dst_gm_type": res_dtype, + "quantize_params": quantize_params, + "pad_list": pads, + "pad_value": 0, + "stride_list": stride_list, + "dilation_list": dilation_list, + "cout_split_factor": 64, + "core_num": 2, + "kernel_name": kernel_name} + + conv2d_tik_compute(params) diff --git a/cplusplus/level1_single_api/4_op_dev/1_custom_op/tbe/impl/leaky_relu_demo.py b/cplusplus/level1_single_api/4_op_dev/1_custom_op/tbe/impl/leaky_relu_demo.py new file mode 100644 index 000000000..0c070607a --- /dev/null +++ b/cplusplus/level1_single_api/4_op_dev/1_custom_op/tbe/impl/leaky_relu_demo.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- +""" +Copyright (C) 2018. Huawei Technologies Co., Ltd. All rights reserved. + +This program is free software; you can redistribute it and/or modify +it under the terms of the Apache License Version 2.0.You may not use this +file except in compliance with the License. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +Apache License for more details at +http://www.apache.org/licenses/LICENSE-2.0 + +cce extended operator builder wrapper +""" + +import tbe.dsl as tbe +from tbe import tvm +from tbe.common.register import register_op_compute +from tbe.common.utils import para_check +from tbe.common.utils import shape_util + +# pylint: disable=locally-disabled,unused-argument,invalid-name +@register_op_compute("LeakyReluDemo", op_mode="dynamic", support_fusion=True) +def leaky_relu_demo_compute(x, y, negative_slope=0, kernel_name="leaky_relu"): + """ + compute for caffe_relu_layer_cce + """ + inp_dtype = x.dtype.lower() + shape = x.shape + + # The original relu logic remains unchanged. 
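+    # A hedged numeric sketch (added for illustration, not part of the
+    # original kernel): with negative_slope = 0.1 the branches below give
+    # f(2.0) = 2.0 and f(-3.0) = max(-3.0, 0.1 * -3.0) = -0.3, i.e.
+    # f(x) = x for x >= 0 and f(x) = negative_slope * x for x < 0.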
+ if negative_slope == 0: + if inp_dtype in ("float32", "int32"): + tensor_zero = tbe.broadcast(tvm.const(0, inp_dtype), shape) + data_res = tbe.vmax(x, tensor_zero) + else: + data_res = tbe.vrelu(x) + + data_res = tbe.cast_to(data_res, inp_dtype) + + return data_res + # negative_slope != 0 + if inp_dtype in ("float16", "float32"): + slope_tmp = tvm.const(negative_slope, dtype=inp_dtype) + tmp = tbe.vmuls(x, slope_tmp) + if negative_slope <= 1: + res = tbe.vmax(x, tmp) + else: + res = tbe.vmin(x, tmp) + else: + # inp_dtype in ("int32", "int8") + slope_tmp = tvm.const(negative_slope, dtype=inp_dtype) + tmp = tbe.vmuls(x, slope_tmp) + tmp_oritype = tbe.cast_to(tmp, inp_dtype) + if negative_slope <= 1: + res = tbe.vmax(x, tmp_oritype) + else: + res = tbe.vmin(x, tmp_oritype) + + res = tbe.cast_to(res, inp_dtype) + + return res + + +@para_check.check_op_params(para_check.REQUIRED_INPUT, para_check.REQUIRED_OUTPUT, + para_check.OPTION_ATTR_FLOAT, para_check.KERNEL_NAME) +def leaky_relu_demo(x, y, negative_slope=0, kernel_name="leaky_relu"): + """leaky_relu op for input tensor + + f(x)= x(x>=0) or negative_slope*x(x<0) equal to + f(x)=negative_slope*x + + Parameters + ---------- + x : TVM tensor + input tensor has shape and dtype attributes + y : dict + dict with keys(shape and dtype) of output + + negative_slope : float or int + allow non-zero slope for negative inputs to speed up optimization + + kernel_name : str + cce kernel name, default value is "leaky_relu" + + Returns + ------ + None + """ + + # check input tensor shape + shape = x.get("shape") + dtype = x.get("dtype") + + # check input tensor data_type + check_list = ["float16", "float32", "int32", "int8"] + para_check.check_dtype(dtype.lower(), check_list, param_name="x") + + inp_dtype = dtype.lower() + input_data_x = tvm.placeholder(shape, name="input_data_x", dtype=inp_dtype) + + with tvm.target.cce(): + + res = leaky_relu_demo_compute(input_data_x, y, negative_slope, kernel_name) + sch = tbe.auto_schedule(res) + + config = {"name": kernel_name, + "tensor_list": [input_data_x, res]} + tbe.build(sch, config) diff --git a/cplusplus/level1_single_api/4_op_dev/1_custom_op/tbe/impl/matmul_tik.py b/cplusplus/level1_single_api/4_op_dev/1_custom_op/tbe/impl/matmul_tik.py new file mode 100644 index 000000000..d374832e8 --- /dev/null +++ b/cplusplus/level1_single_api/4_op_dev/1_custom_op/tbe/impl/matmul_tik.py @@ -0,0 +1,210 @@ +""" +Copyright 2020 Huawei Technologies Co., Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ +matmul_tik +""" + +from tbe import tik +from tbe.common.platform import get_soc_spec + +DTYPE_SIZE = { + 'bool': 1, + 'uint8': 1, + 'int8': 1, + 'uint16': 2, + 'int16': 2, + 'int24': 3, + 'uint32': 4, + 'int32': 4, + 'float16': 2, + 'float32': 4, + 'int48': 6, + 'int64': 8, + 'uint64': 8, + 'float64':8 +} + + +def MK_TO_K1MK0(tik_instance, mk_input_tensor, k1mk0_tensor, dtype, k1, m, k0): + """data move mk to k1mk0""" + src_ub = tik_instance.Tensor(dtype, (k1, m, k0), name='src_ub', scope=tik.scope_ubuf) + + # data_move(m, k) ---> (k1, m, k0) + with tik_instance.for_range(0, k1) as i: + tik_instance.data_move(src_ub[i * m * k0:], mk_input_tensor[i * k0:], 0, m, k0 * DTYPE_SIZE[dtype] // 32, + (k1 - 1) * k0 * DTYPE_SIZE[dtype] // 32, 0) + + tik_instance.data_move(k1mk0_tensor, src_ub, 0, 1, k1 * m * k0 * DTYPE_SIZE[dtype] // 32, 0, 0) + + +def KN_TO_K1NK0(tik_instance, kn_input_tensor, k1nk0_tensor, dtype, k1, n, k0): + """data move kn to k1nk0""" + + with tik_instance.for_range(0, k1) as index: + k1nk0_ub = tik_instance.Tensor(dtype, (n, k0), tik.scope_ubuf, "k1nk0_ub") + src_ub = tik_instance.Tensor(dtype, (k0, n), tik.scope_ubuf, "src_ub") + burst_len = k0 * n * DTYPE_SIZE[dtype] // 32 + tik_instance.data_move(src_ub, kn_input_tensor[index * k0 * n], 0, 1, burst_len, 0, 0) + dst_list = [k1nk0_ub[16 * i] for i in range(16)] + src_list = [src_ub[n * i] for i in range(16)] + rep_times = n // k0 + dst_rep_stride = k0 + src_rep_stride = 1 + tik_instance.vec_trans_scatter(False, False, dst_list, src_list, rep_times, dst_rep_stride, src_rep_stride) + tik_instance.data_move(k1nk0_tensor[index * k0 * n], k1nk0_ub, 0, 1, burst_len, 0, 0) + + +def N1MN0_TO_MN(tik_instance, mn_output_tensor, n1mn0_tensor, dtype, n1, m, n0): + """data move mn to n1mn0""" + src_ub = tik_instance.Tensor(dtype, (m, n1 * n0), name='src_ub', scope=tik.scope_ubuf) + + # data_move(n1, m, n0) ---> (m, n) + with tik_instance.for_range(0, n1) as i: + tik_instance.data_move(src_ub[i * n0:], n1mn0_tensor[i * m * n0:], 0, m, + n0 * DTYPE_SIZE[dtype] // 32, 0, (n1 - 1) * n0 * DTYPE_SIZE[dtype] // 32) + + tik_instance.data_move(mn_output_tensor, src_ub, 0, 1, m * n1 * n0 * DTYPE_SIZE[dtype] // 32, 0, 0) + + +def matmul_tik_compute(params, kernel_name): + """ + matmul tik compute + @param params: matmul data + @param kernel_name: kernel name + @return: tik instance + """ + tik_instance = tik.Tik() + if not isinstance(params, dict): + params = params.__dict__ + m_size, k_size, n_size = params['M'], params['K'], params['N'] + data_type = params["data_type"] + m_tiling_size = int(params["m_tiling_size"]) + n_tiling_size = int(params["n_tiling_size"]) + k_tiling_size = int(params['k_tiling_size']) + + m_cycle_times = params["m_cycle_times"] + n_cycle_times = params["n_cycle_times"] + k_cycle_times = params["k_cycle_times"] + + # Determine the output type + if data_type == "float16": + if get_soc_spec("SOC_VERSION") in ["SD3403", "OPTG", "Hi3796CV300CS", "TsnsC"]: + C_loc_out_type = "float16" + else: + C_loc_out_type = "float32" + K0 = 16 + else: + C_loc_out_type = "int32" + K0 = 32 + block_size = 16 + + n_thread_num = params['n_thread_num'] + m_thread_num = params['m_thread_num'] + k_thread_num = params['k_thread_num'] + + mk_gm_input = tik_instance.Tensor(data_type, (m_size, k_size), name="mk_input_gm", scope=tik.scope_gm) + kn_gm_input = tik_instance.Tensor(data_type, (k_size, n_size), name="kn_input_gm", scope=tik.scope_gm) + + k1mk0_workspace = tik_instance.Tensor(data_type, (k_size // K0, m_size, K0), name="k1mk0_workspace", + 
scope=tik.scope_gm, is_workspace=True) + + k1nk0_workspace = tik_instance.Tensor(data_type, (k_size // K0, n_size, K0), name="k1nk0_workspace", + scope=tik.scope_gm, is_workspace=True) + + mn_gm_output = tik_instance.Tensor(C_loc_out_type, (m_size, n_size), tik.scope_gm, name="mn_output_gm") + nmk0_workspace = tik_instance.Tensor(C_loc_out_type, (n_size // block_size, m_size, block_size), + name="nmk0_workspace", scope=tik.scope_gm, is_workspace=True) + + MK_TO_K1MK0(tik_instance, mk_gm_input, k1mk0_workspace, data_type, k_size // K0, m_size, K0) + KN_TO_K1NK0(tik_instance, kn_gm_input, k1nk0_workspace, data_type, k_size // K0, n_size, K0) + + # Tiling is realized through the for_range() loop. + with tik_instance.for_range(0, 2, block_num = 1) as core_id: + with tik_instance.for_range(0, n_cycle_times // 2, thread_num=n_thread_num) as n_idx: + with tik_instance.for_range(0, m_cycle_times, thread_num=m_thread_num) as m_idx: + dst_l0c = tik_instance.Tensor(C_loc_out_type, [n_tiling_size // 16, m_tiling_size, 16], name='dst_l0c', + scope=tik.scope_cbuf_out) + with tik_instance.for_range(0, k_cycle_times, + thread_num=k_thread_num) as k_idx: + # Calculation result data transfer. + inputa_l1 = tik_instance.Tensor(params['data_type'], [k_tiling_size // K0, m_tiling_size, K0], + name="A_tiling_l1", scope=tik.scope_cbuf) + tik_instance.data_move(inputa_l1, + k1mk0_workspace[k_idx * k_tiling_size // K0, m_idx * m_tiling_size, :], + 0, k_tiling_size // K0, m_tiling_size, m_size - m_tiling_size, 0) + inputb_l1 = tik_instance.Tensor(params["data_type"], [k_tiling_size // K0, n_tiling_size, K0], + name="B_tiling_l1", scope=tik.scope_cbuf) + if n_size - n_tiling_size > 65535: + with tik_instance.for_range(0, k_tiling_size // K0) \ + as dma_k_idx: + tik_instance.data_move(inputb_l1[dma_k_idx, :, :], + k1nk0_workspace[k_idx * k_tiling_size // K0 + dma_k_idx, + (core_id * n_cycle_times // 2 + n_idx) * n_tiling_size, :], + 0, 1, n_tiling_size, 0, 0) + else: + tik_instance.data_move(inputb_l1, k1nk0_workspace[k_idx * k_tiling_size // K0, + (core_id * n_cycle_times // 2 + n_idx) * n_tiling_size, :], + 0, k_tiling_size // K0, n_tiling_size, n_size - n_tiling_size, 0) + # Call matmul API to matrix multiplication calculation. 
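+                    # Illustrative note (ours, not from the original sample):
+                    # the k_idx == 0 tile initializes L0C via init_l1out=True;
+                    # every later k tile accumulates onto the same dst_l0c, so
+                    # after k_cycle_times iterations dst_l0c holds the full sum
+                    # over k tiles of A_tile @ B_tile for this (m, n) block.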
+ with tik_instance.if_scope(k_idx == 0): + tik_instance.matmul(dst_l0c, inputa_l1, inputb_l1, m_tiling_size, k_tiling_size, n_tiling_size, + init_l1out=True) + with tik_instance.else_scope(): + tik_instance.matmul(dst_l0c, inputa_l1, inputb_l1, m_tiling_size, k_tiling_size, n_tiling_size, + init_l1out=False) + tik_instance.fixpipe(nmk0_workspace[n_tiling_size // 16 * (core_id * n_cycle_times // 2 + n_idx), + m_idx * m_tiling_size, :], dst_l0c, n_tiling_size // 16, m_tiling_size * 16 * + DTYPE_SIZE[C_loc_out_type]//32, + (m_size - m_tiling_size) * 16 * DTYPE_SIZE[C_loc_out_type] // 32, 0) + + N1MN0_TO_MN(tik_instance, mn_gm_output, nmk0_workspace, C_loc_out_type, n_size // K0, m_size, K0) + + tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[mk_gm_input, kn_gm_input], outputs=[mn_gm_output]) + return tik_instance + + +def matmul_tik(input_x1, input_x2, output_y=None, kernel_name="simple_matmul"): + """ + matmul_tik main func + Parameters + ---------- + input_x1: input data 1 + input_x2: input data 2 + output_y: output dta + """ + shape_a = input_x1.get("ori_shape") + shape_b = input_x2.get("ori_shape") + output_y = output_y + m = shape_a[0] + k = shape_a[1] + n = shape_b[1] + data_type = input_x1.get("dtype").lower() + params = { + 'M': m, + 'K': k, + 'N': n, + 'data_type': data_type, + 'm_tiling_size': 16, + 'm_cycle_times': 1, + 'm_thread_num': 1, + 'n_tiling_size': 64, + 'n_cycle_times': 16, + 'n_thread_num': 1, + 'k_tiling_size': 32, + 'k_cycle_times': 2, + 'k_thread_num': 2, + 'output_y':output_y + } + return matmul_tik_compute(params, kernel_name) diff --git a/cplusplus/level1_single_api/4_op_dev/1_custom_op/tbe/impl/permute_tik.py b/cplusplus/level1_single_api/4_op_dev/1_custom_op/tbe/impl/permute_tik.py new file mode 100644 index 000000000..a7daab746 --- /dev/null +++ b/cplusplus/level1_single_api/4_op_dev/1_custom_op/tbe/impl/permute_tik.py @@ -0,0 +1,587 @@ +""" +Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved. + +This program is free software; you can redistribute it and/or modify +it under the terms of the Apache License Version 2.0.You may not use +this file except in compliance with the License. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the
+Apache License for more details at
+http://www.apache.org/licenses/LICENSE-2.0
+
+permute_tik
+"""
+
+from tbe import tik
+from tbe.common.platform import platform_info
+from tbe.common.utils import para_check
+
+# available ub size
+UB_SIZE_B = platform_info.get_soc_spec(platform_info.UB_SIZE)
+# available number of cores
+AICORE_NUM = platform_info.get_soc_spec(platform_info.CORE_NUM)
+
+
+# pylint: disable=invalid-name,too-many-locals,too-many-arguments
+@para_check.check_op_params(para_check.REQUIRED_INPUT, para_check.REQUIRED_OUTPUT,
+                            para_check.OPTION_ATTR_LIST_INT, para_check.KERNEL_NAME)
+def permute_tik(x, y, order=(0,), kernel_name="permute_tik"):
+    """
+    only supports NCHW -> NHWC
+
+    Parameters
+    ----------
+    x : dict
+        shape and dtype of input
+    y : dict
+        shape and dtype of output, should be same shape and type as input
+    order: tuple, list
+        axis transformation order
+    kernel_name : str
+        kernel name, default value is "permute_tik"
+
+    Returns
+    -------
+    None
+    """
+    shape = x.get("shape")
+    dtype = y.get("dtype")
+    input_dtype = dtype.lower()
+    supported_dtype = ["float16"]
+    input_format = x.get("format")
+    check_pass = False
+    if input_format == 'NCHW':
+        if len(order) == 4 and order[0] == 0 \
+                and order[1] == 2 and order[2] == 3 and order[3] == 1:
+            check_pass = True
+    if not check_pass:
+        raise RuntimeError("only supports NCHW -> NHWC")
+    para_check.check_dtype_rule(input_dtype, supported_dtype)
+    para_check.check_dtype_rule(dtype, supported_dtype)
+    para_check.check_shape_rule(shape)
+    para_check.check_shape_size(shape)
+    para_check.check_kernel_name(kernel_name)
+
+    input_dict = {
+        "x": x,
+        "y": y,
+        "order": order
+    }
+    permute_process = Permute(input_dict)
+    permute_process.permute_compute()
+    permute_process.instance.BuildCCE(kernel_name=kernel_name,
+                                      inputs=permute_process.x_gm,
+                                      outputs=permute_process.y_gm)
+
+    return permute_process.instance
+
+
+def get_shape_size(shape):
+    """
+    get the number of elements from the shape
+
+    Parameters
+    ----------
+    shape: output shape
+
+    Returns
+    -------
+    total_number: the number of elements in the shape
+    """
+    total_number = 1
+    for val in shape:
+        total_number = total_number * val
+
+    return total_number
+
+
+def get_block_num_and_loop_cycle(shape):
+    """
+    get block dim and loop cycle
+
+    Parameters
+    ----------
+    shape: input shape
+
+    Returns
+    -------
+    block_num: the number of cores
+    inner_loop: the number of cycles per core
+    inner_loop_mod: the number of remaining cycles
+    thread_num: whether to enable double buffer 1:false 2:true
+    """
+    batch, col_len, row_len = shape
+    size = batch * col_len * row_len
+    block_num = AICORE_NUM
+    inner_loop = 1
+    inner_loop_mod = 0
+    thread_num = 1
+
+    if size <= 16:
+        block_num = 1
+        return block_num, inner_loop, inner_loop_mod, thread_num
+
+    all_block_num = shape[0]
+    if col_len * row_len >= 16:
+        if all_block_num < AICORE_NUM:
+            block_num = all_block_num
+    else:
+        chw = col_len * row_len
+        num = (16 + chw) // chw
+        if batch // num < AICORE_NUM:
+            block_num = batch // num
+    inner_loop = all_block_num // block_num
+    inner_loop_mod = all_block_num % block_num
+    if inner_loop > 1:
+        thread_num = 2
+    return block_num, inner_loop, inner_loop_mod, thread_num
+
+
+class Permute:
+    """
+    Function: store permute parameters and compute permute
+    """
+
+    def __init__(self, input_dict):
+        """
+        init the permute parameters
+        """
+        self.instance = tik.Tik()
+        self.dtype = input_dict.get("x").get("dtype").lower()
+        self.dsize = 2
+        size = 
get_shape_size(input_dict.get("x").get("shape")) + self.x_gm = self.instance.Tensor(self.dtype, (size,), name="x_gm", + scope=tik.scope_gm) + self.y_gm = self.instance.Tensor(self.dtype, (size,), name="y_gm", + scope=tik.scope_gm) + ub_size = (UB_SIZE_B - 1024) // 4 // self.dsize // 256 * 256 + self.ub_size = ub_size + self.input_dict = input_dict + + def get_shape_info(self): + """ + determine whether to convert the shape based on the input shape + """ + shape = self.input_dict.get("x").get("shape") + if shape[1] == 1 or shape[2] * shape[3] == 1: + shape_size = get_shape_size(shape) + shape_new = [shape_size] + order_new = [0] + shape_out_new = [shape_size] + else: + n_i = shape[0] + col_len = shape[1] + row_len = shape[2] * shape[3] + shape_new = [n_i, col_len, row_len] + order_new = [0, 2, 1] + shape_out_new = [] + for i in order_new: + shape_out_new.append(shape_new[i]) + return shape_new, order_new, shape_out_new + + def move_without_transform(self, shape): + """ + when C = 1 or H*W = 1, directly move data in and out + """ + ub_size = (UB_SIZE_B - 1024) // 2 // self.dsize // 16 * 16 + if shape[0] <= 16: + block_num = 1 + else: + all_block_num = shape[0] // 16 + block_num = AICORE_NUM + if all_block_num < AICORE_NUM: + block_num = all_block_num + each_len = shape[0] // block_num + each_mod = shape[0] % block_num + thread_num = 1 + if each_len // ub_size > 1: + thread_num = 2 + + with self.instance.for_range(0, block_num, block_num=block_num) \ + as block_id: + each_size = self.instance.Scalar("int32") + each_size.set_as(each_len) + with self.instance.if_scope(block_id == block_num - 1): + each_size.set_as(each_len + each_mod) + ub_loop = each_size // ub_size + ub_mod = each_size % ub_size + with self.instance.for_range(0, + ub_loop, + thread_num=thread_num) as loop_id: + src_ub = self.instance.Tensor(self.dtype, (ub_size,), + name="src_ub", + scope=tik.scope_ubuf) + burst_len = ub_size // 16 + self.instance.data_move( + src_ub, + self.x_gm[each_len * block_id + loop_id * ub_size], + 0, 1, burst_len, 0, 0) + self.instance.data_move( + self.y_gm[each_len * block_id + loop_id * ub_size], + src_ub, + 0, 1, burst_len, 0, 0) + with self.instance.if_scope(ub_mod > 0): + src_ub = self.instance.Tensor(self.dtype, (ub_size,), + name="src_ub", + scope=tik.scope_ubuf) + with self.instance.if_scope( + tik.all(block_num > 1, ub_mod % 16 != 0)): + src_ub_1 = self.instance.Tensor(self.dtype, (16,), + name="src_ub_1", + scope=tik.scope_ubuf) + index = each_len * block_id + ub_loop * ub_size + with self.instance.if_scope(ub_mod >= 16): + burst_len = ub_mod // 16 + self.instance.data_move(src_ub, + self.x_gm[index], + 0, 1, burst_len, 0, 0) + self.instance.data_move(self.y_gm[index], + src_ub, + 0, 1, burst_len, 0, 0) + offset = index + burst_len * 16 - 16 + ub_mod % 16 + self.instance.data_move(src_ub_1, + self.x_gm[offset], + 0, 1, 1, 0, 0) + self.instance.data_move(self.y_gm[offset], + src_ub_1, + 0, 1, 1, 0, 0) + with self.instance.else_scope(): + offset = index - 16 + ub_mod % 16 + self.instance.data_move(src_ub_1, + self.x_gm[offset], + 0, 1, 1, 0, 0) + self.instance.data_move(self.y_gm[offset], + src_ub_1, + 0, 1, 1, 0, 0) + with self.instance.else_scope(): + burst_len = (ub_mod + 15) // 16 + self.instance.data_move( + src_ub, + self.x_gm[ + each_len * block_id + ub_loop * ub_size], + 0, 1, burst_len, 0, 0) + self.instance.data_move( + self.y_gm[ + each_len * block_id + ub_loop * ub_size], + src_ub, + 0, 1, burst_len, 0, 0) + + def trans_scatter(self, col_len_ub, row_len_ub, src_ub, dst_ub): + """ 
+ transposes the data + """ + c_zu = col_len_ub // 16 + r_zu = row_len_ub // 16 + with self.instance.for_range(0, r_zu) as num_r: + repeat = c_zu + src_stride = 0 + dst_stride = 0 + if repeat != 1: + src_stride = 16 * r_zu + dst_stride = 1 + dst_list = [dst_ub[16 * col_len_ub * num_r + 16 * c_zu * i] for i in range(16)] + src_list = [src_ub[16 * num_r + 16 * r_zu * j] for j in range(16)] + self.instance.vec_trans_scatter(False, False, dst_list, + src_list, repeat, + dst_stride, src_stride) + + def move_gm_to_ub(self, row_len, col_len_ub, row_len_ub, src_ub, index): + """ + move data from gm to ub + """ + stride = (row_len - row_len_ub) // 16 + row_len_ub_align = (row_len_ub + 15) // 16 * 16 + if row_len % 16 == 0 and stride < 65535: + n_burst = col_len_ub + burst_len = row_len_ub_align // 16 + self.instance.data_move(src_ub, + self.x_gm[index], + 0, n_burst, burst_len, stride, 0) + else: + with self.instance.for_range(0, col_len_ub) as c_i: + burst_len = row_len_ub_align // 16 + self.instance.data_move( + src_ub[c_i * row_len_ub_align], + self.x_gm[index + c_i * row_len], + 0, 1, burst_len, 0, 0) + + def move_ub_to_gm(self, col_len, col_len_ub, row_len_ub, index, dst_ub): + """ + move data from ub to gm when c >= 16 + """ + stride = (col_len - col_len_ub) // 16 + if col_len % 16 == 0 and stride < 65535: + n_burst = row_len_ub + burst_len = col_len_ub // 16 + self.instance.data_move(self.y_gm[index], + dst_ub, + 0, n_burst, burst_len, 0, stride) + else: + with self.instance.for_range(0, row_len_ub) as r_i: + burst_len = col_len_ub // 16 + self.instance.data_move( + self.y_gm[index + r_i * col_len], + dst_ub[r_i * col_len_ub], + 0, 1, burst_len, 0, 0) + + def move_ub_to_gm_with_tail(self, input_dict): + """ + move data from ub to gm when c < 16 + """ + shape = input_dict.get("shape") + dst_ub = input_dict.get("dst_ub") + ub_tail = input_dict.get("ub_tail") + tail_offset = input_dict.get("tail_offset") + tail_num = input_dict.get("tail_num") + block_num = input_dict.get("block_num") + row_index = input_dict.get("row_index") + out_index = input_dict.get("out_index") + tail_start = input_dict.get("tail_start") + total_loop = input_dict.get("total_loop") + r_i = input_dict.get("r_i") + num = input_dict.get("num") + _, col_len, row_len = shape + col_len_align = (col_len + 15) // 16 * 16 + with self.instance.if_scope( + tik.all(row_index >= num, block_num > 1)): + scalar = self.instance.Scalar(ub_tail.dtype) + with self.instance.for_range(0, col_len) as time: + scalar.set_as(dst_ub[r_i * col_len_align + time]) + ub_tail[tail_offset + time].set_as(scalar) + tail_offset.set_as(tail_offset + col_len) + with self.instance.if_scope( + row_index == total_loop * row_len - 1): + each_burst_num = 32 // self.dsize + n_burst = self.instance.Scalar("int32") + n_burst.set_as((tail_num * self.dsize) // 32) + mod = self.instance.Scalar("int32") + mod.set_as((tail_num * self.dsize) % 32) + # 32b alignment + with self.instance.if_scope(mod == 0): + self.instance.data_move(self.y_gm[tail_start], ub_tail, 0, + 1, n_burst, 0, 0) + # bigger than 32b + with self.instance.else_scope(): + self.instance.data_move(self.y_gm[tail_start], ub_tail, 0, + 1, n_burst, 0, 0) + offset = tail_num - each_burst_num + scalar = self.instance.Scalar(ub_tail.dtype) + with self.instance.for_range(0, each_burst_num) as time: + scalar.set_as(ub_tail[offset + time]) + ub_tail[time].set_as(scalar) + self.instance.data_move(self.y_gm[tail_start + offset], + ub_tail, 0, 1, 1, 0, 0) + with self.instance.else_scope(): + burst_len = col_len_align 
// 16 + self.instance.data_move( + self.y_gm[out_index], + dst_ub[r_i * col_len_align], + 0, 1, burst_len, 0, 0) + + def compute_c_lt_16(self, input_dict): + """ + processing the scenario where c is less than 16 + """ + n_id = input_dict.get("n_id") + total_loop = input_dict.get("each_loop") + tail_offset = input_dict.get("tail_offset") + ub_tail = input_dict.get("ub_tail") + shape = input_dict.get("shape") + x_index = input_dict.get("x_index") + block_num = input_dict.get("block_num") + _, col_len, row_len = shape + col_len_align = (col_len + 15) // 16 * 16 + row_len_ub = self.ub_size // col_len_align // 16 * 16 + row_loop = row_len // row_len_ub + row_mod = row_len % row_len_ub + last_num = (16 + col_len - 1) // col_len + num = total_loop * row_len - last_num + src_ub = self.instance.Tensor(self.dtype, (self.ub_size,), + name="src_ub", + scope=tik.scope_ubuf) + dst_ub = self.instance.Tensor(self.dtype, (self.ub_size,), + name="dst_ub", + scope=tik.scope_ubuf) + if row_loop > 0: + with self.instance.for_range(0, row_loop) as r_loop: + in_index = x_index + n_id * col_len * row_len + \ + row_len_ub * r_loop + self.move_gm_to_ub(row_len, col_len, row_len_ub, src_ub, + in_index) + self.trans_scatter(col_len_align, row_len_ub, src_ub, + dst_ub) + with self.instance.for_range(0, row_len_ub) as r_i: + row_index = n_id * row_len + row_len_ub * r_loop + r_i + out_index = x_index + n_id * col_len * row_len + \ + col_len * row_len_ub * r_loop + r_i * col_len + tail_start = x_index + total_loop * row_len * col_len - \ + last_num * col_len + input_dict = { + "shape": shape, + "dst_ub": dst_ub, + "ub_tail": ub_tail, + "tail_offset": tail_offset, + "tail_num": col_len * last_num, + "block_num": block_num, + "row_index": row_index, + "out_index": out_index, + "tail_start": tail_start, + "total_loop": total_loop, + "r_i": r_i, + "num": num, + } + self.move_ub_to_gm_with_tail(input_dict) + + if row_mod > 0: + in_index = x_index + n_id * col_len * row_len + \ + row_len_ub * row_loop + self.move_gm_to_ub(row_len, col_len, row_mod, src_ub, in_index) + row_mod_align = (row_mod + 15) // 16 * 16 + self.trans_scatter(col_len_align, row_mod_align, src_ub, dst_ub) + with self.instance.for_range(0, row_mod) as r_i: + row_index = n_id * row_len + row_len_ub * row_loop + r_i + out_index = x_index + n_id * col_len * row_len + \ + col_len * row_len_ub * row_loop + r_i * col_len + tail_start = x_index + total_loop * row_len * col_len - \ + last_num * col_len + input_dict = { + "shape": shape, + "dst_ub": dst_ub, + "ub_tail": ub_tail, + "tail_offset": tail_offset, + "tail_num": col_len * last_num, + "block_num": block_num, + "row_index": row_index, + "out_index": out_index, + "tail_start": tail_start, + "total_loop": total_loop, + "r_i": r_i, + "num": num, + } + self.move_ub_to_gm_with_tail(input_dict) + + def compute_c_ge_16(self, shape, x_index): + """ + processing the scenario where the value of c is greater than + or equal to 16 + """ + _, col_len, row_len = shape + ub_div_16 = self.ub_size // 16 + col_div_16 = col_len // 16 * 16 + col_len_ub = ub_div_16 if ub_div_16 < col_div_16 else col_div_16 + ub_div_col = self.ub_size // col_len_ub // 16 * 16 + row_len_ub = ub_div_col if ub_div_col < row_len else row_len + row_len_ub_align = (row_len_ub + 15) // 16 * 16 + col_loop = col_len // col_len_ub + col_mod = col_len % col_len_ub + row_loop = row_len // row_len_ub + row_mod = row_len % row_len_ub + src_ub = self.instance.Tensor(self.dtype, (self.ub_size,), + name="src_ub", + scope=tik.scope_ubuf) + dst_ub = 
self.instance.Tensor(self.dtype, (self.ub_size,), + name="dst_ub", + scope=tik.scope_ubuf) + if col_loop > 0: + with self.instance.for_range(0, col_loop) as c_loop: + with self.instance.for_range(0, row_loop) as r_loop: + in_index = x_index + c_loop * col_len_ub * row_len + \ + row_len_ub * r_loop + self.move_gm_to_ub(row_len, col_len_ub, row_len_ub, src_ub, + in_index) + self.trans_scatter(col_len_ub, row_len_ub_align, src_ub, + dst_ub) + out_index = x_index + col_len * row_len_ub * r_loop + \ + c_loop * col_len_ub + self.move_ub_to_gm(col_len, col_len_ub, row_len_ub, + out_index, dst_ub) + if row_mod > 0: + in_index = x_index + c_loop * col_len_ub * row_len + \ + row_len_ub * row_loop + row_mod_align = (row_mod + 15) // 16 * 16 + self.move_gm_to_ub( + row_len, col_len_ub, row_mod, src_ub, in_index) + self.trans_scatter(col_len_ub, row_mod_align, src_ub, + dst_ub) + out_index = x_index + col_len * row_len_ub * row_loop + \ + c_loop * col_len_ub + self.move_ub_to_gm(col_len, col_len_ub, row_mod, + out_index, dst_ub) + if col_mod > 0: + col_mod_align = (col_mod + 15) // 16 * 16 + offset = col_mod_align - col_mod + with self.instance.for_range(0, row_loop) as r_loop: + in_index = x_index + (col_loop * col_len_ub - offset) * \ + row_len + row_len_ub * r_loop + self.move_gm_to_ub( + row_len, col_mod_align, row_len_ub, src_ub, in_index) + self.trans_scatter(col_mod_align, row_len_ub_align, src_ub, + dst_ub) + out_index = x_index + col_len * row_len_ub * r_loop + \ + col_loop * col_len_ub - offset + self.move_ub_to_gm(col_len, col_mod_align, row_len_ub, + out_index, dst_ub) + if row_mod > 0: + in_index = x_index + (col_loop * col_len_ub - offset) * \ + row_len + row_len_ub * row_loop + self.move_gm_to_ub(row_len, col_mod_align, row_mod, src_ub, + in_index) + self.trans_scatter(col_mod_align, row_mod_align, src_ub, + dst_ub) + out_index = x_index + col_len * row_len_ub * row_loop + \ + col_loop * col_len_ub - offset + self.move_ub_to_gm(col_len, col_mod_align, row_mod, + out_index, dst_ub) + + def permute_compute(self): + """ + compute permute + """ + shape, order, _ = self.get_shape_info() + if order != [0, 2, 1]: + self.move_without_transform(shape) + else: + _, col_len, row_len = shape + block_num, inner_loop, tail, thread_num = \ + get_block_num_and_loop_cycle(shape) + element_num = col_len * row_len + + with self.instance.for_range(0, block_num, block_num=block_num) \ + as block_id: + each_loop = self.instance.Scalar("int32") + each_loop.set_as(inner_loop) + offset = self.instance.Scalar("int32") + if tail > 0: + with self.instance.if_scope(block_id < tail): + each_loop.set_as(each_loop + 1) + offset.set_as(block_id * each_loop) + if tail > 0: + with self.instance.if_scope(block_id >= tail): + offset.set_as(block_id * each_loop + tail) + x_index = self.instance.Scalar("int32") + x_index.set_as(offset * element_num) + ub_tail = self.instance.Tensor(self.dtype, (256,), + name="ub_tail", + scope=tik.scope_ubuf) + tail_offset = self.instance.Scalar("int32") + tail_offset.set_as(0) + with self.instance.for_range(0, + each_loop, + thread_num=thread_num) as n_id: + if col_len >= 16: + index = self.instance.Scalar("int32") + index.set_as(x_index + n_id * col_len * row_len) + self.compute_c_ge_16(shape, index) + else: + input_dict = { + "n_id": n_id, + "each_loop": each_loop, + "tail_offset": tail_offset, + "ub_tail": ub_tail, + "shape": shape, + "x_index": x_index, + "block_num": block_num + } + self.compute_c_lt_16(input_dict) diff --git 
a/cplusplus/level1_single_api/4_op_dev/1_custom_op/tbe/impl/scatter_nd_add.py b/cplusplus/level1_single_api/4_op_dev/1_custom_op/tbe/impl/scatter_nd_add.py
new file mode 100644
index 000000000..b6ab2cf8e
--- /dev/null
+++ b/cplusplus/level1_single_api/4_op_dev/1_custom_op/tbe/impl/scatter_nd_add.py
@@ -0,0 +1,860 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+"""
+Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the Apache License Version 2.0. You may not use this file
+except in compliance with the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+Apache License for more details at
+http://www.apache.org/licenses/LICENSE-2.0
+
+scatter_nd_add
+"""
+import math
+from functools import reduce as functools_reduce
+
+from tbe import tik
+import te.platform as tbe_platform
+from tbe.common.platform import platform_info
+from tbe.common.utils import para_check
+
+# neg two
+NEG_TWO = -2
+
+# neg one
+NEG_ONE = -1
+
+
+# pylint: disable=too-many-arguments,too-many-instance-attributes
+class Scatter():
+    """
+    Function: use to store scatter base parameters
+    Modify : 2019-10-28
+    """
+
+    # pylint: disable=too-many-statements
+    def __init__(self, var, indices, updates, var_out, nd_flag, kernel_name,
+                 compute_type):
+        """
+        Init scatter base parameters
+
+        Parameters
+        ----------
+        var: dict
+            data of input
+            datatype supports float32, float16, int32, int8, uint8
+        indices: dict
+            data of indices
+            datatype supports int32
+        updates: dict
+            data of updates
+            datatype supports float32, float16, int32, int8, uint8
+        var_out: dict
+            data of input
+        nd_flag: bool
+            if this op is nd operator
+        kernel_name: str
+            the name of the operator
+        compute_type: str
+            the compute type of scatter
+        Returns
+        -------
+        None
+        """
+        self.tik_instance = tik.Tik()
+        self.nd_flag = nd_flag
+        self.var_shape = var.get("shape")
+        self.var_dtype = var.get("dtype").lower()
+        self.indices_shape = indices.get("shape")
+        self.indices_dtype = indices.get("dtype").lower()
+        self.updates_shape = updates.get("shape")
+        self.updates_dtype = updates.get("dtype").lower()
+        self.var_ele_num = functools_reduce(lambda x, y: x * y, self.var_shape)
+        self.indices_num = functools_reduce(lambda x, y: x * y,
+                                            self.indices_shape)
+        self.updates_num = functools_reduce(lambda x, y: x * y,
+                                            self.updates_shape)
+        self.kernel_name = kernel_name
+
+        if self.indices_shape == (1,) and \
+                len(self.var_shape)-len(self.updates_shape) == 1:
+            if not nd_flag:
+                self.updates_shape = (1,) + self.updates_shape
+
+        self.check_param(var_out)
+        if nd_flag:
+            if self.indices_shape[-1] == len(self.var_shape):
+                self.update_data_num = 1
+            else:
+                self.update_data_num = functools_reduce(
+                    lambda x, y: x * y, self.var_shape[self.indices_shape[-1]:])
+            self.max_indice = functools_reduce(
+                lambda x, y: x * y, self.var_shape[0:self.indices_shape[-1]])
+            self.index_dims = self.indices_shape[-1]
+        else:
+            if len(self.var_shape) > 1:
+                self.update_data_num = functools_reduce(lambda x, y: x * y,
+                                                        self.var_shape[1:])
+            else:
+                self.update_data_num = 1
+            self.max_indice = self.var_shape[0]
+            self.index_dims = 1
+
+        self.compute_type = compute_type
+
+        self.ub_size_bytes = (
+            platform_info.get_soc_spec(
+                platform_info.UB_SIZE) - 8192)
+        self.var_dtype_bytes_size = tbe_platform.get_bit_len(
+            self.var_dtype) // 8
+        self.indices_dtype_bytes_size = 
tbe_platform.get_bit_len( + self.indices_dtype) // 8 + self.var_data_each_block = 32 // self.var_dtype_bytes_size + self.indices_data_each_block = 32 // self.indices_dtype_bytes_size + self.indices_ub_number = 0 + self.updates_ub_number = 0 + + self.index_loop_num = 0 + + self.max_num_one_repeat = 128 + if self.var_dtype in ("float32", "int32"): + self.max_num_one_repeat = 64 + + if self.update_data_num < self.var_data_each_block: + self.block_num = 1 + else: + ai_core_num = platform_info.get_soc_spec( + platform_info.CORE_NUM) + self.indice_step = math.ceil(self.max_indice / ai_core_num) + self.block_num = math.ceil(self.max_indice / self.indice_step) + + self.var_gm = self.tik_instance.Tensor( + self.var_dtype, self.var_shape, name="var_gm", scope=tik.scope_gm) + self.indices_gm = self.tik_instance.Tensor( + self.indices_dtype, + self.indices_shape, + name="indices_gm", + scope=tik.scope_gm) + self.updates_gm = self.tik_instance.Tensor( + self.updates_dtype, + self.updates_shape, + name="updates_gm", + scope=tik.scope_gm) + self.out_gm = self.tik_instance.Tensor( + self.var_dtype, self.var_shape, name="out_gm", scope=tik.scope_gm) + + self.vconv_dst_dtype = "float16" + + self.init_ub_tensor_para() + self.var_vconv_ub = None + self.updates_vconv_ub = None + self.var_tile_vconv_ub = None + self.updates_tile_vconv_ub = None + + self.var_ub = None + self.updates_ub = None + self.indices_ub = None + self.var_tile_ub = None + self.updates_tile_ub = None + + self.var_read_index = None + self.updates_read_index = None + self.indices_loop_index = None + self.indices_tmp = None + + def init_ub_tensor_para(self): + """ + Compute the ub size of tensors + + Parameters + ---------- + None + + Returns + ------- + None + """ + updates_size_bytes = self.var_dtype_bytes_size * self.update_data_num + indices_size_bytes = self.indices_dtype_bytes_size * self.indices_num + + need_vconv_dtype = ("int8", "uint8") + if self.var_dtype in need_vconv_dtype: + vconv_dtype_bytes_size = tbe_platform.get_bit_len( + self.vconv_dst_dtype) + vconv_data_each_block = 32 // vconv_dtype_bytes_size + vconv_size_bytes = ( + updates_size_bytes // self.var_dtype_bytes_size * + vconv_dtype_bytes_size) + if (updates_size_bytes + vconv_size_bytes) * 2 < ( + self.ub_size_bytes * 0.9): + self.updates_ub_number = math.ceil( + self.update_data_num / + self.var_data_each_block) * self.var_data_each_block + + self.vconv_ub_number = math.ceil( + self.update_data_num / + vconv_data_each_block) * vconv_data_each_block + + self.indices_ub_number = ( + self.ub_size_bytes - updates_size_bytes * 2 - + vconv_size_bytes * 2) // self.indices_dtype_bytes_size + + self.indices_ub_number = math.ceil( + self.indices_ub_number / + self.indices_data_each_block) * self.indices_data_each_block + + elif indices_size_bytes < (self.ub_size_bytes * 0.9): + self.indices_ub_number = math.ceil( + self.indices_num / + self.indices_data_each_block) * self.indices_data_each_block + self.updates_ub_number = ( + self.ub_size_bytes - + indices_size_bytes) // self.var_dtype_bytes_size // 6 + + self.updates_ub_number = math.ceil( + self.updates_ub_number / + self.var_data_each_block) * self.var_data_each_block + + self.vconv_ub_number = math.ceil( + self.updates_ub_number / + vconv_data_each_block) * vconv_data_each_block + + else: + self.updates_ub_number = ( + self.ub_size_bytes // 2 // + (vconv_dtype_bytes_size + self.var_dtype_bytes_size) // 2 // + self.var_data_each_block * self.var_data_each_block) + self.indices_ub_number = ( + self.ub_size_bytes // 
self.indices_dtype_bytes_size // 2 // + self.var_data_each_block * self.var_data_each_block) + self.vconv_ub_number = self.updates_ub_number + + else: + if updates_size_bytes * 2 < self.ub_size_bytes * 0.9: + self.updates_ub_number = math.ceil( + self.update_data_num / + self.var_data_each_block) * self.var_data_each_block + self.indices_ub_number = ( + self.ub_size_bytes - + updates_size_bytes * 2) // self.indices_dtype_bytes_size + self.indices_ub_number = math.ceil( + self.indices_ub_number / + self.indices_data_each_block) * self.indices_data_each_block + if self.indices_num < self.indices_ub_number: + self.indices_ub_number = math.ceil( + self.indices_num / self.indices_data_each_block + ) * self.indices_data_each_block + elif indices_size_bytes < self.ub_size_bytes * 0.9: + self.indices_ub_number = math.ceil( + self.indices_num / + self.indices_data_each_block) * self.indices_data_each_block + + self.updates_ub_number = ( + self.ub_size_bytes - + indices_size_bytes) // 2 // self.var_dtype_bytes_size + + self.updates_ub_number = math.ceil( + self.updates_ub_number / + self.var_data_each_block) * self.var_data_each_block + else: + self.indices_ub_number = ( + self.ub_size_bytes // self.indices_dtype_bytes_size // 2 // + self.indices_data_each_block * self.indices_data_each_block) + self.updates_ub_number = ( + self.indices_ub_number // 2 // self.var_data_each_block * + self.var_data_each_block) + + last_num = self.update_data_num % self.updates_ub_number + if (last_num < self.var_data_each_block and + self.update_data_num > self.updates_ub_number): + self.updates_ub_number -= self.var_data_each_block + + def init_ub_tensor(self): + """ + Compute the ub size of tensors + + Parameters + ---------- + None + + Returns + ------- + None + """ + need_vconv_dtype = ("int8", "uint8") + if self.var_dtype in need_vconv_dtype: + self.var_vconv_ub = self.tik_instance.Tensor( + self.vconv_dst_dtype, (self.vconv_ub_number,), + name="var_vconv_ub", + scope=tik.scope_ubuf) + self.updates_vconv_ub = self.tik_instance.Tensor( + self.vconv_dst_dtype, (self.vconv_ub_number,), + name="updates_vconv_ub", + scope=tik.scope_ubuf) + + self.var_tile_vconv_ub = self.tik_instance.Tensor( + self.vconv_dst_dtype, (self.var_data_each_block,), + name="var_tile_vconv_ub", + scope=tik.scope_ubuf) + self.updates_tile_vconv_ub = self.tik_instance.Tensor( + self.vconv_dst_dtype, (self.var_data_each_block,), + name="updates_tile_vconv_ub", + scope=tik.scope_ubuf) + + self.var_ub = self.tik_instance.Tensor( + self.var_dtype, (self.updates_ub_number,), + name="var_ub", + scope=tik.scope_ubuf) + self.updates_ub = self.tik_instance.Tensor( + self.updates_dtype, (self.updates_ub_number,), + name="updates_ub", + scope=tik.scope_ubuf) + self.indices_ub = self.tik_instance.Tensor( + self.indices_dtype, (self.indices_ub_number,), + name="indices_ub", + scope=tik.scope_ubuf) + + self.var_tile_ub = self.tik_instance.Tensor( + self.var_dtype, (self.var_data_each_block,), + name="var_tile_ub", + scope=tik.scope_ubuf) + self.updates_tile_ub = self.tik_instance.Tensor( + self.updates_dtype, (self.var_data_each_block,), + name="updates_tile_ub", + scope=tik.scope_ubuf) + + self.var_read_index = self.tik_instance.Scalar("int32") + self.var_read_index.set_as(0) + + self.updates_read_index = self.tik_instance.Scalar("int32") + self.updates_read_index.set_as(0) + + self.indices_loop_index = self.tik_instance.Scalar("int32") + self.indices_loop_index.set_as(0) + + self.indices_tmp = self.tik_instance.Scalar("int32") + self.indices_tmp.set_as(0) 
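+
+    # Illustrative example (assumed shapes, not taken from the sample): for an
+    # ND update with var_shape = (4, 5, 6) and indices_shape[-1] = 2, the
+    # helper below flattens an index pair (i, j) to i * 5 + j, i.e. each
+    # leading coordinate is scaled by the product of the remaining indexed
+    # dimensions before the offsets are summed.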
+ + def get_var_read_index(self, indices_ub_index): + """ + Calculate the index of the read var + + Parameters + ---------- + indices_ub_index: int32 + the index of the currently traversed indices in UB + + Returns + ------- + None + """ + if not self.nd_flag: + self.var_read_index.set_as(self.indices_ub[indices_ub_index]) + else: + indices_ub_index = indices_ub_index * self.indices_shape[-1] + self.var_read_index.set_as(0) + if self.indices_shape[-1] == 1: + self.var_read_index.set_as(self.indices_ub[indices_ub_index]) + else: + for i in range(0, self.indices_shape[-1]): + self.indices_tmp.set_as(self.indices_ub[indices_ub_index + + i]) + if i + 1 < self.indices_shape[-1]: + self.var_read_index.set_as( + self.var_read_index + + self.indices_tmp * functools_reduce( + lambda x, y: x * y, + self.var_shape[i + 1:self.indices_shape[-1]])) + else: + self.var_read_index.set_as(self.var_read_index + + self.indices_tmp) + + def get_updates_read_index(self, indices_ub_index): + """ + Calculate the index of the read updates + + Parameters + ---------- + indices_ub_index:int32 + the index of the currently traversed indices in UB + + Returns + ------- + None + """ + read_index = indices_ub_index * self.update_data_num + self.updates_read_index.set_as(read_index) + + def updates_the_var(self, indices_in_index, indice_num): + """ + Update the update fragment corresponding to the index + + Parameters + ---------- + indices_in_index: int32 + Indices index on GM + indice_num: int32 + the number of indexes in the indices on UB + Returns + ------- + None + """ + indices_burst_len = math.ceil(indice_num / self.indices_data_each_block) + if self.indices_num == 1: + self.tik_instance.data_move(self.indices_ub, self.indices_gm, 0, 1, + indices_burst_len, 0, 0) + else: + self.tik_instance.data_move(self.indices_ub, + self.indices_gm[indices_in_index], 0, 1, + indices_burst_len, 0, 0) + if self.nd_flag: + indice_loop_num = indice_num // self.indices_shape[-1] + else: + indice_loop_num = indice_num + + with self.tik_instance.for_range(0, + indice_loop_num) as indices_ub_index: + self.get_var_read_index(indices_ub_index) + if self.block_num > 1: + with self.tik_instance.if_scope( + self.indices_loop_index * + self.indice_step <= self.var_read_index): + with self.tik_instance.if_scope( + (self.indices_loop_index + 1) * + self.indice_step > self.var_read_index): + if self.nd_flag: + indices_in_index = indices_in_index // self.indices_shape[ + -1] + self.get_updates_read_index(indices_ub_index + + indices_in_index) + self.var_read_index.set_as(self.var_read_index * + self.update_data_num) + self.calc_updates() + else: + if self.nd_flag: + indices_in_index = indices_in_index // self.indices_shape[-1] + self.get_updates_read_index(indices_ub_index + indices_in_index) + self.var_read_index.set_as(self.var_read_index * + self.update_data_num) + self.calc_updates() + + def calc_updates(self): + """ + Calculate updates fragment + + Parameters + ---------- + None + + Returns + ------- + None + """ + updates_loop = self.update_data_num // self.updates_ub_number + if updates_loop > 0: + with self.tik_instance.for_range(0, updates_loop) as loop_index: + self.calc_updates_small(loop_index * self.updates_ub_number, + self.updates_ub_number) + + last_num = self.update_data_num % self.updates_ub_number + if last_num > 0: + self.calc_updates_small(updates_loop * self.updates_ub_number, + last_num) + + def calc_updates_small(self, read_index_offset, element_num): + """ + Transfer update to UB and calculate + + Parameters + ---------- 
+        read_index_offset: int32
+            the offset used to read the updates fragment
+        element_num: int32
+            the number of elements in the slice of updates
+
+        Returns
+        -------
+        None
+        """
+        updates_burst_len = math.ceil(element_num / self.var_data_each_block)
+        self.tik_instance.data_move(
+            self.var_ub, self.var_gm[self.var_read_index + read_index_offset],
+            0, 1, updates_burst_len, 0, 0)
+
+        self.tik_instance.data_move(
+            self.updates_ub,
+            self.updates_gm[self.updates_read_index + read_index_offset], 0, 1,
+            updates_burst_len, 0, 0)
+
+        tile_ele_num = element_num % self.var_data_each_block
+        align_offset = 0
+        if (tile_ele_num != 0 and
+                self.update_data_num > self.var_data_each_block):
+            align_ele_num = (
+                element_num // self.var_data_each_block *
+                self.var_data_each_block)
+            align_offset = (
+                read_index_offset + align_ele_num -
+                (self.var_data_each_block - tile_ele_num))
+            self.tik_instance.data_move(
+                self.var_tile_ub,
+                self.var_gm[self.var_read_index + align_offset], 0, 1, 1, 0, 0)
+
+            self.tik_instance.data_move(
+                self.updates_tile_ub,
+                self.updates_gm[self.updates_read_index + align_offset], 0, 1,
+                1, 0, 0)
+
+        compute_loop = element_num // self.max_num_one_repeat // 255
+
+        if compute_loop > 0:
+            with self.tik_instance.for_range(0, compute_loop) as index:
+                index_offset = index * self.max_num_one_repeat * 255
+                self.calc_process(self.max_num_one_repeat, index_offset,
+                                  index_offset, index_offset, 255, False)
+        last_loop = element_num % (self.max_num_one_repeat *
+                                   255) // self.max_num_one_repeat
+
+        if last_loop > 0:
+            index_offset = compute_loop * self.max_num_one_repeat * 255
+            self.calc_process(self.max_num_one_repeat, index_offset,
+                              index_offset, index_offset, last_loop, False)
+
+        compute_mask = element_num % self.max_num_one_repeat
+        if compute_mask > 0:
+            index_offset = (
+                element_num // self.max_num_one_repeat *
+                self.max_num_one_repeat)
+            if (tile_ele_num == 0 or
+                    self.update_data_num < self.var_data_each_block):
+                self.calc_process(compute_mask, index_offset, index_offset,
+                                  index_offset, 1, False)
+
+                self.tik_instance.data_move(
+                    self.out_gm[self.var_read_index + read_index_offset],
+                    self.var_ub, 0, 1, updates_burst_len, 0, 0)
+            else:
+                self.calc_process(self.var_data_each_block, 0, 0, 0, 1, True)
+                self.tik_instance.data_move(
+                    self.out_gm[self.var_read_index + align_offset],
+                    self.var_tile_ub, 0, 1, 1, 0, 0)
+                self.calc_process(compute_mask, index_offset, index_offset,
+                                  index_offset, 1, False)
+                self.tik_instance.data_move(
+                    self.out_gm[self.var_read_index + read_index_offset],
+                    self.var_ub, 0, 1, updates_burst_len - 1, 0, 0)
+        else:
+            self.tik_instance.data_move(
+                self.out_gm[self.var_read_index + read_index_offset],
+                self.var_ub, 0, 1, updates_burst_len, 0, 0)
+
+    def calc_process(self, mask, dest_addr, src_addr1, src_addr2, repeat_times,
+                     is_tile):
+        """
+        Execute the corresponding calculation instruction
+
+        Parameters
+        ----------
+        mask: int
+            the mask of the instruction
+        dest_addr: int
+            destination address offset
+        src_addr1: int
+            src1 address offset
+        src_addr2: int
+            src2 address offset
+        repeat_times: int
+            the repeat times of the instruction
+        is_tile: bool
+            whether the data being calculated is the tail of var and updates
+
+        Returns
+        -------
+        None
+        """
+        need_vconv_dtype = ("int8", "uint8")
+        if self.var_dtype in need_vconv_dtype:
+            if is_tile:
+                self.tik_instance.vconv(mask, "",
+                                        self.var_tile_vconv_ub[dest_addr],
+                                        self.var_tile_ub[src_addr1],
+                                        repeat_times, 1, 1, 8, 4)
+                self.tik_instance.vconv(mask, "",
+                                        self.updates_tile_vconv_ub[dest_addr],
+                                        self.updates_tile_ub[src_addr2],
+                                        repeat_times, 1, 1, 8, 4)
+                compute_repeat_strid = 8
+                src1_ub = self.var_tile_vconv_ub
+                src2_ub = self.updates_tile_vconv_ub
+                dst_ub = self.var_tile_vconv_ub
+                mask = self.var_data_each_block
+            else:
+                self.tik_instance.vconv(mask, "", self.var_vconv_ub[dest_addr],
+                                        self.var_ub[src_addr1], repeat_times, 1,
+                                        1, 8, 4)
+                self.tik_instance.vconv(mask, "",
+                                        self.updates_vconv_ub[dest_addr],
+                                        self.updates_ub[src_addr2],
+                                        repeat_times, 1, 1, 8, 4)
+                compute_repeat_strid = 8
+                src1_ub = self.var_vconv_ub[src_addr1]
+                src2_ub = self.updates_vconv_ub[src_addr2]
+                dst_ub = self.var_vconv_ub[dest_addr]
+
+        else:
+            if is_tile:
+                compute_repeat_strid = (
+                    self.max_num_one_repeat // self.var_data_each_block)
+                src1_ub = self.var_tile_ub
+                src2_ub = self.updates_tile_ub
+                dst_ub = self.var_tile_ub
+                mask = self.var_data_each_block
+            else:
+                compute_repeat_strid = (
+                    self.max_num_one_repeat // self.var_data_each_block)
+                src1_ub = self.var_ub[src_addr1]
+                src2_ub = self.updates_ub[src_addr2]
+                dst_ub = self.var_ub[dest_addr]
+
+        if self.compute_type == "vadd":
+            self.tik_instance.vec_add(mask, dst_ub, src1_ub, src2_ub, repeat_times,
+                                      compute_repeat_strid, compute_repeat_strid,
+                                      compute_repeat_strid)
+        elif self.compute_type == "vsub":
+            self.tik_instance.vec_sub(mask, dst_ub, src1_ub, src2_ub, repeat_times,
+                                      compute_repeat_strid, compute_repeat_strid,
+                                      compute_repeat_strid)
+        elif self.compute_type == "vdiv":
+            if platform_info.api_check_support("tik.vdiv", "float32"):
+                self.tik_instance.vdiv(mask, dst_ub, src1_ub, src2_ub,
+                                       repeat_times, 1, 1, 1,
+                                       compute_repeat_strid,
+                                       compute_repeat_strid,
+                                       compute_repeat_strid)
+            else:
+                tmp_tensor = self.tik_instance.Tensor(
+                    src2_ub.dtype, (mask * repeat_times,),
+                    scope=tik.scope_ubuf,
+                    name="tmp_tensor")
+                self.tik_instance.vrec(mask, tmp_tensor, src2_ub, repeat_times,
+                                       1, 1, compute_repeat_strid,
+                                       compute_repeat_strid)
+                self.tik_instance.vec_mul(mask, src2_ub, src2_ub, tmp_tensor,
+                                          repeat_times,
+                                          compute_repeat_strid,
+                                          compute_repeat_strid,
+                                          compute_repeat_strid)
+                self.tik_instance.vec_adds(mask, src2_ub, src2_ub, NEG_TWO,
+                                           repeat_times,
+                                           compute_repeat_strid,
+                                           compute_repeat_strid)
+                self.tik_instance.vec_mul(mask, src2_ub, src2_ub, tmp_tensor,
+                                          repeat_times,
+                                          compute_repeat_strid,
+                                          compute_repeat_strid,
+                                          compute_repeat_strid)
+                self.tik_instance.vec_muls(mask, src2_ub, src2_ub, NEG_ONE,
+                                           repeat_times,
+                                           compute_repeat_strid,
+                                           compute_repeat_strid)
+                # src1_ub * (1 / src2_ub)
+                self.tik_instance.vec_mul(mask, dst_ub, src1_ub, src2_ub,
+                                          repeat_times,
+                                          compute_repeat_strid,
+                                          compute_repeat_strid,
+                                          compute_repeat_strid)
+        elif self.compute_type == "vmax":
+            self.tik_instance.vmax(mask, dst_ub, src1_ub, src2_ub, repeat_times,
+                                   1, 1, 1, compute_repeat_strid,
+                                   compute_repeat_strid, compute_repeat_strid)
+        elif self.compute_type == "vmin":
+            self.tik_instance.vmin(mask, dst_ub, src1_ub, src2_ub, repeat_times,
+                                   1, 1, 1, compute_repeat_strid,
+                                   compute_repeat_strid, compute_repeat_strid)
+        elif self.compute_type == "vmul":
+            self.tik_instance.vec_mul(mask, dst_ub, src1_ub, src2_ub, repeat_times,
+                                      compute_repeat_strid, compute_repeat_strid,
+                                      compute_repeat_strid)
+        elif self.compute_type == "update":
+            self.tik_instance.vec_muls(mask, dst_ub, src1_ub, 0, repeat_times,
+                                       compute_repeat_strid,
+                                       compute_repeat_strid)
+            self.tik_instance.vec_add(mask, dst_ub, src1_ub, src2_ub, repeat_times,
+                                      compute_repeat_strid, compute_repeat_strid,
+                                      compute_repeat_strid)
+        else:
+            raise RuntimeError("the operator [%s] is not supported" %
+                               self.compute_type)
+        if self.var_dtype in need_vconv_dtype:
+            if is_tile:
+                self.tik_instance.vconv(mask, "", self.var_tile_ub,
+                                        self.var_tile_vconv_ub, repeat_times, 1,
+                                        1, 4, 8)
+            else:
+                self.tik_instance.vconv(mask, "", self.var_ub[src_addr1],
+                                        self.var_vconv_ub[dest_addr],
+                                        repeat_times, 1, 1, 4, 8)
+
+    def traversing_indices(self):
+        """
+        Traverse the indices
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        None
+        """
+        max_ub_idx_num = (
+            self.indices_ub_number // self.index_dims * self.index_dims)
+        indices_loop_num = self.indices_num // max_ub_idx_num
+
+        if indices_loop_num > 0:
+            with self.tik_instance.for_range(
+                    0, indices_loop_num) as indices_loop_index:
+                self.updates_the_var(indices_loop_index * max_ub_idx_num,
+                                     max_ub_idx_num)
+
+        indices_last_num = self.indices_num % max_ub_idx_num
+        if indices_last_num > 0:
+            self.updates_the_var(indices_loop_num * max_ub_idx_num,
+                                 indices_last_num)
+
+    def check_param(self, var_out):
+        """
+        Check parameters
+
+        Parameters
+        ----------
+        var_out: dict
+            data of output
+            datatype supports float32, float16, int32, int8, uint8
+
+        Returns
+        -------
+        None
+        """
+        var_out_shape = var_out.get("shape")
+        var_out_dtype = var_out.get("dtype").lower()
+        if var_out_dtype == "bool":
+            var_out_dtype = "int8"
+        para_check.check_kernel_name(self.kernel_name)
+        para_check.check_shape_rule(self.var_shape)
+        para_check.check_shape_rule(self.indices_shape)
+        para_check.check_shape_rule(self.updates_shape)
+        para_check.check_shape_rule(var_out_shape)
+
+        para_check.check_shape_size(self.var_shape)
+        para_check.check_shape_size(self.indices_shape)
+        para_check.check_shape_size(self.updates_shape)
+        para_check.check_shape_size(var_out_shape)
+
+        check_list_var = ("float16", "float32", "int32", "int8", "uint8")
+        check_list_indices = ("int32",)
+        para_check.check_dtype_rule(self.var_dtype, check_list_var)
+        para_check.check_dtype_rule(self.indices_dtype, check_list_indices)
+        para_check.check_dtype_rule(self.updates_dtype, check_list_var)
+        para_check.check_dtype_rule(var_out_dtype, check_list_var)
+
+        if var_out_shape != self.var_shape:
+            raise RuntimeError(
+                "var_out's shape must be the same as var's shape")
+
+        if (self.updates_dtype != self.var_dtype or
+                var_out_dtype != self.var_dtype):
+            raise RuntimeError(
+                "updates's datatype and var_out's datatype must be the"
+                " same as var's datatype")
+
+        if self.nd_flag:
+            if len(self.indices_shape) < 2:
+                raise RuntimeError(
+                    "the length of indices_shape must be at least 2")
+            k = self.indices_shape[-1]
+            updates_len = len(self.indices_shape) - 1 + len(self.var_shape) - k
+            if k > len(self.var_shape):
+                raise RuntimeError(
+                    "indices_shape[-1] cannot be larger than var's rank")
+            if len(self.updates_shape) != updates_len:
+                raise RuntimeError("the length of updates must be len(indices_"
+                                   "shape)-1+len(var_shape)-indices_shape[-1]")
+            updates_true_shape = self.indices_shape[:-1] + self.var_shape[k:]
+        else:
+            updates_true_shape = self.indices_shape + self.var_shape[1:]
+
+        if self.updates_shape != updates_true_shape:
+            raise RuntimeError("updates's shape is illegal")
+
+    def scatter_operator(self):
+        """
+        Scatter operation
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        tik_instance: tik instance
+        """
+        if self.block_num > 1:
+            with self.tik_instance.for_range(
+                    0, self.block_num,
+                    block_num=self.block_num) as indices_loop_index:
+                self.init_ub_tensor()
+                self.indices_loop_index.set_as(indices_loop_index)
+                self.traversing_indices()
+        else:
+            self.init_ub_tensor()
+            self.traversing_indices()
+
+        self.tik_instance.BuildCCE(
+            kernel_name=self.kernel_name,
+            inputs=(self.var_gm, self.indices_gm, self.updates_gm),
+            outputs=(self.out_gm,),
+            enable_l2=False)
+
+        return self.tik_instance
+
+
+# pylint: disable=too-many-arguments,unused-argument
+def scatter_nd_add(var,
+                   indices,
+                   updates,
+                   var_out,
+                   use_locking=False,
+                   kernel_name="scatter_nd_add"):
+    """
+    Applies sparse addition to individual values or slices in a Variable.
+
+    Parameters
+    ----------
+    var: dict
+        data of input
+        source data type, supports "int8", "uint8", "int32", "float16", "float32"
+    indices: dict
+        a tensor of indices into var, supports "int32"
+    updates: dict
+        data of updates
+        source data type should be the same as var's
+    var_out: dict
+        data of output
+    use_locking: bool
+        not used in this compute
+    kernel_name: str
+        kernel name, default value is "scatter_nd_add"
+
+    Returns
+    -------
+    None
+    """
+    scatter_nd = Scatter(var, indices, updates, var_out, True, kernel_name,
+                         "vadd")
+
+    scatter_nd.scatter_operator()
diff --git a/cplusplus/level1_single_api/4_op_dev/1_custom_op/tbe/impl/upsample_tik.py b/cplusplus/level1_single_api/4_op_dev/1_custom_op/tbe/impl/upsample_tik.py
new file mode 100644
index 000000000..5168fefbd
--- /dev/null
+++ b/cplusplus/level1_single_api/4_op_dev/1_custom_op/tbe/impl/upsample_tik.py
@@ -0,0 +1,369 @@
+"""
+Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the Apache License Version 2.0. You may not use
+this file except in compliance with the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+Apache License for more details at
+http://www.apache.org/licenses/LICENSE-2.0
+
+upsample
+"""
+import te.platform as tbe_platform
+from tbe.common.utils import para_check
+from tbe import tik
+
+# size of 5HD format
+DIM_5HD = 5
+# size of C0 for fp16/fp32
+C0 = 16
+MAX_REPEAT = 255
+RESERVE_SIZE = 16 * 1024
+BLOCK_SIZE = 32
+
+
+def cal_tilling(x_shape, y_shape, c0_size_in_ub):
+    """
+    calculate tiling
+
+    Parameters
+    ----------
+    x_shape: input x shape
+    y_shape: output y shape
+    c0_size_in_ub: number of C0 blocks that can be stored in the UB
+
+    Returns
+    -------
+    out_loop, in_loop, axis, x_shape_in_ub, y_shape_in_ub
+    """
+    y_shape_in_ub = [1] * len(y_shape)
+    x_shape_in_ub = [1] * len(x_shape)
+    x_shape_in_ub[-1] = x_shape[-1]
+    y_shape_in_ub[-1] = y_shape[-1]
+
+    in_loop = 1
+    tmp_x_size = 1
+    tmp_y_size = 1
+    for i in range(len(x_shape) - 2, 0, -1):
+        axis = i
+        stride = y_shape[i] // x_shape[i]
+        pre_x_size = tmp_x_size
+        pre_y_size = tmp_y_size
+        tmp_x_size = tmp_x_size * x_shape[i]
+        tmp_y_size = tmp_y_size * y_shape[i]
+
+        if tmp_x_size + tmp_y_size > c0_size_in_ub:
+            x_shape_in_ub[i] = c0_size_in_ub // (pre_x_size + pre_y_size)
+            if x_shape_in_ub[i] > x_shape[i]:
+                x_shape_in_ub[i] = x_shape[i]
+            y_shape_in_ub[i] = x_shape_in_ub[i]
+            if c0_size_in_ub >= pre_x_size + stride * pre_y_size:
+                x_shape_in_ub[i] = c0_size_in_ub // (pre_x_size + stride * pre_y_size)
+                y_shape_in_ub[i] = x_shape_in_ub[i] * stride
+            in_loop = (x_shape[i] + x_shape_in_ub[i] - 1) // x_shape_in_ub[i]
+            break
+        x_shape_in_ub[i] = x_shape[i]
+        y_shape_in_ub[i] = y_shape[i]
+    out_loop = 1
+
+    for i in range(1, axis):
+        out_loop = out_loop * x_shape[i]
+    return out_loop, in_loop, axis, x_shape_in_ub, y_shape_in_ub
+
+
+def check_shape_dtype_format(input_shape, input_dtype, input_format):
+    """
+    check shape, dtype and format; the common check rule for tensor shape,
+    just for 5HD
+
+    Parameters
+    ----------
+    input_shape: input dict shape
+    input_dtype: input dtype
+    input_format: input format, NC1HWC0
+
+    Returns
+    -------
+    None
+    """
+    tik_name = tbe_platform.cce_conf.get_soc_spec("SOC_VERSION")
+    if tik_name == "Hi3796CV300ES":
+        check_list = ["float16"]
+    else:
+        check_list = ["float16", "float32"]
+    if input_dtype not in check_list:
+        raise RuntimeError("upsample only supports %s while dtype is %s"
+                           % (",".join(check_list), input_dtype))
+
+    para_check.check_shape_rule(input_shape)
+    if len(input_shape) != DIM_5HD:
+        raise RuntimeError(
+            "The dim of tensor must be %d"
+            ", actual dim is %d" % (DIM_5HD, len(input_shape)))
+
+    if input_shape[DIM_5HD - 1] != C0:
+        raise RuntimeError(
+            "The value of C0 must be 16")
+
+    if input_format != "NC1HWC0":
+        raise RuntimeError(
+            "The format must be NC1HWC0, while actual format is %s" % (input_format))
+
+
+def upsample_check(input_x, stride_h, stride_w, kernel_name="upsample"):
+    """
+    parameter check
+
+    Parameters
+    ----------
+    input_x: dict, includes shape, dtype and format
+    stride_h: the upsampling stride along the H axis
+    stride_w: the upsampling stride along the W axis
+    kernel_name: str, kernel_name
+
+    Returns
+    -------
+    None
+    """
+    input_shape = input_x.get("shape")
+    input_format = input_x.get("format")
+    input_dtype = input_x.get("dtype").lower()
+    if stride_h <= 0 or stride_w <= 0:
+        raise RuntimeError(
+            "The stride must be greater than 0")
+    check_shape_dtype_format(input_shape, input_dtype, input_format)
+    para_check.check_kernel_name(kernel_name)
+
+
+def get_axis_size_shape(shape, axis):
+    """
+    get the size of shape from axis
+    """
+    size = 1
+    for i in range(axis + 1, len(shape)):
+        size = size * shape[i]
+    return size
+
+
+def cal_out_shape(shape, stride_h, stride_w):
+    """
+    calculate output shape
+    """
+    n, c1, h, w, c0 = shape
+    out_shape = (n, c1, h * stride_h, w * stride_w, c0)
+    return out_shape
+
+
+def get_data_size(dtype):
+    """
+    get data size in bytes
+    """
+    if dtype == "float16":
+        dsize = 2
+    else:
+        dsize = 4
+    return dsize
+
+
+class Upsample:
+    """TIK implementation of the upsample operator"""
+    def __init__(self, input_dict, stride_h, stride_w):
+        self.tik_instance = tik.Tik()
+        self.ub_size = tbe_platform.cce_conf.get_soc_spec(tbe_platform.cce_conf.UB_SIZE)
+        self.dtype = input_dict.get("x").get("dtype").lower()
+        self.x_shape = input_dict.get("x").get("shape")
+        self.dsize = get_data_size(self.dtype)
+        self.y_shape = cal_out_shape(self.x_shape, stride_h, stride_w)
+        self.x_gm = self.tik_instance.Tensor(self.dtype, self.x_shape, name="x_gm",
+                                             scope=tik.scope_gm)
+        self.y_gm = self.tik_instance.Tensor(self.dtype, self.y_shape, name="y_gm",
+                                             scope=tik.scope_gm)
+
+    def upsample_compute(self, stride_h, stride_w, scale):
+        """
+        calculate the output data
+
+        Parameters
+        ----------
+        stride_h: the upsampling stride along the H axis
+        stride_w: the upsampling stride along the W axis
+        scale: the multiplicative scale applied to each output element
+
+        Returns
+        -------
+        None
+        """
+        ub_size_in_byte = self.ub_size - RESERVE_SIZE
+
+        c0_size_in_ub = ub_size_in_byte // self.dsize // self.x_shape[-1] // 2
+        out_loop, in_loop, axis, x_shape_in_ub, y_shape_in_ub = cal_tilling(self.x_shape, self.y_shape, c0_size_in_ub)
+
+        x_axis_num = get_axis_size_shape(self.x_shape, axis)
+        y_axis_num = get_axis_size_shape(self.y_shape, axis)
+
+        n, c1, y_h, y_w, c0 = self.y_shape
+        c0_stride = c0 * self.dsize // BLOCK_SIZE
+        block_size = BLOCK_SIZE // self.dsize
+
+        mask = c0
+
+        if in_loop * out_loop > 1:
+            thread_num = 2
+        else:
+            thread_num = 1
+
+        with self.tik_instance.for_range(0, n, block_num=n) as blockid:
+            c1_id = self.tik_instance.Scalar("uint32", name="c1_id", init_value=0)
+            c1_size = self.tik_instance.Scalar("uint32", name="c1_size", init_value=x_shape_in_ub[1])
+            x_h_id = self.tik_instance.Scalar("uint32", name="x_h_id", init_value=0)
+            x_h_size = self.tik_instance.Scalar("uint32", name="x_h_size", init_value=x_shape_in_ub[2])
+            x_w_id = self.tik_instance.Scalar("uint32", name="x_w_id", init_value=0)
+            x_w_size = self.tik_instance.Scalar("uint32", name="x_w_size", init_value=x_shape_in_ub[3])
+            x_axis_size = self.tik_instance.Scalar("uint32", name="x_axis_size")
+            y_axis_size = self.tik_instance.Scalar("uint32", name="y_axis_size")
+            repeats = self.tik_instance.Scalar("uint32", name="repeats")
+            loop = self.tik_instance.Scalar("uint32", name="loop")
+
+            with self.tik_instance.for_range(0, out_loop * in_loop, thread_num=thread_num) as out_loopid:
+                x_in_ub = self.tik_instance.Tensor(self.dtype, x_shape_in_ub, scope=tik.scope_ubuf, name="x_in_ub")
+                y_in_ub = self.tik_instance.Tensor(self.dtype, y_shape_in_ub, scope=tik.scope_ubuf, name="y_in_ub")
+
+                if axis == 1:
+                    # store the c1 start of x
+                    c1_id.set_as(out_loopid * x_shape_in_ub[1])
+                    c1_size.set_as(x_shape_in_ub[1])
+                    with self.tik_instance.if_scope(c1_id + c1_size > self.x_shape[axis]):
+                        c1_size.set_as(self.x_shape[axis] - c1_id)
+                    x_axis_size.set_as(c1_size)
+                    y_axis_size.set_as(c1_size * y_shape_in_ub[axis] // x_shape_in_ub[axis])
+                    x_h_id.set_as(0)
+                    x_w_id.set_as(0)
+                elif axis == 2:
+                    c1_id.set_as(out_loopid // in_loop)
+                    x_h_id.set_as(out_loopid % in_loop * x_shape_in_ub[axis])
+                    x_w_id.set_as(0)
+                    c1_size.set_as(1)
+                    x_h_size.set_as(x_shape_in_ub[2])
+                    with self.tik_instance.if_scope(x_h_size + x_h_id > self.x_shape[axis]):
+                        x_h_size.set_as(self.x_shape[axis] - x_h_id)
+                    x_axis_size.set_as(x_h_size)
+                    y_axis_size.set_as(x_axis_size * y_shape_in_ub[axis] // x_shape_in_ub[axis])
+                else:
+                    c1_id.set_as(out_loopid // in_loop // self.x_shape[2] * x_shape_in_ub[1])
+                    x_h_id.set_as(out_loopid // in_loop % self.x_shape[2] * x_shape_in_ub[2])
+                    x_w_id.set_as(out_loopid % in_loop * x_shape_in_ub[3])
+                    c1_size.set_as(1)
+                    x_h_size.set_as(1)
+                    x_w_size.set_as(x_shape_in_ub[3])
+                    with self.tik_instance.if_scope(x_w_size + x_w_id > self.x_shape[3]):
+                        x_w_size.set_as(self.x_shape[3] - x_w_id)
+                    x_axis_size.set_as(x_w_size)
+                    y_axis_size.set_as(x_w_size * y_shape_in_ub[axis] // x_shape_in_ub[axis])
+                self.tik_instance.data_move(x_in_ub[0, 0, 0, 0, 0], self.x_gm[blockid, c1_id, x_h_id, x_w_id, 0],
+                                            0, 1, x_axis_size * x_axis_num * self.dsize // BLOCK_SIZE, 0, 0)
+
+                loop.set_as(x_w_size // MAX_REPEAT)
+                repeats.set_as(x_w_size % MAX_REPEAT)
+
+                with self.tik_instance.for_range(0, c1_size) as c1_index:
+
+                    if axis == 3 or (axis == 2 and y_shape_in_ub[2] == x_shape_in_ub[2]):
+                        for i in range(0, stride_h):
+                            with self.tik_instance.for_range(0, x_h_size) as x_h_index:
+                                with self.tik_instance.for_range(0, stride_w) as stride_w_index:
+                                    with self.tik_instance.for_range(0, loop) as loop_w_id:
+                                        self.tik_instance.vec_muls(
+                                            mask,
+                                            y_in_ub[0, c1_index, x_h_index,
+                                                    MAX_REPEAT * loop_w_id * stride_w + stride_w_index, 0],
+                                            x_in_ub[0, c1_index, (x_h_index + i * x_h_size) // stride_h,
+                                                    MAX_REPEAT * loop_w_id, 0],
+                                            scale, MAX_REPEAT, c0_stride * stride_w, c0_stride)
+                                    with self.tik_instance.if_scope(repeats > 0):
+                                        self.tik_instance.vec_muls(
+                                            mask,
+                                            y_in_ub[0, c1_index, x_h_index,
+                                                    MAX_REPEAT * loop * stride_w + stride_w_index, 0],
+                                            x_in_ub[0, c1_index, (x_h_index + i * x_h_size) // stride_h,
+                                                    (MAX_REPEAT * mask * loop) // block_size, 0],
+                                            scale, repeats, c0_stride * stride_w, c0_stride)
+
+                            self.tik_instance.data_move(
+                                self.y_gm[blockid, c1_id, x_h_id * stride_h + i * x_h_size, x_w_id * stride_w, 0],
+                                y_in_ub[0, 0, 0, 0, 0],
+                                0, 1, y_axis_size * y_axis_num * self.dsize // BLOCK_SIZE, 0, 0)
+
+                    else:
+                        with self.tik_instance.for_range(0, x_h_size * stride_h) as y_h_index:
+                            with self.tik_instance.for_range(0, stride_w) as stride_w_index:
+                                with self.tik_instance.for_range(0, loop) as loop_w_id:
+                                    self.tik_instance.vec_muls(
+                                        mask,
+                                        y_in_ub[0, c1_index, y_h_index,
+                                                MAX_REPEAT * loop_w_id * stride_w + stride_w_index, 0],
+                                        x_in_ub[0, c1_index, y_h_index // stride_h, (MAX_REPEAT * loop_w_id), 0],
+                                        scale, MAX_REPEAT, c0_stride * stride_w, c0_stride)
+
+                                with self.tik_instance.if_scope(repeats > 0):
+                                    self.tik_instance.vec_muls(
+                                        mask, y_in_ub[0, c1_index, y_h_index,
+                                                      MAX_REPEAT * loop * stride_w + stride_w_index, 0],
+                                        x_in_ub[0, c1_index, y_h_index // stride_h, (MAX_REPEAT * loop), 0],
+                                        scale, repeats, c0_stride * stride_w, c0_stride)
+
+                        with self.tik_instance.if_scope(c1_size == 1):
+                            self.tik_instance.data_move(
+                                self.y_gm[blockid, c1_id, x_h_id * stride_h, x_w_id * stride_w, 0], y_in_ub, 0, 1,
+                                y_axis_size * y_axis_num * self.dsize // BLOCK_SIZE, 1, 1)
+
+                        with self.tik_instance.if_scope(c1_size > 1):
+                            self.tik_instance.data_move(
+                                self.y_gm[blockid, c1_id, x_h_id * stride_h, x_w_id * stride_w, 0],
+                                y_in_ub, 0, 1, y_axis_size * y_axis_num * self.dsize // BLOCK_SIZE, 1, 1)
+
+
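+# ---------------------------------------------------------------------------
+# Illustrative reference (a minimal sketch, kept as comments and not used by
+# the kernel): the semantics implemented above, assuming nearest-neighbor
+# upsampling of NC1HWC0 data with every element multiplied by `scale`. The
+# helper name `_upsample_reference` is hypothetical and is only meant for
+# off-device comparison in tests.
+#
+#     import numpy as np
+#
+#     def _upsample_reference(x, stride_h, stride_w, scale=1.0):
+#         # x: ndarray of shape (N, C1, H, W, C0); each input pixel is
+#         # replicated stride_h x stride_w times, then scaled.
+#         out = np.repeat(x, stride_h, axis=2)    # H -> H * stride_h
+#         out = np.repeat(out, stride_w, axis=3)  # W -> W * stride_w
+#         return out * scale
+# ---------------------------------------------------------------------------
+
+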
+@para_check.check_op_params(para_check.REQUIRED_INPUT, para_check.REQUIRED_OUTPUT,
+                            para_check.OPTION_ATTR_FLOAT, para_check.OPTION_ATTR_INT,
+                            para_check.OPTION_ATTR_INT, para_check.KERNEL_NAME)
+def upsample_tik(input_x, output_y, scale=1, stride_h=2, stride_w=2, kernel_name="upsample"):
+    """
+    the interface of the upsample op
+
+    Parameters
+    ----------
+    input_x: dict
+        shape and dtype of input
+    output_y: dict
+        shape and dtype of output, should be same shape and type as input
+    scale: float
+        the multiplicative scale applied to each output element, default value is 1
+    stride_h: int
+        the upsampling stride along the H axis
+    stride_w: int
+        the upsampling stride along the W axis
+    kernel_name: str
+        kernel name, default value is "upsample"
+
+    Returns
+    -------
+    None
+    """
+    upsample_check(input_x, stride_h, stride_w, kernel_name)
+
+    input_dict = {
+        "x": input_x,
+        "y": output_y,
+        "kernel_name": kernel_name
+    }
+    upsample_instance = Upsample(input_dict, stride_h, stride_w)
+    upsample_instance.upsample_compute(stride_h, stride_w, scale)
+    upsample_instance.tik_instance.BuildCCE(kernel_name=kernel_name,
+                                            inputs=[upsample_instance.x_gm],
+                                            outputs=[upsample_instance.y_gm])
+
+    return upsample_instance.tik_instance
--
Gitee