From e23e25aa5f2a3c10a21776c9c16ffae3d0815532 Mon Sep 17 00:00:00 2001 From: hid20190403 Date: Fri, 4 Jul 2025 14:53:42 +0000 Subject: [PATCH 1/6] update /samples/operator_contrib Signed-off-by: hid20190403 --- .../SelectV2Sample/CMakeLists.txt | 25 ++ operator_contrib/SelectV2Sample/README.md | 72 +++++ .../SelectV2Sample/docs/SelectV2.md | 88 ++++++ .../AclNNInvocationNaive/CMakeLists.txt | 60 +++++ .../examples/AclNNInvocationNaive/README.md | 62 +++++ .../examples/AclNNInvocationNaive/gen_data.py | 35 +++ .../examples/AclNNInvocationNaive/main.cpp | 252 ++++++++++++++++++ .../examples/AclNNInvocationNaive/run.sh | 54 ++++ .../AclNNInvocationNaive/verify_result.py | 37 +++ .../SelectV2Sample/framework/CMakeLists.txt | 11 + .../framework/tf_plugin/CMakeLists.txt | 14 + .../tf_plugin/tensorflow_select_v2_plugin.cc | 23 ++ .../SelectV2Sample/op_host/select_v2.cpp | 207 ++++++++++++++ .../SelectV2Sample/op_host/select_v2_tiling.h | 41 +++ .../SelectV2Sample/op_kernel/select_v2.cpp | 239 +++++++++++++++++ 15 files changed, 1220 insertions(+) create mode 100644 operator_contrib/SelectV2Sample/CMakeLists.txt create mode 100644 operator_contrib/SelectV2Sample/README.md create mode 100644 operator_contrib/SelectV2Sample/docs/SelectV2.md create mode 100644 operator_contrib/SelectV2Sample/examples/AclNNInvocationNaive/CMakeLists.txt create mode 100644 operator_contrib/SelectV2Sample/examples/AclNNInvocationNaive/README.md create mode 100644 operator_contrib/SelectV2Sample/examples/AclNNInvocationNaive/gen_data.py create mode 100644 operator_contrib/SelectV2Sample/examples/AclNNInvocationNaive/main.cpp create mode 100644 operator_contrib/SelectV2Sample/examples/AclNNInvocationNaive/run.sh create mode 100644 operator_contrib/SelectV2Sample/examples/AclNNInvocationNaive/verify_result.py create mode 100644 operator_contrib/SelectV2Sample/framework/CMakeLists.txt create mode 100644 operator_contrib/SelectV2Sample/framework/tf_plugin/CMakeLists.txt create mode 100644 operator_contrib/SelectV2Sample/framework/tf_plugin/tensorflow_select_v2_plugin.cc create mode 100644 operator_contrib/SelectV2Sample/op_host/select_v2.cpp create mode 100644 operator_contrib/SelectV2Sample/op_host/select_v2_tiling.h create mode 100644 operator_contrib/SelectV2Sample/op_kernel/select_v2.cpp diff --git a/operator_contrib/SelectV2Sample/CMakeLists.txt b/operator_contrib/SelectV2Sample/CMakeLists.txt new file mode 100644 index 000000000..e4b38c4be --- /dev/null +++ b/operator_contrib/SelectV2Sample/CMakeLists.txt @@ -0,0 +1,25 @@ +add_ops_compile_options( + OP_NAME SelectV2 + OPTIONS --cce-auto-sync=on + -Wno-deprecated-declarations + -Werror +) + +target_sources(op_host_aclnn PRIVATE +op_host/select_v2.cpp +) + +target_sources(optiling PRIVATE + op_host/select_v2.cpp +) + +target_include_directories(optiling PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/op_host +) + +target_sources(opsproto PRIVATE + op_host/select_v2.cpp +) + +install(FILES op_kernel/select_v2.cpp + DESTINATION ${ASCEND_IMPL_OUT_DIR}/dynamic) \ No newline at end of file diff --git a/operator_contrib/SelectV2Sample/README.md b/operator_contrib/SelectV2Sample/README.md new file mode 100644 index 000000000..4a793c2be --- /dev/null +++ b/operator_contrib/SelectV2Sample/README.md @@ -0,0 +1,72 @@ +## `SelectV2`自定义算子样例说明 +本样例通过`Ascend C`编程语言实现了`SelectV2`算子。 + +### 算子描述 +`SelectV2`算子根据输入参数condition从输入参数x1,x2中选取元素,condition值为TRUE时,选取x1,condition为FALSE时,选取x2。 + +### 算子规格描述 + + + + + + + + + + + + + +
| 算子类型(OpType) | SelectV2 | | | |
|:----------------:|:--------:|:----:|:---------:|:------:|
| | name | Type | data type | format |
| 算子输入 | condition | tensor | bool | ND |
| | x1 | tensor | float32,float16,int32,int8 | ND |
| | x2 | tensor | float32,float16,int32,int8 | ND |
| 算子输出 | y | tensor | float32,float16,int32,int8 | ND |
| 核函数名 | select_v2 | | | |
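
上述选择语义可以用下面的参考实现来直观理解(仅为帮助理解的示意代码,函数名 SelectV2Reference 为本文假定的名称,未体现广播以及分核、数据搬运等算子实现细节):

```cpp
#include <cstddef>

// 逐元素选择语义示意:condition[i] 为 true 时取 x1[i],否则取 x2[i]
template <typename T>
void SelectV2Reference(const bool *condition, const T *x1, const T *x2, T *y, std::size_t n)
{
    for (std::size_t i = 0; i < n; ++i) {
        y[i] = condition[i] ? x1[i] : x2[i];
    }
}
```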
+ +### 支持的产品型号 +本样例支持如下产品型号: +- Atlas A2训练系列产品 +- Atlas 200I/500 A2推理产品 + +### 目录结构介绍 +``` +├── docs                        // 算子文档目录 +├── example                     // 调用示例目录 +├── framework                   // 第三方框架适配目录 +├── op_host                     // host目录 +├── op_kernel                   // kernel目录 +├── opp_kernel_aicpu        // aicpu目录 +└── tests                       // 测试用例目录 +``` + +### 环境要求 +编译运行此样例前,请参考[《CANN软件安装指南》](https://hiascend.com/document/redirect/CannCommunityInstSoftware)完成开发运行环境的部署。 + +### 算子包编译部署 + - 进入到仓库目录 + + ```bash + cd ${git_clone_path}/cann-ops + ``` + + - 执行编译 + + ```bash + bash build.sh + ``` + + - 部署算子包 + + ```bash + bash build_out/CANN-custom_ops--linux..run + ``` + +### 算子调用 + + + + + +
| 目录 | 描述 |
|:----:|:----:|
| AclNNInvocationNaive | 通过aclnn调用的方式调用SelectV2算子。 |
+ +## 更新说明 +| 时间 | 更新事项 | +|----|------| +| 2025/06/10 | 新增本readme | \ No newline at end of file diff --git a/operator_contrib/SelectV2Sample/docs/SelectV2.md b/operator_contrib/SelectV2Sample/docs/SelectV2.md new file mode 100644 index 000000000..c681c4370 --- /dev/null +++ b/operator_contrib/SelectV2Sample/docs/SelectV2.md @@ -0,0 +1,88 @@ +声明:本文使用[Creative Commons License version 4.0](https://creativecommons.org/licenses/by/4.0/legalcode)许可协议,转载、引用或修改等操作请遵循此许可协议。 + +# SelectV2 + +## 支持的产品型号 + +Atlas A2 训练系列产品/Atlas 200I/500 A2推理产品 + +产品形态详细说明请参见[昇腾产品形态说明](https://www.hiascend.com/document/redirect/CannCommunityProductForm)。 + +## 功能描述 + +- 算子功能:该SelectV2算子提供条件选择功能。SelectV2算子的主要功能是根据condition张量的布尔值(True/False),从x1或x2中选择对应位置的元素,并输出一个新的张量。并支持广播(Broadcasting)机制,允许condition、x、y的形状在满足广播规则的情况下自动扩展,以适配计算需求。它适用于动态条件选择场景,如掩码过滤、条件赋值等操作。 + + +## 实现原理 + +调用`Ascend C`的`API`接口`Compare`、`Select`以及`Cast`进行实现。 + +## 算子执行接口 + +每个算子分为两段式接口,必须先调用“aclnnSelectV2GetWorkspaceSize”接口获取计算所需workspace大小以及包含了算子计算流程的执行器,再调用“aclnnSelectV2”接口执行计算。 + +* `aclnnStatus aclnnSelectV2GetWorkspaceSize(const aclTensor* condition, const aclTensor* x1, const aclTensor* x2, const aclTensor* y, uint64_t* workspaceSize, aclOpExecutor** executor)` +* `aclnnStatus aclnnSelectV2(void* workspace, uint64_t workspaceSize, aclOpExecutor* executor, aclrtStream stream)` + +**说明**: + +- 算子执行接口对外屏蔽了算子内部实现逻辑以及不同代际NPU的差异,且开发者无需编译算子,实现了算子的精简调用。 +- 若开发者不使用算子执行接口的调用算子,也可以定义基于Ascend IR的算子描述文件,通过ATC工具编译获得算子om文件,然后加载模型文件执行算子,详细调用方法可参见《应用开发指南》的[单算子调用 > 单算子模型执行](https://hiascend.com/document/redirect/CannCommunityCppOpcall)章节。 + +### aclnnSelectV2GetWorkspaceSize + +- **参数说明:** + + + - condition(aclTensor\*,计算输入):必选参数,Device侧的aclTensor,公式中的输入condition,数据类型支持BOOL,数据格式支持ND。 + - x1(aclTensor\*,计算输入):必选参数,Device侧的aclTensor,公式中的输入x1,数据类型支持FLOAT32、FLOAT16、INT32、INT8,数据格式支持ND。 + - x2(aclTensor\*,计算输入):必选参数,Device侧的aclTensor,公式中的输入x2,数据类型支持FLOAT32、FLOAT16、INT32、INT8,数据格式支持ND。 + - y(aclTensor\*,计算输出):Device侧的aclTensor,公式中的输出y,数据类型支持FLOAT32、FLOAT16、INT32、INT8,数据格式支持ND,输出维度与x1,x2一致。 + - workspaceSize(uint64\_t\*,出参):返回用户需要在Device侧申请的workspace大小。 + - executor(aclOpExecutor\*\*,出参):返回op执行器,包含了算子计算流程。 +- **返回值:** + + 返回aclnnStatus状态码,具体参见[aclnn返回码](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/800alpha003/apiref/aolapi/context/common/aclnn%E8%BF%94%E5%9B%9E%E7%A0%81_fuse.md)。 + + ``` + 第一段接口完成入参校验,若出现以下错误码,则对应原因为: + - 返回161001(ACLNN_ERR_PARAM_NULLPTR):如果传入参数是必选输入,输出或者必选属性,且是空指针,则返回161001。 + - 返回161002(ACLNN_ERR_PARAM_INVALID):x、out的数据类型和数据格式不在支持的范围内。 + ``` + +### aclnnSelectV2 + +- **参数说明:** + + - workspace(void\*,入参):在Device侧申请的workspace内存起址。 + - workspaceSize(uint64\_t,入参):在Device侧申请的workspace大小,由第一段接口aclnnSqrtGetWorkspaceSize获取。 + - executor(aclOpExecutor\*,入参):op执行器,包含了算子计算流程。 + - stream(aclrtStream,入参):指定执行任务的AscendCL stream流。 +- **返回值:** + + 返回aclnnStatus状态码,具体参见[aclnn返回码](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/800alpha003/apiref/aolapi/context/common/aclnn%E8%BF%94%E5%9B%9E%E7%A0%81_fuse.md)。 + + +## 约束与限制 + +- condition的数据类型支持BOOL,x1,x2,y的数据类型支持FLOAT32、FLOAT16、INT32,INT8,数据格式只支持ND + +## 算子原型 + + + + + + + + + + + + + +
| 算子类型(OpType) | SelectV2 | | | |
|:----------------:|:--------:|:----:|:---------:|:------:|
| | name | type | data type | format |
| 算子输入 | condition | tensor | bool | ND |
| | x1 | tensor | float32,float16,int32,int8 | ND |
| | x2 | tensor | float32,float16,int32,int8 | ND |
| 算子输出 | y | tensor | float32,float16,int32,int8 | ND |
| 核函数名 | select_v2 | | | |
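
结合上文的两段式接口说明,下面给出一个简化的调用流程示意。其中 RunSelectV2 为本文假定的封装函数名,condition、x1、x2、y、stream 假定已按单算子调用的常规方式创建,错误处理与资源释放做了简化,完整流程请参考调用示例:

```cpp
#include "acl/acl.h"
#include "aclnn_select_v2.h"

// 两段式接口调用流程示意(错误处理与资源释放从略)
aclnnStatus RunSelectV2(const aclTensor *condition, const aclTensor *x1, const aclTensor *x2,
                        const aclTensor *y, aclrtStream stream)
{
    // 第一段:获取本次计算所需的workspace大小以及包含算子计算流程的执行器
    uint64_t workspaceSize = 0;
    aclOpExecutor *executor = nullptr;
    aclnnStatus ret = aclnnSelectV2GetWorkspaceSize(condition, x1, x2, y, &workspaceSize, &executor);
    if (ret != ACL_SUCCESS) {
        return ret;
    }

    // 按返回的workspaceSize在Device侧申请workspace内存(为0时无需申请)
    void *workspaceAddr = nullptr;
    if (workspaceSize > 0) {
        aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST);
    }

    // 第二段:下发计算任务,并同步等待任务执行结束
    ret = aclnnSelectV2(workspaceAddr, workspaceSize, executor, stream);
    aclrtSynchronizeStream(stream);

    if (workspaceAddr != nullptr) {
        aclrtFree(workspaceAddr);
    }
    return ret;
}
```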
+ +## 调用示例 + +详见[SelectV2自定义算子样例说明算子调用章节](../README.md#算子调用) \ No newline at end of file diff --git a/operator_contrib/SelectV2Sample/examples/AclNNInvocationNaive/CMakeLists.txt b/operator_contrib/SelectV2Sample/examples/AclNNInvocationNaive/CMakeLists.txt new file mode 100644 index 000000000..a5681d624 --- /dev/null +++ b/operator_contrib/SelectV2Sample/examples/AclNNInvocationNaive/CMakeLists.txt @@ -0,0 +1,60 @@ +# CMake lowest version requirement +cmake_minimum_required(VERSION 3.5.1) + +# project information +project(acl_execute_select_v2) + +# Compile options +add_compile_options(-std=c++11) + +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "./") + +set(INC_PATH $ENV{DDK_PATH}) + +if (NOT DEFINED ENV{DDK_PATH}) + set(INC_PATH "/usr/local/Ascend/ascend-toolkit/latest") + message(STATUS "set default INC_PATH: ${INC_PATH}") +else () + message(STATUS "env INC_PATH: ${INC_PATH}") +endif() + +set(CUST_PKG_PATH "${INC_PATH}/opp/vendors/customize/op_api") + +set(LIB_PATH $ENV{NPU_HOST_LIB}) + +# Dynamic libraries in the stub directory can only be used for compilation +if (NOT DEFINED ENV{NPU_HOST_LIB}) + set(LIB_PATH "/usr/local/Ascend/ascend-toolkit/latest/acllib/lib64/stub/") + set(LIB_PATH1 "/usr/local/Ascend/ascend-toolkit/latest/atc/lib64/stub/") + message(STATUS "set default LIB_PATH: ${LIB_PATH}") +else () + message(STATUS "env LIB_PATH: ${LIB_PATH}") +endif() + +# Header path +include_directories( + ${INC_PATH}/runtime/include + ${INC_PATH}/atc/include + ${CUST_PKG_PATH}/include +) + +# add host lib path +link_directories( + ${LIB_PATH} + ${LIB_PATH1} + ${CUST_PKG_PATH}/lib +) + +add_executable(execute_select_v2_op + main.cpp +) + +target_link_libraries(execute_select_v2_op + ascendcl + cust_opapi + acl_op_compiler + nnopbase + stdc++ +) + +install(TARGETS execute_select_v2_op DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) diff --git a/operator_contrib/SelectV2Sample/examples/AclNNInvocationNaive/README.md b/operator_contrib/SelectV2Sample/examples/AclNNInvocationNaive/README.md new file mode 100644 index 000000000..16ebc0114 --- /dev/null +++ b/operator_contrib/SelectV2Sample/examples/AclNNInvocationNaive/README.md @@ -0,0 +1,62 @@ +## 概述 + +通过aclnn调用的方式调用SelectV2算子。 + +## 目录结构介绍 +``` +├── AclNNInvocationNaive +│ ├── CMakeLists.txt // 编译规则文件 +│ ├── gen_data.py // 算子期望数据生成脚本 +│ ├── main.cpp // 单算子调用应用的入口 +│ ├── run.sh // 编译运行算子的脚本 +│ └── verify_result.py // 计算结果精度比对脚本 +``` +## 代码实现介绍 +完成自定义算子的开发部署后,可以通过单算子调用的方式来验证单算子的功能。main.cpp代码为单算子API执行方式。单算子API执行是基于C语言的API执行算子,无需提供单算子描述文件进行离线模型的转换,直接调用单算子API接口。 + +自定义算子编译部署后,会自动生成单算子API,可以直接在应用程序中调用。算子API的形式一般定义为“两段式接口”,形如: + ```cpp + aclnnStatus aclnnSelectV2GetWorkspaceSize(const aclTensor *x, const aclTensor *out, uint64_t workspaceSize, aclOpExecutor **executor); + aclnnStatus aclnnSelectV2(void *workspace, int64_t workspaceSize, aclOpExecutor **executor, aclrtStream stream); + ``` +其中aclnnSelectV2GetWorkspaceSize为第一段接口,主要用于计算本次API调用计算过程中需要多少的workspace内存。获取到本次API计算需要的workspace大小之后,按照workspaceSize大小申请Device侧内存,然后调用第二段接口aclnnSelectV2执行计算。具体参考[AscendCL单算子调用](https://hiascend.com/document/redirect/CannCommunityAscendCInVorkSingleOp)>单算子API执行 章节。 + +## 运行样例算子 + **请确保已根据算子包编译部署步骤完成本算子的编译部署动作。** + + - 进入样例代码所在路径 + + ```bash + cd ${git_clone_path}/cann-ops/src/math/selectv2/examples/AclNNInvocationNaive + ``` + + - 环境变量配置 + + 需要设置环境变量,以arm为例 + + ```bash + export DDK_PATH=/usr/local/Ascend/ascend-toolkit/latest + export NPU_HOST_LIB=/usr/local/Ascend/ascend-toolkit/latest/aarch64-linux/devlib + ``` + - 样例执行 + + 样例执行过程中会自动生成测试数据,然后编译与运行aclnn样例,最后打印运行结果。 + + 
```bash + mkdir -p build + cd build + cmake .. && make + ./execute_select_v2_op + ``` + + 用户亦可参考run.sh脚本进行编译与运行。 + + ```bash + bash run.sh + ``` + +## 更新说明 + +| 时间 | 更新事项 | +| ---------- | ------------ | +| 2025/06/29 | 新增本readme | \ No newline at end of file diff --git a/operator_contrib/SelectV2Sample/examples/AclNNInvocationNaive/gen_data.py b/operator_contrib/SelectV2Sample/examples/AclNNInvocationNaive/gen_data.py new file mode 100644 index 000000000..41e715aed --- /dev/null +++ b/operator_contrib/SelectV2Sample/examples/AclNNInvocationNaive/gen_data.py @@ -0,0 +1,35 @@ +#!/usr/bin/python3 +# -*- coding:utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +import os +import numpy as np + +import tensorflow as tf + + +def gen_golden_data_simple(): + dtype = np.float16 + input_x1 = np.random.rand(32, 64) > 0.5 + input_x1 = input_x1.astype(np.bool_) + input_x2 = np.random.uniform(1, 100, [32, 64]).astype(np.float16) + input_x3 = np.random.uniform(1, 100, [32, 64]).astype(np.float16) + golden = tf.raw_ops.SelectV2(condition=input_x1, t=input_x2, e=input_x3).numpy() + + os.system("mkdir -p input") + os.system("mkdir -p output") + input_x1.tofile("./input/input_x1.bin") + input_x2.tofile("./input/input_x2.bin") + input_x3.tofile("./input/input_x3.bin") + golden.tofile("./output/golden.bin") + +if __name__ == "__main__": + gen_golden_data_simple() + diff --git a/operator_contrib/SelectV2Sample/examples/AclNNInvocationNaive/main.cpp b/operator_contrib/SelectV2Sample/examples/AclNNInvocationNaive/main.cpp new file mode 100644 index 000000000..cf4cac766 --- /dev/null +++ b/operator_contrib/SelectV2Sample/examples/AclNNInvocationNaive/main.cpp @@ -0,0 +1,252 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/** + * @file main.cpp + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "acl/acl.h" +#include "aclnn_select_v2.h" + +#define SUCCESS 0 +#define FAILED 1 + +#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) +#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) +#define ERROR_LOG(fmt, args...) 
fprintf(stderr, "[ERROR] " fmt "\n", ##args) + +#define CHECK_RET(cond, return_expr) \ + do { \ + if (!(cond)) { \ + return_expr; \ + } \ + } while (0) + +#define LOG_PRINT(message, ...) \ + do { \ + printf(message, ##__VA_ARGS__); \ + } while (0) + +bool ReadFile(const std::string &filePath, size_t fileSize, void *buffer, size_t bufferSize) +{ + struct stat sBuf; + int fileStatus = stat(filePath.data(), &sBuf); + if (fileStatus == -1) { + ERROR_LOG("failed to get file %s", filePath.c_str()); + return false; + } + if (S_ISREG(sBuf.st_mode) == 0) { + ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); + return false; + } + + std::ifstream file; + file.open(filePath, std::ios::binary); + if (!file.is_open()) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + std::filebuf *buf = file.rdbuf(); + size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); + if (size == 0) { + ERROR_LOG("file size is 0"); + file.close(); + return false; + } + if (size > bufferSize) { + ERROR_LOG("file size is larger than buffer size"); + file.close(); + return false; + } + buf->pubseekpos(0, std::ios::in); + buf->sgetn(static_cast(buffer), size); + fileSize = size; + file.close(); + return true; +} + +bool WriteFile(const std::string &filePath, const void *buffer, size_t size) +{ + if (buffer == nullptr) { + ERROR_LOG("Write file failed. buffer is nullptr"); + return false; + } + + int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); + if (fd < 0) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + auto writeSize = write(fd, buffer, size); + (void) close(fd); + if (writeSize != size) { + ERROR_LOG("Write file Failed."); + return false; + } + + return true; +} + +int64_t GetShapeSize(const std::vector &shape) +{ + int64_t shapeSize = 1; + for (auto i : shape) { + shapeSize *= i; + } + return shapeSize; +} + +int Init(int32_t deviceId, aclrtStream *stream) +{ + // 固定写法,acl初始化 + auto ret = aclInit(nullptr); + CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclInit failed. ERROR: %d\n", ret); return FAILED); + ret = aclrtSetDevice(deviceId); + CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtSetDevice failed. ERROR: %d\n", ret); return FAILED); + ret = aclrtCreateStream(stream); + CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtCreateStream failed. ERROR: %d\n", ret); return FAILED); + + return SUCCESS; +} + +template +int CreateAclTensor(const std::vector &hostData, const std::vector &shape, void **deviceAddr, + aclDataType dataType, aclTensor **tensor) +{ + auto size = GetShapeSize(shape) * sizeof(T); + // 调用aclrtMalloc申请device侧内存 + auto ret = aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST); + CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtMalloc failed. ERROR: %d\n", ret); return FAILED); + + // 调用aclrtMemcpy将host侧数据拷贝到device侧内存上 + ret = aclrtMemcpy(*deviceAddr, size, hostData.data(), size, ACL_MEMCPY_HOST_TO_DEVICE); + CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtMemcpy failed. ERROR: %d\n", ret); return FAILED); + + // 调用aclCreateTensor接口创建aclTensor + *tensor = aclCreateTensor(shape.data(), shape.size(), dataType, nullptr, 0, aclFormat::ACL_FORMAT_ND, shape.data(), + shape.size(), *deviceAddr); + return SUCCESS; +} + +int main(int argc, char **argv) +{ + // 1. (固定写法)device/stream初始化, 参考acl对外接口列表 + // 根据自己的实际device填写deviceId + int32_t deviceId = 0; + aclrtStream stream; + auto ret = Init(deviceId, &stream); + CHECK_RET(ret == 0, LOG_PRINT("Init acl failed. 
ERROR: %d\n", ret); return FAILED); + + // 2. 构造输入与输出,需要根据API的接口自定义构造 + std::vector inputX1Shape = {32, 64}; + std::vector inputX2Shape = {32, 64}; + std::vector inputX3Shape = {32, 64}; + std::vector outputYShape = {32, 64}; + void *inputX1DeviceAddr = nullptr; + void *inputX2DeviceAddr = nullptr; + void *inputX3DeviceAddr = nullptr; + void *outputYDeviceAddr = nullptr; + aclTensor *inputX1 = nullptr; + aclTensor *inputX2 = nullptr; + aclTensor *inputX3 = nullptr; + aclTensor *outputY = nullptr; + size_t inputX1ShapeSize_1=inputX1Shape[0] * inputX1Shape[1]; + size_t inputX2ShapeSize_1=inputX2Shape[0] * inputX2Shape[1]; + size_t inputX3ShapeSize_1=inputX3Shape[0] * inputX3Shape[1]; + size_t outputYShapeSize_1=outputYShape[0] * outputYShape[1]; + size_t dataType1=sizeof(aclFloat16); + size_t dataType2=1; + std::vector inputX1HostData(inputX1Shape[0] * inputX1Shape[1]); + std::vector inputX2HostData(inputX2Shape[0] * inputX2Shape[1]); + std::vector inputX3HostData(inputX3Shape[0] * inputX3Shape[1]); + std::vector outputYHostData(outputYShape[0] * outputYShape[1]); + + size_t fileSize = 0; + void** input1=(void**)(&inputX1HostData); + void** input2=(void**)(&inputX2HostData); + void** input3=(void**)(&inputX3HostData); + //读取数据 + ReadFile("../input/input_x1.bin", fileSize, *input1, inputX1ShapeSize_1*dataType2); + ReadFile("../input/input_x2.bin", fileSize, *input2, inputX2ShapeSize_1*dataType1); + ReadFile("../input/input_x3.bin", fileSize, *input3, inputX3ShapeSize_1*dataType1); + + INFO_LOG("Set input success"); + // 创建inputX aclTensor + ret = CreateAclTensor(inputX1HostData, inputX1Shape, &inputX1DeviceAddr, aclDataType::ACL_BOOL, &inputX1); + CHECK_RET(ret == ACL_SUCCESS, return FAILED); + ret = CreateAclTensor(inputX2HostData, inputX2Shape, &inputX2DeviceAddr, aclDataType::ACL_FLOAT16, &inputX2); + CHECK_RET(ret == ACL_SUCCESS, return FAILED); + ret = CreateAclTensor(inputX3HostData, inputX3Shape, &inputX3DeviceAddr, aclDataType::ACL_FLOAT16, &inputX3); + CHECK_RET(ret == ACL_SUCCESS, return FAILED); + // 创建outputY aclTensor + ret = CreateAclTensor(outputYHostData, outputYShape, &outputYDeviceAddr, aclDataType::ACL_FLOAT16, &outputY); + CHECK_RET(ret == ACL_SUCCESS, return FAILED); + + // 3. 调用CANN自定义算子库API + uint64_t workspaceSize = 0; + aclOpExecutor *executor; + // 计算workspace大小并申请内存 + ret = aclnnSelectV2GetWorkspaceSize(inputX1, inputX2,inputX3, outputY, &workspaceSize, &executor); + CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnSelectV2GetWorkspaceSize failed. ERROR: %d\n", ret); return FAILED); + void *workspaceAddr = nullptr; + if (workspaceSize > 0) { + ret = aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST); + CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("allocate workspace failed. ERROR: %d\n", ret); return FAILED;); + } + // 执行算子 + ret = aclnnSelectV2(workspaceAddr, workspaceSize, executor, stream); + CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnSelectV2 failed. ERROR: %d\n", ret); return FAILED); + + // 4. (固定写法)同步等待任务执行结束 + ret = aclrtSynchronizeStream(stream); + CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtSynchronizeStream failed. ERROR: %d\n", ret); return FAILED); + + // 5. 获取输出的值,将device侧内存上的结果拷贝至host侧,需要根据具体API的接口定义修改 + auto size = GetShapeSize(outputYShape); + std::vector resultData(size, 0); + ret = aclrtMemcpy(resultData.data(), resultData.size() * sizeof(resultData[0]), outputYDeviceAddr, + size * sizeof(aclFloat16), ACL_MEMCPY_DEVICE_TO_HOST); + CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("copy result from device to host failed. 
ERROR: %d\n", ret); return FAILED); + void** output1=(void**)(&resultData); + //写出数据 + WriteFile("../output/output_y.bin", *output1, outputYShapeSize_1*dataType1); + INFO_LOG("Write output success"); + + // 6. 释放aclTensor,需要根据具体API的接口定义修改 + aclDestroyTensor(inputX1); + aclDestroyTensor(inputX2); + aclDestroyTensor(inputX3); + aclDestroyTensor(outputY); + + // 7. 释放device资源,需要根据具体API的接口定义修改 + aclrtFree(inputX1DeviceAddr); + aclrtFree(inputX2DeviceAddr); + aclrtFree(inputX3DeviceAddr); + aclrtFree(outputYDeviceAddr); + if (workspaceSize > 0) { + aclrtFree(workspaceAddr); + } + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return SUCCESS; +} \ No newline at end of file diff --git a/operator_contrib/SelectV2Sample/examples/AclNNInvocationNaive/run.sh b/operator_contrib/SelectV2Sample/examples/AclNNInvocationNaive/run.sh new file mode 100644 index 000000000..f1eab8d02 --- /dev/null +++ b/operator_contrib/SelectV2Sample/examples/AclNNInvocationNaive/run.sh @@ -0,0 +1,54 @@ +#!/bin/bash +# Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +if [ -n "$ASCEND_INSTALL_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH +elif [ -n "$ASCEND_HOME_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH +else + if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then + _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + else + _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + fi +fi +source $_ASCEND_INSTALL_PATH/bin/setenv.bash +export DDK_PATH=$_ASCEND_INSTALL_PATH +export NPU_HOST_LIB=$_ASCEND_INSTALL_PATH/lib64 + +rm -rf $HOME/ascend/log/* +rm ./input/*.bin +rm ./output/*.bin + +python3 gen_data.py + +if [ $? -ne 0 ]; then + echo "ERROR: generate input data failed!" + return 1 +fi +echo "INFO: generate input data success!" +set -e +rm -rf build +mkdir -p build +cmake -B build +cmake --build build -j +( + cd build + ./execute_select_v2_op +) +ret=`python3 verify_result.py output/output_y.bin output/golden.bin` +echo $ret +if [ "x$ret" == "xtest pass" ]; then + echo "" + echo "#####################################" + echo "INFO: you have passed the Precision!" + echo "#####################################" + echo "" +fi \ No newline at end of file diff --git a/operator_contrib/SelectV2Sample/examples/AclNNInvocationNaive/verify_result.py b/operator_contrib/SelectV2Sample/examples/AclNNInvocationNaive/verify_result.py new file mode 100644 index 000000000..837374d49 --- /dev/null +++ b/operator_contrib/SelectV2Sample/examples/AclNNInvocationNaive/verify_result.py @@ -0,0 +1,37 @@ +#!/usr/bin/python3 +# -*- coding:utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. 
You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +import os +import sys +import numpy as np + +LOSS = 1e-3 # 容忍偏差,一般fp16要求绝对误差和相对误差均不超过千分之一 +MINIMUM = 10e-10 + + +def verify_result(real_result, golden): + dtype = np.float16 + real_result = np.fromfile(real_result, dtype=dtype) # 从bin文件读取实际运算结果 + golden = np.fromfile(golden, dtype=dtype) # 从bin文件读取预期运算结果 + result = np.abs(real_result - golden) # 计算运算结果和预期结果偏差 + deno = np.maximum(np.abs(real_result), np.abs(golden)) # 获取最大值并组成新数组 + result_atol = np.less_equal(result, LOSS) # 计算绝对误差 + result_rtol = np.less_equal(result / np.add(deno, MINIMUM), LOSS) # 计算相对误差 + if not result_rtol.all() and not result_atol.all(): + if np.sum(result_rtol == False) > real_result.size * LOSS and \ + np.sum(result_atol == False) > real_result.size * LOSS: # 误差超出预期时返回打印错误,返回对比失败 + print("[ERROR] result error") + return False + print("test pass") + return True + +if __name__ == '__main__': + verify_result(sys.argv[1], sys.argv[2]) diff --git a/operator_contrib/SelectV2Sample/framework/CMakeLists.txt b/operator_contrib/SelectV2Sample/framework/CMakeLists.txt new file mode 100644 index 000000000..b6be9b492 --- /dev/null +++ b/operator_contrib/SelectV2Sample/framework/CMakeLists.txt @@ -0,0 +1,11 @@ +if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/mindspore") + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/caffe_plugin") + add_subdirectory(caffe_plugin) + endif() + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/tf_plugin") + add_subdirectory(tf_plugin) + endif() + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/onnx_plugin") + add_subdirectory(onnx_plugin) + endif() +endif() diff --git a/operator_contrib/SelectV2Sample/framework/tf_plugin/CMakeLists.txt b/operator_contrib/SelectV2Sample/framework/tf_plugin/CMakeLists.txt new file mode 100644 index 000000000..a6aba5c20 --- /dev/null +++ b/operator_contrib/SelectV2Sample/framework/tf_plugin/CMakeLists.txt @@ -0,0 +1,14 @@ + +aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} plugin_srcs) +add_library(cust_tf_parsers SHARED ${plugin_srcs}) +target_compile_definitions(cust_tf_parsers PRIVATE google=ascend_private) +if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_tf_parsers PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) +endif() +target_link_libraries(cust_tf_parsers PRIVATE intf_pub graph) +install(TARGETS cust_tf_parsers + LIBRARY DESTINATION packages/vendors/${vendor_name}/framework/tensorflow +) diff --git a/operator_contrib/SelectV2Sample/framework/tf_plugin/tensorflow_select_v2_plugin.cc b/operator_contrib/SelectV2Sample/framework/tf_plugin/tensorflow_select_v2_plugin.cc new file mode 100644 index 000000000..61a92a953 --- /dev/null +++ b/operator_contrib/SelectV2Sample/framework/tf_plugin/tensorflow_select_v2_plugin.cc @@ -0,0 +1,23 @@ +/* Copyright (C) 2020-2021. Huawei Technologies Co., Ltd. All +rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the Apache License Version 2.0. + * You may not use this file except in compliance with the License. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Apache License for more details at + * http://www.apache.org/licenses/LICENSE-2.0 + */ + +#include "register/register.h" + +namespace domi { +// register op info to GE +REGISTER_CUSTOM_OP("SelectV2") + .FrameworkType(TENSORFLOW) // type: CAFFE, TENSORFLOW + .OriginOpType("SelectV2") // name in tf module + .ParseParamsByOperatorFn(AutoMappingByOpFn); +} // namespace domi diff --git a/operator_contrib/SelectV2Sample/op_host/select_v2.cpp b/operator_contrib/SelectV2Sample/op_host/select_v2.cpp new file mode 100644 index 000000000..2d4eeb7e4 --- /dev/null +++ b/operator_contrib/SelectV2Sample/op_host/select_v2.cpp @@ -0,0 +1,207 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#include +#include "select_v2_tiling.h" +#include "register/op_def_registry.h" +#include "tiling/platform/platform_ascendc.h" + +namespace optiling { +const uint32_t BLOCK_SIZE = 32; +static ge::graphStatus TilingFunc(gert::TilingContext* context) +{ + SelectV2TilingData tiling; + auto shape_x = context->GetInputTensor(0)->GetOriginShape(); + int32_t M = 1, N = 1, Z = 1; + if(shape_x.GetDimNum() > 2) + { + for (int i = 0; i < shape_x.GetDimNum() - 2; i++) + Z *= shape_x.GetDim(i); + M = shape_x.GetDim(shape_x.GetDimNum() - 2); + N = shape_x.GetDim(shape_x.GetDimNum() - 1); + std::cout<<"M:"<GetPlatformInfo()); + auto socVersion = ascendcPlatform.GetSocVersion(); + uint64_t ub_size; + ascendcPlatform.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ub_size); + auto aivNum = ascendcPlatform.GetCoreNum(); + uint32_t totalLength = context->GetInputTensor(0)->GetShapeSize(); + //获取输入shape信息 + uint32_t inputNum = context->GetInputShape(1)->GetStorageShape().GetShapeSize(); //输入数量 + uint32_t inputBytes = GetSizeByDataType(context->GetInputDesc(1)->GetDataType()); //输入类型 + uint32_t inputLength = inputBytes * inputNum; //输入长度 + auto dt = context->GetInputTensor(0)->GetDataType(); + if(dt == ge::DT_INT8){ + sizeofdatatype = 1; + NUM = 19; + }else if(dt == ge::DT_FLOAT16){ + sizeofdatatype = 2; + NUM = 10; + } + else if (dt == ge::DT_INT32) { + sizeofdatatype = 4; + NUM = 11; + } + else{ //DT_FLOAT + sizeofdatatype = 4; + NUM = 8; + } + uint32_t ALIGN_NUM = BLOCK_SIZE / sizeofdatatype; + uint32_t tiling_size = ((ub_size) / BLOCK_SIZE / 2) / NUM; + tiling_size = tiling_size <= 8 ? tiling_size : tiling_size / 8 * 8; + uint32_t block_size = tiling_size * ALIGN_NUM; + aivNum = (aivNum < totalLength / block_size) ? aivNum : (totalLength / block_size); + aivNum = aivNum >= 1 ? 
aivNum : 1; + uint32_t core_size; + if(aivNum != 0 && ALIGN_NUM != 0){ + core_size = (totalLength / aivNum) / (ALIGN_NUM * 8) * (ALIGN_NUM * 8); + } + uint32_t core_remain = totalLength - aivNum * core_size; + tiling.set_totalLength(totalLength); + tiling.set_ALIGN_NUM(ALIGN_NUM); + tiling.set_tiling_size(tiling_size); + tiling.set_block_size(block_size); + tiling.set_aivNum(aivNum); + tiling.set_core_size(core_size); + tiling.set_core_remain(core_remain); + uint32_t x2Size = context->GetInputShape(2)->GetStorageShape().GetShapeSize(); + uint32_t cdtSize = context->GetInputShape(0)->GetStorageShape().GetShapeSize(); + uint32_t x1Size = context->GetInputShape(1)->GetStorageShape().GetShapeSize(); + uint32_t ySize = context->GetOutputShape(0)->GetStorageShape().GetShapeSize(); + if(ySize != cdtSize || ySize != x1Size || ySize != x2Size) + { + context->SetTilingKey(2); + int32_t y_ndarray[20], cdt_ndarray[20], x1_ndarray[20],x2_ndarray[20]; + int32_t y_dimensional, cdt_dimensional, x1_dimensional, x2_dimensional; + auto shape_y = context->GetOutputShape(0)->GetOriginShape(); + auto shape_cdt = context->GetInputTensor(0)->GetOriginShape(); + auto shape_x1 = context->GetInputTensor(1)->GetOriginShape(); + auto shape_x2 = context->GetInputTensor(2)->GetOriginShape(); + y_dimensional = shape_y.GetDimNum(); + cdt_dimensional = shape_cdt.GetDimNum(); + x1_dimensional = shape_x1.GetDimNum(); + x2_dimensional = shape_x2.GetDimNum(); + for(int i = 0; i < y_dimensional; i++) + { + y_ndarray[y_dimensional-i-1] = shape_y.GetDim(i); + if(iSetTilingKey(1); + } + context->SetBlockDim(1); + tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); + context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); + size_t *currentWorkspace = context->GetWorkspaceSizes(1); + currentWorkspace[0] = 0; + return ge::GRAPH_SUCCESS; +} +} + +namespace ge { +static ge::graphStatus InferShape(gert::InferShapeContext* context) +{ + const gert::Shape* x1_shape = context->GetInputShape(0); + gert::Shape* y_shape = context->GetOutputShape(0); + *y_shape = *x1_shape; + return GRAPH_SUCCESS; +} +} + +namespace ops { +class SelectV2 : public OpDef { +public: + explicit SelectV2(const char* name) : OpDef(name) + { + this->Input("condition") + .ParamType(REQUIRED) + .DataType({ge::DT_BOOL, ge::DT_BOOL, ge::DT_BOOL, ge::DT_BOOL}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + this->Input("x1") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16, ge::DT_INT32, ge::DT_INT8}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + this->Input("x2") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16, ge::DT_INT32, ge::DT_INT8}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + this->Output("y") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16, ge::DT_INT32, ge::DT_INT8}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + + this->SetInferShape(ge::InferShape); + this->AICore() + .SetTiling(optiling::TilingFunc); + this->AICore().AddConfig("ascend310b"); + } +}; +OP_ADD(SelectV2); +} diff --git 
a/operator_contrib/SelectV2Sample/op_host/select_v2_tiling.h b/operator_contrib/SelectV2Sample/op_host/select_v2_tiling.h new file mode 100644 index 000000000..bfe190188 --- /dev/null +++ b/operator_contrib/SelectV2Sample/op_host/select_v2_tiling.h @@ -0,0 +1,41 @@ +#ifndef SELECTV2_TILING_H +#define SELECTV2_TILING_H +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#include "register/tilingdata_base.h" +namespace optiling { +BEGIN_TILING_DATA_DEF(SelectV2TilingData) + TILING_DATA_FIELD_DEF(int32_t, M); + TILING_DATA_FIELD_DEF(int32_t, N); + TILING_DATA_FIELD_DEF(int32_t, Z); + TILING_DATA_FIELD_DEF(uint32_t, totalLength); + TILING_DATA_FIELD_DEF(uint32_t, tileNum); + TILING_DATA_FIELD_DEF(uint32_t, ALIGN_NUM); + TILING_DATA_FIELD_DEF(uint32_t, tiling_size); + TILING_DATA_FIELD_DEF(uint32_t, block_size); + TILING_DATA_FIELD_DEF(uint32_t, aivNum); + TILING_DATA_FIELD_DEF(uint32_t, core_size); + TILING_DATA_FIELD_DEF(uint32_t, core_remain); + TILING_DATA_FIELD_DEF(int32_t, y_dimensional); + TILING_DATA_FIELD_DEF_ARR(int32_t, 20, y_ndarray); + TILING_DATA_FIELD_DEF_ARR(int32_t, 20, cdt_ndarray); + TILING_DATA_FIELD_DEF_ARR(int32_t, 20, x1_ndarray); + TILING_DATA_FIELD_DEF_ARR(int32_t, 20, x2_ndarray); + TILING_DATA_FIELD_DEF_ARR(int32_t, 20, y_sumndarray); + TILING_DATA_FIELD_DEF_ARR(int32_t, 20, cdt_sumndarray); + TILING_DATA_FIELD_DEF_ARR(int32_t, 20, x1_sumndarray); + TILING_DATA_FIELD_DEF_ARR(int32_t, 20, x2_sumndarray); + +END_TILING_DATA_DEF; + +REGISTER_TILING_DATA_CLASS(SelectV2, SelectV2TilingData) +} + +#endif \ No newline at end of file diff --git a/operator_contrib/SelectV2Sample/op_kernel/select_v2.cpp b/operator_contrib/SelectV2Sample/op_kernel/select_v2.cpp new file mode 100644 index 000000000..444673f1b --- /dev/null +++ b/operator_contrib/SelectV2Sample/op_kernel/select_v2.cpp @@ -0,0 +1,239 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ +#define K_MAX_SHAPE_DIM 0 +#include +#include "kernel_operator.h" + +using namespace AscendC; +constexpr int32_t BUFFER_NUM = 2; +template class KernelSelectV2 { + public: + __aicore__ inline KernelSelectV2() {} + __aicore__ inline void Init(GM_ADDR condition, GM_ADDR x1, GM_ADDR x2, GM_ADDR y,int32_t M, int32_t N, int32_t Z, + uint32_t totalLength, uint32_t ALIGN_NUM, uint32_t block_size, uint32_t core_size, uint32_t core_remain) { + ASSERT(GetBlockNum() != 0 && "block dim can not be zero!"); + this->blockLength = core_size + (GetBlockNum() == GetBlockIdx() + 1 ? core_remain : 0); + this->tileLength = block_size; + this->ALIGN_NUM = ALIGN_NUM; + if(ALIGN_NUM != 0){ + this->blockLength = this->blockLength + (this->blockLength % ALIGN_NUM ? ALIGN_NUM - this->blockLength % ALIGN_NUM : 0); + } + this->totalLength = totalLength; + auto startPointer = core_size * GetBlockIdx(); + auto bufferlength = this->blockLength; + this->M = M; //行 + this->N = N; //列 + this->Z = Z; //循环次数 + Gm_x1.SetGlobalBuffer((__gm__ TYPE_X1*)x1 ,bufferlength); + Gm_x2.SetGlobalBuffer((__gm__ TYPE_X1*)x2, bufferlength); + Gm_cdt.SetGlobalBuffer((__gm__ uint8_t *)condition, bufferlength); + Gm_y.SetGlobalBuffer((__gm__ TYPE_X1*)y, bufferlength); + if(this->tileLength != 0){ + this->tileNum = this->blockLength / this->tileLength + (this->blockLength % this->tileLength > 0); + } + pipe.InitBuffer(Q_x1, BUFFER_NUM, this->tileLength * sizeof(TYPE_X1)); + pipe.InitBuffer(Q_x2, BUFFER_NUM, this->tileLength * sizeof(TYPE_X1)); + pipe.InitBuffer(Q_cdt, BUFFER_NUM, this->tileLength * sizeof(uint8_t)); + pipe.InitBuffer(Q_y, BUFFER_NUM, this->tileLength * sizeof(TYPE_X1)); + pipe.InitBuffer(B_bits, this->tileLength * sizeof(uint8_t)); + pipe.InitBuffer(B_half, this->tileLength * sizeof(half)); + pipe.InitBuffer(B_zero, this->tileLength * sizeof(half));//取一个全为0的tensor作为处理 + this->zero = B_zero.Get(); + Duplicate(this->zero, half(0), this->tileLength); + if constexpr (std::is_same_v) { + pipe.InitBuffer(B_x1, this->tileLength * sizeof(float)); + pipe.InitBuffer(B_x2, this->tileLength * sizeof(float)); + pipe.InitBuffer(B_y, this->tileLength * sizeof(float)); + } + else if constexpr (std::is_same_v) { + pipe.InitBuffer(B_x1, this->tileLength * sizeof(half)); + pipe.InitBuffer(B_x2, this->tileLength * sizeof(half)); + pipe.InitBuffer(B_y, this->tileLength * sizeof(half)); + } + } + __aicore__ inline void Process() { + int32_t loopCount = this->tileNum; + for (int32_t i = 0; i < loopCount-1; i++) { + CopyIn(i, this->tileLength); + Compute(i, this->tileLength); + CopyOut(i, this->tileLength); + } + uint32_t length = this->blockLength - this->tileLength * (loopCount - 1); + CopyIn(loopCount - 1, (length +31 ) / 32 * 32); + Compute(loopCount - 1, (length +31 ) / 32 * 32); + CopyOut(loopCount - 1, (length +31 ) / 32 * 32); + } + private: + __aicore__ inline void CopyIn(int32_t progress, uint32_t length) { + LocalTensor x1 = Q_x1.AllocTensor(); + LocalTensor x2 = Q_x2.AllocTensor(); + LocalTensor cdt = Q_cdt.AllocTensor(); + DataCopy(x1, Gm_x1[progress * this->tileLength], length); + DataCopy(x2, Gm_x2[progress * this->tileLength], length); + DataCopy(cdt, (Gm_cdt[progress * this->tileLength]), length); + Q_x1.EnQue(x1); + Q_x2.EnQue(x2); + Q_cdt.EnQue(cdt); + } + __aicore__ inline void Compute(int32_t progress, uint32_t length) { + LocalTensor x1 = Q_x1.DeQue(); + LocalTensor x2 = Q_x2.DeQue(); + LocalTensor cdt = Q_cdt.DeQue(); + LocalTensor y = Q_y.AllocTensor(); + auto bits = B_bits.Get(); + if constexpr (std::is_same_v) { + auto 
p1 = B_half.Get(); + Cast(p1, cdt, RoundMode::CAST_NONE, length); + Compare(bits, p1,zero,CMPMODE::NE,length); + Select(y, bits, x1, x2, SELMODE::VSEL_TENSOR_TENSOR_MODE, length); + } + else if constexpr (std::is_same_v) { + auto p1 = B_half.Get(); + Cast(p1, cdt, RoundMode::CAST_NONE, length); + Compare(bits, p1,zero,CMPMODE::NE,length); + Select(y, bits, x1, x2, SELMODE::VSEL_TENSOR_TENSOR_MODE, length); + } + else if constexpr (std::is_same_v) { + auto p1 = B_half.Get(); + Cast(p1, cdt, RoundMode::CAST_NONE, length); + Compare(bits, p1,zero,CMPMODE::NE,length); + auto half_x1 = B_x1.Get(); + auto half_x2 = B_x2.Get(); + auto half_y = B_y.Get(); + Cast(half_x1, x1, RoundMode::CAST_NONE, length); + Cast(half_x2, x2, RoundMode::CAST_NONE, length); + Select(half_y, bits, half_x1, half_x2, SELMODE::VSEL_TENSOR_TENSOR_MODE, length); + Cast(y, half_y, RoundMode::CAST_NONE, length); + } + else{ + auto p1 = B_half.Get(); + Cast(p1, cdt, RoundMode::CAST_NONE, length); + Compare(bits, p1,zero,CMPMODE::NE,length); + auto float_x1 = B_x1.Get(); + auto float_x2 = B_x2.Get(); + auto float_y = B_y.Get(); + Cast(float_x1, x1, RoundMode::CAST_NONE, length); + Cast(float_x2, x2, RoundMode::CAST_NONE, length); + Select(float_y, bits, float_x1, float_x2, SELMODE::VSEL_TENSOR_TENSOR_MODE, length); + Cast(y, float_y, RoundMode::CAST_RINT, length); + } + Q_x1.FreeTensor(x1); + Q_x2.FreeTensor(x2); + Q_cdt.FreeTensor(cdt); + Q_y.EnQue(y); + } + __aicore__ inline void CopyOut(int32_t progress, uint32_t length) { + LocalTensor y = Q_y.DeQue(); + DataCopy(Gm_y[progress * this->tileLength], y, length); + Q_y.FreeTensor(y); + } + private: + TPipe pipe; + TQue Q_x1, Q_x2, Q_cdt; + TQue Q_y; + TBuf B_result, B_zero, B_bits,B_half; + TBuf B_x1, B_x2, B_y; + int32_t M; + int32_t N; + int32_t Z; + LocalTensor zero; + GlobalTensor Gm_x1, Gm_x2, Gm_y; + GlobalTensor Gm_cdt; + uint32_t blockLength; + uint32_t tileNum; + uint32_t totalLength; + uint32_t tileLength; + uint32_t ALIGN_NUM; + }; +template class KernelSelectV2_Broadcast { + using T = TYPE_X1; +public: + __aicore__ inline KernelSelectV2_Broadcast() {} + __aicore__ inline void Init(GM_ADDR condition, GM_ADDR x1, GM_ADDR x2, GM_ADDR y,int32_t M, int32_t N, int32_t Z, + int32_t y_dimensional, + int32_t* y_ndarray, int32_t* cdt_ndarray, int32_t* x1_ndarray, int32_t* x2_ndarray, + int32_t* y_sumndarray, int32_t* cdt_sumndarray, int32_t* x1_sumndarray, int32_t* x2_sumndarray) { + ASSERT(GetBlockNum() != 0 && "block dim can not be zero!"); + this->M = M; //行 + this->N = N; //列 + this->Z = Z; //循环次数 + this->y_dimensional = y_dimensional; + this->y_ndarray = y_ndarray; + this->cdt_ndarray = cdt_ndarray; + this->x1_ndarray = x1_ndarray; + this->x2_ndarray = x2_ndarray; + this->y_sumndarray = y_sumndarray; + this->cdt_sumndarray = cdt_sumndarray; + this->x1_sumndarray = x1_sumndarray; + this->x2_sumndarray = x2_sumndarray; + Gm_x1.SetGlobalBuffer((__gm__ TYPE_X1*)x1 , 1); + Gm_x2.SetGlobalBuffer((__gm__ TYPE_X1*)x2 , 1); + Gm_cdt.SetGlobalBuffer((__gm__ TYPE_CONDITION*)condition , 1); + Gm_y.SetGlobalBuffer((__gm__ TYPE_X1*)y , 1); + } + __aicore__ inline void Process() { + int dim = this->y_dimensional; + for(int j=0; jy_sumndarray[dim]; j++) + { + int cdt_start = 0, x1_start = 0, x2_start = 0; + for(int k=0; kcdt_ndarray[k] != 1){ + cdt_start += this->cdt_sumndarray[k] * (j / this->y_sumndarray[k] % this->y_ndarray[k]); + } + if(this->x1_ndarray[k] != 1){ + x1_start += this->x1_sumndarray[k] * (j / this->y_sumndarray[k] % this->y_ndarray[k]); + } + if(this->x2_ndarray[k] 
!= 1){ + x2_start += this->x2_sumndarray[k] * (j / this->y_sumndarray[k] % this->y_ndarray[k]); + } + } + TYPE_X1 pos = Gm_cdt.GetValue(cdt_start) ? Gm_x1.GetValue(x1_start) : Gm_x2.GetValue(x2_start); + Gm_y.SetValue(j,pos); + } + } +private: + TPipe pipe; + TQue inQueueSTART, inQueueEND, inQueueWEIGHT; + TQue outQueueY; + TBuf tmp1, tmp2, tmp3,boox; + GlobalTensor Gm_x1, Gm_x2, Gm_y; + GlobalTensor Gm_cdt; + int32_t M; + int32_t N; + int32_t Z; + int32_t y_dimensional; + int32_t *y_ndarray; + int32_t *cdt_ndarray; + int32_t *x1_ndarray; + int32_t *x2_ndarray; + int32_t *y_sumndarray; + int32_t *cdt_sumndarray; + int32_t *x1_sumndarray; + int32_t *x2_sumndarray; +}; + + extern "C" __global__ __aicore__ void select_v2(GM_ADDR condition, GM_ADDR x1, GM_ADDR x2, GM_ADDR y, GM_ADDR workspace, GM_ADDR tiling) { + GET_TILING_DATA(tiling_data, tiling); + if(TILING_KEY_IS(1)){ + KernelSelectV2 op; + op.Init(condition, x1, x2, y, tiling_data.M, tiling_data.N, tiling_data.Z, + tiling_data.totalLength, tiling_data.ALIGN_NUM, tiling_data.block_size, tiling_data.core_size, tiling_data.core_remain); + op.Process(); + } + else if(TILING_KEY_IS(2)){ + KernelSelectV2_Broadcast op; + op.Init(condition, x1, x2, y, tiling_data.M, tiling_data.N, tiling_data.Z, + tiling_data.y_dimensional, + tiling_data.y_ndarray, tiling_data.cdt_ndarray, tiling_data.x1_ndarray, tiling_data.x2_ndarray, + tiling_data.y_sumndarray, tiling_data.cdt_sumndarray, tiling_data.x1_sumndarray, tiling_data.x2_sumndarray); + op.Process(); + } + } \ No newline at end of file -- Gitee From e537cd2c527105e5a26c30f20f9513810b011397 Mon Sep 17 00:00:00 2001 From: hid20190403 Date: Fri, 4 Jul 2025 14:59:40 +0000 Subject: [PATCH 2/6] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20opp=5Fkernel=5Faicpu?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- operator_contrib/SelectV2Sample/opp_kernel_aicpu/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 operator_contrib/SelectV2Sample/opp_kernel_aicpu/.keep diff --git a/operator_contrib/SelectV2Sample/opp_kernel_aicpu/.keep b/operator_contrib/SelectV2Sample/opp_kernel_aicpu/.keep new file mode 100644 index 000000000..e69de29bb -- Gitee From 51919b10507d7cffaeb8dadb0068c31b5ced3986 Mon Sep 17 00:00:00 2001 From: hid20190403 Date: Fri, 4 Jul 2025 15:00:08 +0000 Subject: [PATCH 3/6] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- operator_contrib/SelectV2Sample/tests/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 operator_contrib/SelectV2Sample/tests/.keep diff --git a/operator_contrib/SelectV2Sample/tests/.keep b/operator_contrib/SelectV2Sample/tests/.keep new file mode 100644 index 000000000..e69de29bb -- Gitee From 2da40754c05cbdbf4d93b3d63b13e12b1f76c4d7 Mon Sep 17 00:00:00 2001 From: hid20190403 Date: Fri, 4 Jul 2025 15:00:38 +0000 Subject: [PATCH 4/6] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20st?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- operator_contrib/SelectV2Sample/tests/st/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 operator_contrib/SelectV2Sample/tests/st/.keep diff --git a/operator_contrib/SelectV2Sample/tests/st/.keep b/operator_contrib/SelectV2Sample/tests/st/.keep new file mode 100644 index 000000000..e69de29bb -- Gitee From 73c56bbdebffdd72190a592810f9fe959f997cf7 Mon Sep 17 00:00:00 2001 From: hid20190403 Date: Fri, 4 Jul 2025 15:00:53 +0000 
Subject: [PATCH 5/6] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20ut?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- operator_contrib/SelectV2Sample/tests/ut/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 operator_contrib/SelectV2Sample/tests/ut/.keep diff --git a/operator_contrib/SelectV2Sample/tests/ut/.keep b/operator_contrib/SelectV2Sample/tests/ut/.keep new file mode 100644 index 000000000..e69de29bb -- Gitee From 7d4e015eb08b20726c03ca68314595f74e6ad45b Mon Sep 17 00:00:00 2001 From: hid20190403 Date: Fri, 4 Jul 2025 15:03:14 +0000 Subject: [PATCH 6/6] update operator_contrib/README.md. Signed-off-by: hid20190403 --- operator_contrib/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/operator_contrib/README.md b/operator_contrib/README.md index da799262c..dbff6a089 100644 --- a/operator_contrib/README.md +++ b/operator_contrib/README.md @@ -32,6 +32,7 @@ | [ReduceSumSample](./ReduceSumSample) | 基于Ascend C的ReduceSum自定义Vector算子及调用样例 | Atlas 200/500 A2 推理产品
Atlas A2训练系列产品/Atlas 800I A2推理产品 | 8.0.RC2.alpha003 |
| [ScatterMaxSample](./ScatterMaxSample) | 基于Ascend C的ScatterMax自定义Vector算子及调用样例 | Atlas 200/500 A2 推理产品<br>Atlas A2训练系列产品/Atlas 800I A2推理产品 | 8.0.RC1.alpha003 |
| [ScatterSubSample](./ScatterSubSample) | 基于Ascend C的ScatterSub自定义Vector算子及调用样例 | Atlas 200/500 A2 推理产品<br>Atlas A2训练系列产品/Atlas 800I A2推理产品 | 8.0.RC1.alpha003 |
+| [SelectV2Sample](./SelectV2Sample) | 基于Ascend C的SelectV2自定义Vector算子及调用样例 | Atlas 200/500 A2 推理产品<br>Atlas A2训练系列产品/Atlas 800I A2推理产品 | 8.0.RC1.alpha003 |
| [SpenceSample](./SpenceSample) | 基于Ascend C的Spence自定义Vector算子及调用样例 | Atlas 200/500 A2 推理产品<br>Atlas A2训练系列产品/Atlas 800I A2推理产品 | 8.0.RC1.alpha003 |
| [ThreeNNSample](./ThreeNNSample) | 基于Ascend C的ThreeNN自定义Vector算子及调用样例 | Atlas 200/500 A2 推理产品<br>Atlas A2训练系列产品/Atlas 800I A2推理产品 | 8.0.RC2.alpha003 |
| [TrilSample](./TrilSample) | 基于Ascend C的Tril自定义Vector算子及调用样例 | Atlas 200/500 A2 推理产品<br>Atlas A2训练系列产品/Atlas 800I A2推理产品 | 8.0.RC2.alpha003 |
-- Gitee