diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/README.md b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5c1ffb4d2e14037494b93a083080227737251db9 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/README.md @@ -0,0 +1,75 @@ +## 目录结构介绍 + +``` +├── AclNNInvocation //通过单算子API调用的方式调用AddsCustom算子 +│ ├── inc // 头文件目录 +│ │ ├── common.h // 声明公共方法类,用于读取二进制文件 +│ │ ├── op_runner.h // 算子描述声明文件,包含算子输入/输出,算子类型以及输入描述与输出描述 +│ │ └── operator_desc.h // 算子运行相关信息声明文件,包含算子输入/输出个数,输入/输出大小等 +│ ├── input // 存放脚本生成的输入数据目录 +│ ├── scripts +│ │ ├── acl.json // acl配置文件 +│ │ ├── gen_data.py // 输入数据和真值数据生成脚本 +│ │ └── verify_result.py // 精度校验脚本 +│ ├── src +│ │ ├── CMakeLists.txt // 编译规则文件 +│ │ ├── common.cpp // 公共函数,读取二进制文件函数的实现文件 +│ │ ├── main.cpp // 单算子调用应用的入口 +│ │ ├── op_runner.cpp // 单算子调用主体流程实现文件 +│ │ └── operator_desc.cpp // 构造算子的输入与输出描述 +│ └── run.sh // 执行命令脚本 +``` + +## 代码实现介绍 + +完成自定义算子的开发部署后,可以通过单算子调用的方式来验证单算子的功能。src/main.cpp代码为单算子API执行方式。单算子API执行是基于C语言的API执行算子,无需提供单算子描述文件进行离线模型的转换,直接调用单算子API接口。 + +自定义算子编译部署后,会自动生成单算子API,可以直接在应用程序中调用。算子API的形式一般定义为“两段式接口”,形如: + +```cpp + // 获取算子使用的workspace空间大小 + aclnnStatus aclnnAddsCustomGetWorkspaceSize( + const aclTensor *x, + int64_t caseId, + const aclTensor *out, + uint64_t *workspaceSize, + aclOpExecutor **executor); + // 执行算子 + aclnnStatus aclnnAddsCustom( + void *workspace, + uint64_t workspaceSize, + aclOpExecutor *executor, + aclrtStream stream); +``` + +其中aclnnAddsCustomGetWorkspaceSize为第一段接口,主要用于计算本次API调用计算过程中需要多少的workspace内存。获取到本次API计算需要的workspace大小之后,按照workspaceSize大小申请Device侧内存,然后调用第二段接口aclnnAddsCustom执行计算。具体参考[单算子API调用](https://hiascend.com/document/redirect/CannCommunityAscendCInVorkSingleOp)章节。 + +## 运行样例算子 + +### 1. 编译算子工程 + +运行此样例前,请参考[编译算子工程](../README.md#operatorcompile)完成前期准备。 + +### 2. 单算子API调用样例运行 + +- 进入到样例目录 + + 以命令行方式下载样例代码,master分支为例。 + + ```bash + cd ${git_clone_path}/samples/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation + ``` +- 样例执行 + + 样例执行过程中会自动生成测试数据,然后编译与运行单算子API调用样例,最后检验运行结果。具体过程可参见run.sh脚本。 + + ```bash + bash run.sh + ``` + +## 更新说明 + + +| 时间 | 更新事项 | +| ---------- | ------------ | +| 2025/07/03 | 新增本readme | diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/common.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/common.h new file mode 100644 index 0000000000000000000000000000000000000000..fadb5c80868bd563952c2080171859de0ab3120c --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/common.h @@ -0,0 +1,45 @@ +/** + * @file common.h + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef COMMON_H +#define COMMON_H + +#include +#include +#include +#include +#include + +#include "acl/acl.h" + +#define SUCCESS 0 +#define FAILED 1 + +#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) +#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) +#define ERROR_LOG(fmt, args...) fprintf(stderr, "[ERROR] " fmt "\n", ##args) + +/** + * @brief Read data from file + * @param [in] filePath: file path + * @param [out] fileSize: file size + * @return read result + */ +bool ReadFile(const std::string &filePath, size_t fileSize, void *buffer, size_t bufferSize); + +/** + * @brief Write data to file + * @param [in] filePath: file path + * @param [in] buffer: data to write to file + * @param [in] size: size to write + * @return write result + */ +bool WriteFile(const std::string &filePath, const void *buffer, size_t size); + +#endif // COMMON_H diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/op_runner.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/op_runner.h new file mode 100644 index 0000000000000000000000000000000000000000..7b98d5730b5d14d0ee2b2bd8eb5fd1e7e06ca2ec --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/op_runner.h @@ -0,0 +1,188 @@ +/** + * @file op_runner.h + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef OP_RUNNER_H +#define OP_RUNNER_H + +#include "acl/acl.h" +#include "aclnn/acl_meta.h" +#include "common.h" +#include "operator_desc.h" + +/** + * Op Runner + */ +class OpRunner { +public: + /** + * @brief Constructor + * @param [in] opDesc: op description + */ + explicit OpRunner(OperatorDesc *opDesc); + + /** + * @brief Destructor + */ + virtual ~OpRunner(); + + /** + * @brief Init op runner + */ + bool Init(); + + /** + * @brief Get number of inputs + * @return number of inputs + */ + const size_t NumInputs(); + + /** + * @brief Get number of outputs + * @return number of outputs + */ + const size_t NumOutputs(); + + /** + * @brief Get input size by index + * @param [in] index: input index + * @return size of the input + */ + const size_t GetInputSize(size_t index) const; + const size_t GetInputNumDims(size_t index) const; + aclDataType GetInputDataType(size_t index) const; + aclFormat GetInputFormat(size_t index) const; + + /** + * @brief Get output size by index + * @param [in] index: output index + * @return size of the output + */ + size_t GetOutputSize(size_t index) const; + const size_t GetOutputNumDims(size_t index) const; + aclDataType GetOutputDataType(size_t index) const; + aclFormat GetOutputFormat(size_t index) const; + + /** + * @brief Get input element count by index + * @param i[in] ndex: input index + * @return element count of the input + */ + size_t GetInputElementCount(size_t index) const; + + /** + * @brief Get output element count by index + * @param [in] index: output index + * @return element count of the output + */ + size_t GetOutputElementCount(size_t index) const; + + /** + * @brief Get input shape by index + * @param [in] index: input index + * @return shape of the output + */ + std::vector GetInputShape(size_t index) const; + + /** + * @brief Get output shape by index + * @param [in] index: output index + * @return shape of the output + */ + std::vector GetOutputShape(size_t index) const; + + /** + * @brief Get input buffer(host memory) by index + * @tparam T: data type + * @param [in] index: input index + * @return host address of the input + */ + template T *GetInputBuffer(size_t index) + { + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return nullptr; + } + return reinterpret_cast(hostInputs_[index]); + } + + /** + * @brief Get output buffer(host memory) by index + * @tparam T: data type + * @param [in] index: output index + * @return host address of the output + */ + template const T *GetOutputBuffer(size_t index) + { + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return nullptr; + } + + return reinterpret_cast(hostOutputs_[index]); + } + + /** + * @brief Print readable input by index + * @param [in] index: input index + * @param [in] elementsPerRow: number of elements per row + */ + void PrintInput(size_t index, size_t elementsPerRow = 16); + + /** + * @brief Print readable output by index + * @param [in] index: output index + * @param [in] elementsPerRow: number of elements per row + */ + void PrintOutput(size_t index, size_t elementsPerRow = 16); + + /** + * @brief Compile static op + * @return compile result + */ + bool CompileStaticOp(); + + /** + * @brief Compile dynamic op + * @return compile result + */ + bool CompileDynamicOp(); + + /** + * @brief Run op + * @return run result + */ + bool RunOp(int64_t caseId); + + /** + * @brief Get case index + * @return case index by user input + */ + int64_t GetCaseId(); + +private: + size_t numInputs_; + size_t numOutputs_; + void *workspace_; + int64_t caseId_; + + std::vector inputBuffers_; + std::vector outputBuffers_; + + std::vector devInputs_; + std::vector devOutputs_; + + std::vector hostInputs_; + std::vector hostOutputs_; + + std::vector inputTensor_; + std::vector outputTensor_; + OperatorDesc *opDesc_; +}; + +#endif // OP_RUNNER_H diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/operator_desc.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/operator_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..cf02d7cecc6fcdc05fdc9e62f7005d06345476e4 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/operator_desc.h @@ -0,0 +1,57 @@ +/** + * @file operator_desc.h + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef OPERATOR_DESC_H +#define OPERATOR_DESC_H + +#include +#include + +#include "acl/acl.h" + +/** + * Op description + */ +struct OperatorDesc { + /** + * Constructor + */ + explicit OperatorDesc(); + + /** + * Destructor + */ + virtual ~OperatorDesc(); + + /** + * Add an input tensor description + * @param [in] dataType: data type + * @param [in] numDims: number of dims + * @param [in] dims: dims + * @param [in] format: format + * @return OperatorDesc + */ + OperatorDesc &AddInputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format); + + /** + * Add an output tensor description + * @param [in] dataType: data type + * @param [in] numDims: number of dims + * @param [in] dims: dims + * @param [in] format: format + * @return OperatorDesc + */ + OperatorDesc &AddOutputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format); + + std::string opType; + std::vector inputDesc; + std::vector outputDesc; +}; + +#endif // OPERATOR_DESC_H diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/input/.keep b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/input/.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/run.sh b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/run.sh new file mode 100755 index 0000000000000000000000000000000000000000..827ea801fd7f653273ad639164413cad6ff3e4ab --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/run.sh @@ -0,0 +1,78 @@ +#!/bin/bash +CURRENT_DIR=$( + cd $(dirname ${BASH_SOURCE:-$0}) + pwd +) + +if [ -n "$ASCEND_INSTALL_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH +elif [ -n "$ASCEND_HOME_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH +else + if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then + _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + else + _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + fi +fi +source $_ASCEND_INSTALL_PATH/bin/setenv.bash +export DDK_PATH=$_ASCEND_INSTALL_PATH +export NPU_HOST_LIB=$_ASCEND_INSTALL_PATH/$(arch)-$(uname -s | tr '[:upper:]' '[:lower:]')/devlib + +function main { + # 1. 清除遗留生成文件和日志文件 + rm -rf $HOME/ascend/log/* + rm -f ./input/*.bin + rm -f ./output && mkdir -p ./output + + # 2. 生成输入数据和真值数据 + cd $CURRENT_DIR + python3 scripts/gen_data.py + if [ $? -ne 0 ]; then + echo "ERROR: generate input data failed!" + return 1 + fi + echo "INFO: generate input data success!" + + # 3. 编译可执行文件 + cd $CURRENT_DIR + rm -rf build + mkdir -p build + cd build + cmake ../src -DCMAKE_SKIP_RPATH=TRUE + if [ $? -ne 0 ]; then + echo "ERROR: cmake failed!" + return 1 + fi + echo "INFO: cmake success!" + make + if [ $? -ne 0 ]; then + echo "ERROR: make failed!" + return 1 + fi + echo "INFO: make success!" + + # 4. 运行可执行文件 + export LD_LIBRARY_PATH=$_ASCEND_INSTALL_PATH/opp/vendors/customize/op_api/lib:$LD_LIBRARY_PATH + export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH} + cd $CURRENT_DIR/output + echo "INFO: execute op!" + msprof op --launch-count=3 --output=./prof ./execute_adds_op + if [ $? -ne 0 ]; then + echo "ERROR: acl executable run failed! please check your project!" + return 1 + fi + echo "INFO: acl executable run success!" + + # 5. 精度比对 + cd $CURRENT_DIR + python3 scripts/verify_result.py output/output_z_1.bin output/golden.bin + python3 scripts/verify_result.py output/output_z_2.bin output/golden.bin + python3 scripts/verify_result.py output/output_z_3.bin output/golden.bin + if [ $? -ne 0 ]; then + echo "ERROR: verify result failed!" + return 1 + fi +} + +main diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/acl.json b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/acl.json new file mode 100644 index 0000000000000000000000000000000000000000..9e26dfeeb6e641a33dae4961196235bdb965b21b --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/acl.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/gen_data.py b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/gen_data.py new file mode 100644 index 0000000000000000000000000000000000000000..9c4ecbe6e91257a492735bb60d3d0dfaaf9486e3 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/gen_data.py @@ -0,0 +1,23 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# =============================================================================== + +import numpy as np + + +def gen_golden_data_simple(): + input_x = np.random.uniform(1, 100, [8192, 128]).astype(np.float32) + golden = (input_x + 2.0).astype(np.float32) + + input_x.tofile("./input/input_x.bin") + golden.tofile("./output/golden.bin") + + +if __name__ == "__main__": + gen_golden_data_simple() diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/verify_result.py b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/verify_result.py new file mode 100644 index 0000000000000000000000000000000000000000..a5019f30fdf1e34188f6f777e5ef3e4aad3491c2 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/verify_result.py @@ -0,0 +1,53 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# =============================================================================== + +import sys +import numpy as np + +# for float32 +relative_tol = 1e-4 +absolute_tol = 1e-5 +error_tol = 1e-4 + + +def verify_result(output, golden): + output = np.fromfile(output, dtype=np.float32).reshape(-1) + golden = np.fromfile(golden, dtype=np.float32).reshape(-1) + different_element_results = np.isclose(output, + golden, + rtol=relative_tol, + atol=absolute_tol, + equal_nan=True) + different_element_indexes = np.where(different_element_results == False)[0] + for index in range(len(different_element_indexes)): + real_index = different_element_indexes[index] + golden_data = golden[real_index] + output_data = output[real_index] + print( + "data index: %06d, expected: %-.9f, actual: %-.9f, rdiff: %-.6f" % + (real_index, golden_data, output_data, + abs(output_data - golden_data) / golden_data)) + if index == 100: + break + error_ratio = float(different_element_indexes.size) / golden.size + print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol)) + return error_ratio <= error_tol + + +if __name__ == '__main__': + try: + res = verify_result(sys.argv[1], sys.argv[2]) + if not res: + raise ValueError("[ERROR] result error") + else: + print("test pass") + except Exception as e: + print(e) + sys.exit(1) diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/CMakeLists.txt b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..8d0ae1bd386dfe128588617ad7502dbc8854ee41 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/CMakeLists.txt @@ -0,0 +1,65 @@ +# Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + +# CMake lowest version requirement +cmake_minimum_required(VERSION 3.5.1) + +# project information +project(acl_execute_adds) + +# Compile options +add_compile_options(-std=c++11) + +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "../output") +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "../output") + +set(INC_PATH $ENV{DDK_PATH}) + +if (NOT DEFINED ENV{DDK_PATH}) + set(INC_PATH "/usr/local/Ascend/ascend-toolkit/latest") + message(STATUS "set default INC_PATH: ${INC_PATH}") +else () + message(STATUS "env INC_PATH: ${INC_PATH}") +endif() + +set(CUST_PKG_PATH "${INC_PATH}/opp/vendors/customize/op_api") + +set(LIB_PATH $ENV{NPU_HOST_LIB}) + +# Dynamic libraries in the stub directory can only be used for compilation +if (NOT DEFINED ENV{NPU_HOST_LIB}) + string(TOLOWER "${CMAKE_SYSTEM_NAME}" SYSTEM_NAME_LOWER) + set(LIB_PATH "/usr/local/Ascend/ascend-toolkit/latest/${CMAKE_SYSTEM_PROCESSOR}-${SYSTEM_NAME_LOWER}/devlib") + message(STATUS "set default LIB_PATH: ${LIB_PATH}") +else () + message(STATUS "env LIB_PATH: ${LIB_PATH}") +endif() + +# Header path +include_directories( + ../inc + ${INC_PATH}/include + ${CUST_PKG_PATH}/include +) + +# add host lib path +link_directories( + ${LIB_PATH} + ${CUST_PKG_PATH}/lib +) + +add_executable(execute_adds_op + operator_desc.cpp + op_runner.cpp + main.cpp + common.cpp +) + +target_link_libraries(execute_adds_op + ascendcl + cust_opapi + acl_op_compiler + nnopbase + stdc++ +) + +install(TARGETS execute_adds_op DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/common.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/common.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d58716122d1defa71729cd257b878e7056ae8d14 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/common.cpp @@ -0,0 +1,80 @@ +/** + * @file common.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "common.h" + +#include +#include +#include + +#include + +extern bool g_isDevice; + +bool ReadFile(const std::string &filePath, size_t fileSize, void *buffer, size_t bufferSize) +{ + struct stat sBuf; + int fileStatus = stat(filePath.data(), &sBuf); + if (fileStatus == -1) { + ERROR_LOG("failed to get file %s", filePath.c_str()); + return false; + } + if (S_ISREG(sBuf.st_mode) == 0) { + ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); + return false; + } + + std::ifstream file; + file.open(filePath, std::ios::binary); + if (!file.is_open()) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + std::filebuf *buf = file.rdbuf(); + size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); + if (size == 0) { + ERROR_LOG("file size is 0"); + file.close(); + return false; + } + if (size > bufferSize) { + ERROR_LOG("file size is larger than buffer size"); + file.close(); + return false; + } + buf->pubseekpos(0, std::ios::in); + buf->sgetn(static_cast(buffer), size); + fileSize = size; + file.close(); + return true; +} + +bool WriteFile(const std::string &filePath, const void *buffer, size_t size) +{ + if (buffer == nullptr) { + ERROR_LOG("Write file failed. buffer is nullptr"); + return false; + } + + int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); + if (fd < 0) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + size_t writeSize = write(fd, buffer, size); + (void)close(fd); + if (writeSize != size) { + ERROR_LOG("Write file Failed."); + return false; + } + + return true; +} diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/main.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b70950642e93b6ff70111d00fbef9b80af34ebb9 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/main.cpp @@ -0,0 +1,163 @@ +/** + * @file main.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include +#include +#include +#include +#include +#include "acl/acl.h" +#include "common.h" +#include "op_runner.h" + +bool g_isDevice = false; +int deviceId = 0; + +OperatorDesc CreateOpDesc() +{ + // define operator + std::vector shape{8192, 128}; + aclDataType dataType = ACL_FLOAT; + aclFormat format = ACL_FORMAT_ND; + OperatorDesc opDesc; + opDesc.AddInputTensorDesc(dataType, shape.size(), shape.data(), format); + opDesc.AddOutputTensorDesc(dataType, shape.size(), shape.data(), format); + return opDesc; +} + +bool SetInputData(OpRunner &runner) +{ + size_t fileSize = 0; + ReadFile("../input/input_x.bin", fileSize, runner.GetInputBuffer(0), runner.GetInputSize(0)); + INFO_LOG("Set input success"); + return true; +} + +bool ProcessOutputData(OpRunner &runner) +{ + int64_t caseId = runner.GetCaseId(); + WriteFile("../output/output_z_" + std::to_string(caseId) + ".bin", runner.GetOutputBuffer(0), + runner.GetOutputSize(0)); + INFO_LOG("Write output success"); + return true; +} + +void DestroyResource() +{ + bool flag = false; + if (aclrtResetDevice(deviceId) != ACL_SUCCESS) { + ERROR_LOG("Reset device %d failed", deviceId); + flag = true; + } + INFO_LOG("Reset Device success"); + if (aclFinalize() != ACL_SUCCESS) { + ERROR_LOG("Finalize acl failed"); + flag = true; + } + if (flag) { + ERROR_LOG("Destroy resource failed"); + } else { + INFO_LOG("Destroy resource success"); + } +} + +bool InitResource() +{ + std::string output = "../output"; + + // acl.json is dump or profiling config file + if (aclInit("../scripts/acl.json") != ACL_SUCCESS) { + ERROR_LOG("acl init failed"); + return false; + } + + if (aclrtSetDevice(deviceId) != ACL_SUCCESS) { + ERROR_LOG("Set device failed. deviceId is %d", deviceId); + (void)aclFinalize(); + return false; + } + INFO_LOG("Set device[%d] success", deviceId); + + // runMode is ACL_HOST which represents app is running in host + // runMode is ACL_DEVICE which represents app is running in device + aclrtRunMode runMode; + if (aclrtGetRunMode(&runMode) != ACL_SUCCESS) { + ERROR_LOG("Get run mode failed"); + DestroyResource(); + return false; + } + g_isDevice = (runMode == ACL_DEVICE); + INFO_LOG("Get RunMode[%d] success", runMode); + + return true; +} + +bool RunOp(int64_t caseId) +{ + // create op desc + OperatorDesc opDesc = CreateOpDesc(); + + // create Runner + OpRunner opRunner(&opDesc); + if (!opRunner.Init()) { + ERROR_LOG("Init OpRunner failed"); + return false; + } + + // Load inputs + if (!SetInputData(opRunner)) { + ERROR_LOG("Set input data failed"); + return false; + } + + // Run op + if (!opRunner.RunOp(caseId)) { + ERROR_LOG("Run op failed"); + return false; + } + + // process output data + if (!ProcessOutputData(opRunner)) { + ERROR_LOG("Process output data failed"); + return false; + } + + INFO_LOG("Run op success"); + return true; +} + +int main(int argc, char **argv) +{ + if (!InitResource()) { + ERROR_LOG("Init resource failed"); + return FAILED; + } + INFO_LOG("Init resource success"); + + int64_t caseId = 1; + if (!RunOp(caseId)) { + DestroyResource(); + return FAILED; + } + + caseId = 2; + if (!RunOp(caseId)) { + DestroyResource(); + return FAILED; + } + + caseId = 3; + if (!RunOp(caseId)) { + DestroyResource(); + return FAILED; + } + + DestroyResource(); + return SUCCESS; +} diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/op_runner.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/op_runner.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d7bde46d655dc0be0e5d2c042ae93038ffca072b --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/op_runner.cpp @@ -0,0 +1,462 @@ +/** + * @file op_runner.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "op_runner.h" + +#include +#include + +#include "acl/acl_op_compiler.h" +#include "aclnn_adds_custom.h" +#include "common.h" + +using namespace std; + +extern bool g_isDevice; + +OpRunner::OpRunner(OperatorDesc *opDesc) : opDesc_(opDesc) +{ + numInputs_ = opDesc->inputDesc.size(); + numOutputs_ = opDesc->outputDesc.size(); + workspace_ = nullptr; +} + +OpRunner::~OpRunner() +{ + if (workspace_ != nullptr) { + (void)aclrtFree(workspace_); + } + for (size_t i = 0; i < numInputs_; ++i) { + (void)aclDestroyTensor(inputTensor_[i]); + (void)aclDestroyDataBuffer(inputBuffers_[i]); + (void)aclrtFree(devInputs_[i]); + if (g_isDevice) { + (void)aclrtFree(hostInputs_[i]); + } else { + (void)aclrtFreeHost(hostInputs_[i]); + } + } + + for (size_t i = 0; i < numOutputs_; ++i) { + (void)aclDestroyTensor(outputTensor_[i]); + (void)aclDestroyDataBuffer(outputBuffers_[i]); + (void)aclrtFree(devOutputs_[i]); + if (g_isDevice) { + (void)aclrtFree(hostOutputs_[i]); + } else { + (void)aclrtFreeHost(hostOutputs_[i]); + } + } +} + +bool OpRunner::Init() +{ + for (size_t i = 0; i < numInputs_; ++i) { + auto size = GetInputSize(i); + void *devMem = nullptr; + if (aclrtMalloc(&devMem, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + devInputs_.emplace_back(devMem); + inputBuffers_.emplace_back(aclCreateDataBuffer(devMem, size)); + + void *hostInput = nullptr; + if (g_isDevice) { + if (aclrtMalloc(&hostInput, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + } else { + if (aclrtMallocHost(&hostInput, size) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + } + if (hostInput == nullptr) { + ERROR_LOG("Malloc memory for input[%zu] failed", i); + return false; + } + hostInputs_.emplace_back(hostInput); + + aclTensor *inputTensor = + aclCreateTensor(GetInputShape(i).data(), GetInputNumDims(i), GetInputDataType(i), nullptr, 0, + GetInputFormat(i), GetInputShape(i).data(), GetInputNumDims(i), devInputs_[i]); + if (inputTensor == nullptr) { + ERROR_LOG("Create Tensor for input[%zu] failed", i); + return false; + } + inputTensor_.emplace_back(inputTensor); + } + + for (size_t i = 0; i < numOutputs_; ++i) { + auto size = GetOutputSize(i); + void *devMem = nullptr; + if (aclrtMalloc(&devMem, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } + devOutputs_.emplace_back(devMem); + outputBuffers_.emplace_back(aclCreateDataBuffer(devMem, size)); + + void *hostOutput = nullptr; + if (g_isDevice) { + if (aclrtMalloc(&hostOutput, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } + } else { + if (aclrtMallocHost(&hostOutput, size) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } + } + if (hostOutput == nullptr) { + ERROR_LOG("Malloc host memory for output[%zu] failed", i); + return false; + } + hostOutputs_.emplace_back(hostOutput); + + aclTensor *outputTensor = + aclCreateTensor(GetOutputShape(i).data(), GetOutputNumDims(i), GetOutputDataType(i), nullptr, 0, + GetOutputFormat(i), GetOutputShape(i).data(), GetOutputNumDims(i), devOutputs_[i]); + if (outputTensor == nullptr) { + ERROR_LOG("Create Tensor for output[%zu] failed", i); + return false; + } + outputTensor_.emplace_back(outputTensor); + } + + return true; +} + +const size_t OpRunner::NumInputs() +{ + return numInputs_; +} + +const size_t OpRunner::NumOutputs() +{ + return numOutputs_; +} + +const size_t OpRunner::GetInputSize(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + + return aclGetTensorDescSize(opDesc_->inputDesc[index]); +} + +const size_t OpRunner::GetInputNumDims(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + + return aclGetTensorDescNumDims(opDesc_->inputDesc[index]); +} + +aclDataType OpRunner::GetInputDataType(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ACL_DT_UNDEFINED; + } + + return aclGetTensorDescType(opDesc_->inputDesc[index]); +} + +aclFormat OpRunner::GetInputFormat(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ACL_FORMAT_UNDEFINED; + } + + return aclGetTensorDescFormat(opDesc_->inputDesc[index]); +} + +std::vector OpRunner::GetInputShape(size_t index) const +{ + std::vector ret; + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ret; + } + + auto desc = opDesc_->inputDesc[index]; + for (size_t i = 0; i < aclGetTensorDescNumDims(desc); ++i) { + int64_t dimSize; + if (aclGetTensorDescDimV2(desc, i, &dimSize) != ACL_SUCCESS) { + ERROR_LOG("get dims from tensor desc failed. dims index = %zu", i); + ret.clear(); + return ret; + } + ret.emplace_back(dimSize); + } + + return ret; +} + +size_t OpRunner::GetOutputSize(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + + return aclGetTensorDescSize(opDesc_->outputDesc[index]); +} + +const size_t OpRunner::GetOutputNumDims(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + + return aclGetTensorDescNumDims(opDesc_->outputDesc[index]); +} + +aclDataType OpRunner::GetOutputDataType(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ACL_DT_UNDEFINED; + } + + return aclGetTensorDescType(opDesc_->outputDesc[index]); +} + +aclFormat OpRunner::GetOutputFormat(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ACL_FORMAT_UNDEFINED; + } + + return aclGetTensorDescFormat(opDesc_->outputDesc[index]); +} + +std::vector OpRunner::GetOutputShape(size_t index) const +{ + std::vector ret; + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ret; + } + + auto desc = opDesc_->outputDesc[index]; + for (size_t i = 0; i < aclGetTensorDescNumDims(desc); ++i) { + int64_t dimSize; + if (aclGetTensorDescDimV2(desc, i, &dimSize) != ACL_SUCCESS) { + ERROR_LOG("get dims from tensor desc failed. dims index = %zu", i); + ret.clear(); + return ret; + } + ret.emplace_back(dimSize); + } + return ret; +} + +size_t OpRunner::GetInputElementCount(size_t index) const +{ + if (index >= opDesc_->inputDesc.size()) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + + return aclGetTensorDescElementCount(opDesc_->inputDesc[index]); +} + +size_t OpRunner::GetOutputElementCount(size_t index) const +{ + if (index >= opDesc_->outputDesc.size()) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + + return aclGetTensorDescElementCount(opDesc_->outputDesc[index]); +} + +bool OpRunner::RunOp(int64_t caseId) +{ + caseId_ = caseId; + for (size_t i = 0; i < numInputs_; ++i) { + auto size = GetInputSize(i); + aclrtMemcpyKind kind = ACL_MEMCPY_HOST_TO_DEVICE; + if (g_isDevice) { + kind = ACL_MEMCPY_DEVICE_TO_DEVICE; + } + if (aclrtMemcpy(devInputs_[i], size, hostInputs_[i], size, kind) != ACL_SUCCESS) { + ERROR_LOG("Copy input[%zu] failed", i); + return false; + } + INFO_LOG("Copy input[%zu] success", i); + } + + aclrtStream stream = nullptr; + if (aclrtCreateStream(&stream) != ACL_SUCCESS) { + ERROR_LOG("Create stream failed"); + return false; + } + INFO_LOG("Create stream success"); + + size_t workspaceSize = 0; + aclOpExecutor *handle = nullptr; + auto ret = aclnnAddsCustomGetWorkspaceSize(inputTensor_[0], caseId, outputTensor_[0], &workspaceSize, &handle); + if (ret != ACL_SUCCESS) { + (void)aclrtDestroyStream(stream); + ERROR_LOG("Get Operator Workspace failed. error code is %d", static_cast(ret)); + return false; + } + INFO_LOG("Execute aclnnAddsCustomGetWorkspaceSize success, workspace size %lu", workspaceSize); + + if (workspaceSize != 0) { + if (aclrtMalloc(&workspace_, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory failed"); + } + } + + ret = aclnnAddsCustom(workspace_, workspaceSize, handle, stream); + if (ret != ACL_SUCCESS) { + (void)aclrtDestroyStream(stream); + ERROR_LOG("Execute Operator failed. error code is %d", static_cast(ret)); + return false; + } + INFO_LOG("Execute aclnnAddsCustom success"); + + // The unit of 5000 is ms. + ret = aclrtSynchronizeStreamWithTimeout(stream, 5000); + if (ret != SUCCESS) { + ERROR_LOG("Synchronize stream failed. error code is %d", static_cast(ret)); + (void)aclrtDestroyStream(stream); + return false; + } + INFO_LOG("Synchronize stream success"); + + for (size_t i = 0; i < numOutputs_; ++i) { + auto size = GetOutputSize(i); + aclrtMemcpyKind kind = ACL_MEMCPY_DEVICE_TO_HOST; + if (g_isDevice) { + kind = ACL_MEMCPY_DEVICE_TO_DEVICE; + } + if (aclrtMemcpy(hostOutputs_[i], size, devOutputs_[i], size, kind) != ACL_SUCCESS) { + INFO_LOG("Copy output[%zu] success", i); + (void)aclrtDestroyStream(stream); + return false; + } + INFO_LOG("Copy output[%zu] success", i); + } + + (void)aclrtDestroyStream(stream); + return true; +} + +int64_t OpRunner::GetCaseId() +{ + return caseId_; +} + +template void DoPrintData(const T *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << data[i]; + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void DoPrintFp16Data(const aclFloat16 *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << std::setprecision(4) << aclFloat16ToFloat(data[i]); + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void PrintData(const void *data, size_t count, aclDataType dataType, size_t elementsPerRow) +{ + if (data == nullptr) { + ERROR_LOG("Print data failed. data is nullptr"); + return; + } + + switch (dataType) { + case ACL_BOOL: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT8: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT8: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT16: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT16: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT32: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT32: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT64: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT64: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_FLOAT16: + DoPrintFp16Data(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_FLOAT: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_DOUBLE: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + default: + ERROR_LOG("Unsupported type: %d", dataType); + } +} + +void OpRunner::PrintInput(size_t index, size_t numElementsPerRow) +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numInputs_); + return; + } + + auto desc = opDesc_->inputDesc[index]; + PrintData(hostInputs_[index], GetInputElementCount(index), aclGetTensorDescType(desc), numElementsPerRow); +} + +void OpRunner::PrintOutput(size_t index, size_t numElementsPerRow) +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return; + } + + auto desc = opDesc_->outputDesc[index]; + PrintData(hostOutputs_[index], GetOutputElementCount(index), aclGetTensorDescType(desc), numElementsPerRow); +} diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/operator_desc.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/operator_desc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..90e0ac343405dfabff6c53b178190403fc8287d5 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/operator_desc.cpp @@ -0,0 +1,51 @@ +/** + * @file operator_desc.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "operator_desc.h" + +#include "common.h" + +using namespace std; + +OperatorDesc::OperatorDesc() {} + +OperatorDesc::~OperatorDesc() +{ + for (auto *desc : inputDesc) { + aclDestroyTensorDesc(desc); + } + + for (auto *desc : outputDesc) { + aclDestroyTensorDesc(desc); + } +} + +OperatorDesc &OperatorDesc::AddInputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format) +{ + aclTensorDesc *desc = aclCreateTensorDesc(dataType, numDims, dims, format); + if (desc == nullptr) { + ERROR_LOG("create tensor failed"); + return *this; + } + inputDesc.emplace_back(desc); + return *this; +} + +OperatorDesc &OperatorDesc::AddOutputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, + aclFormat format) +{ + aclTensorDesc *desc = aclCreateTensorDesc(dataType, numDims, dims, format); + if (desc == nullptr) { + ERROR_LOG("create tensor failed"); + return *this; + } + + outputDesc.emplace_back(desc); + return *this; +} diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom.json b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom.json new file mode 100644 index 0000000000000000000000000000000000000000..a54432512d5c2aebe363db34bca0d0ad9d106b86 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom.json @@ -0,0 +1,37 @@ +[ + { + "op": "AddsCustom", + "language": "cpp", + "input_desc": [ + { + "name": "x", + "param_type": "required", + "format": [ + "ND" + ], + "type": [ + "float" + ] + } + ], + "output_desc": [ + { + "name": "z", + "param_type": "required", + "format": [ + "ND" + ], + "type": [ + "float" + ] + } + ], + "attr": [ + { + "name": "case_id", + "type": "int", + "value": 1 + } + ] + } +] \ No newline at end of file diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6c91c15b50b0f98dce7b4394cea24f1f6c888f4a --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp @@ -0,0 +1,56 @@ +/** + * @file adds_custom.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "../op_kernel/adds_custom_tiling.h" +#include "register/op_def_registry.h" + +namespace optiling { +static ge::graphStatus TilingFunc(gert::TilingContext *context) +{ + constexpr uint32_t BLOCK_DIM = 16; + context->SetBlockDim(BLOCK_DIM); + + // set tiling_key + auto attrs = context->GetAttrs(); + const int64_t *caseId = attrs->GetInt(0); + context->SetTilingKey(*caseId); + + AddsCustomTilingData *tiling = context->GetTilingData(); + constexpr uint32_t M = 8192; + constexpr uint32_t N = 128; + constexpr uint32_t TILE_M = 512; + constexpr uint32_t TILE_N = 8; + constexpr uint32_t LOOP_ONE_CORE = M / TILE_M; + tiling->m = M; + tiling->n = N; + tiling->tileM = TILE_M; + tiling->tileN = TILE_N; + tiling->loopOneCore = LOOP_ONE_CORE; + + // set workspace size + size_t *currentWorkspace = context->GetWorkspaceSizes(1); + currentWorkspace[0] = 0; + + return ge::GRAPH_SUCCESS; +} +} // namespace optiling + +namespace ops { +class AddsCustom : public OpDef { +public: + explicit AddsCustom(const char *name) : OpDef(name) + { + this->Input("x").ParamType(REQUIRED).DataType({ge::DT_FLOAT}).Format({ge::FORMAT_ND}); + this->Output("z").ParamType(REQUIRED).DataType({ge::DT_FLOAT}).Format({ge::FORMAT_ND}); + this->AICore().SetTiling(optiling::TilingFunc).AddConfig("ascend910b"); + this->Attr("case_id").Int(1); + } +}; +OP_ADD(AddsCustom); +} // namespace ops diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8d0ad4cd982dbcded1e15395f0a2b7736e476f2c --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom.cpp @@ -0,0 +1,33 @@ +/** + * @file adds_custom.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "kernel_operator.h" +#include "adds_custom_v1.h" +#include "adds_custom_v2.h" +#include "adds_custom_v3.h" + +extern "C" __global__ __aicore__ void adds_custom(GM_ADDR x, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling) +{ + REGISTER_TILING_DEFAULT(AddsCustomTilingData); + GET_TILING_DATA(tilingData, tiling); + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0); + if (TILING_KEY_IS(1UL)) { + KernelAddsV1 op; + op.Init(x, z, &tilingData); + op.Process(); + } else if (TILING_KEY_IS(2UL)) { + KernelAddsV2 op; + op.Init(x, z, &tilingData); + op.Process(); + } else if (TILING_KEY_IS(3UL)) { + KernelAddsV3 op; + op.Init(x, z, &tilingData); + op.Process(); + } +} diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_tiling.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_tiling.h new file mode 100644 index 0000000000000000000000000000000000000000..8730ae52805e273da6f4b8d844655071c84c4bac --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_tiling.h @@ -0,0 +1,22 @@ +/** + * @file adds_custom_tiling.h + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef ADDS_CUSTOM_TILING_H +#define ADDS_CUSTOM_TILING_H +#include + +class AddsCustomTilingData { +public: + uint32_t m; + uint32_t n; + uint32_t tileM; + uint32_t tileN; + uint32_t loopOneCore; +}; +#endif // ADDS_CUSTOM_TILING_H diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v1.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v1.h new file mode 100644 index 0000000000000000000000000000000000000000..70d86c001325093ff36425a6b8f539b9e7fc6951 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v1.h @@ -0,0 +1,88 @@ +/** + * @file adds_custom_v1.h + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef ADDS_CUSTOM_V1_H +#define ADDS_CUSTOM_V1_H +#include "kernel_operator.h" +#include "adds_custom_tiling.h" + +using AscendC::TPosition; +class KernelAddsV1 { +public: + __aicore__ inline KernelAddsV1() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR z, AddsCustomTilingData *tilingPtr) + { + tiling = tilingPtr; + xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * tiling->tileN); + zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * tiling->tileN); + // the gm address conflict happens when multi cores visit the same addr range(512Bytes) + // we disable the L2 cache mode to highlight the influence of the gm address conflict + xGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE); + zGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE); + pipe.InitBuffer(inQueueX, BUFFER_NUM, tiling->tileM * tiling->tileN * sizeof(float)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, tiling->tileM * tiling->tileN * sizeof(float)); + } + __aicore__ inline void Process() + { + for (int32_t i = 0; i < tiling->loopOneCore; i++) { + // the following two SyncAll in this case are unnecessary actually, + // we just used them to highlight the influence of gm address conflict in each loop + AscendC::SyncAll(); + CopyIn(i); + Compute(); + AscendC::SyncAll(); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::DataCopyParams params; + params.blockCount = tiling->tileM; + params.blockLen = tiling->tileN * sizeof(float) / BLOCK_SIZE; + params.srcStride = (tiling->n - tiling->tileN) * sizeof(float) / BLOCK_SIZE; + params.dstStride = 0; + AscendC::DataCopy(xLocal, xGm[progress * tiling->tileM * tiling->n], params); + inQueueX.EnQue(xLocal); + } + __aicore__ inline void Compute() + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + constexpr float scale = 2.0; + AscendC::Adds(zLocal, xLocal, scale, tiling->tileM * tiling->tileN); + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + AscendC::DataCopyParams params; + params.blockCount = tiling->tileM; + params.blockLen = tiling->tileN * sizeof(float) / BLOCK_SIZE; + params.srcStride = 0; + params.dstStride = (tiling->n - tiling->tileN) * sizeof(float) / BLOCK_SIZE; + AscendC::DataCopy(zGm[progress * tiling->tileM * tiling->n], zLocal, params); + outQueueZ.FreeTensor(zLocal); + } + +private: + static constexpr int32_t BUFFER_NUM = 2; + static constexpr int32_t BLOCK_SIZE = 32; + + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue outQueueZ; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor zGm; + AddsCustomTilingData *tiling; +}; +#endif // ADDS_CUSTOM_V1_H \ No newline at end of file diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v2.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v2.h new file mode 100644 index 0000000000000000000000000000000000000000..ae5314a9039e163de55bc4251b9e2474800c1a62 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v2.h @@ -0,0 +1,94 @@ +/** + * @file adds_custom_v2.h + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef ADDS_CUSTOM_V2_H +#define ADDS_CUSTOM_V2_H +#include "kernel_operator.h" +#include "adds_custom_tiling.h" + +using AscendC::TPosition; +class KernelAddsV2 { +public: + __aicore__ inline KernelAddsV2() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR z, AddsCustomTilingData *tilingPtr) + { + tiling = tilingPtr; + xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * tiling->tileN); + zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * tiling->tileN); + // the gm address conflict happens when multi cores visit the same addr range(512Bytes) + // we disable the L2 cache mode to highlight the influence of the gm address conflict + xGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE); + zGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE); + pipe.InitBuffer(inQueueX, BUFFER_NUM, tiling->tileM * tiling->tileN * sizeof(float)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, tiling->tileM * tiling->tileN * sizeof(float)); + } + __aicore__ inline void Process() + { + for (int32_t i = 0; i < tiling->loopOneCore; i++) { + // adjust the loop order to avoid the gm address conflict: + // the loop order of core0 : 0, 1, 2, 3, ..., 13, 14, 15 + // the loop order of core1 : 1, 2, 3, 4, ..., 14, 15, 0 + // ... + // the loop order of core15 : 15, 0, 1, 2, ..., 12, 13, 14 + int32_t newProgress = (i + AscendC::GetBlockIdx()) % tiling->loopOneCore; + // the following two SyncAll in this case are unnecessary actually, + // we just used them to highlight the influence of gm address conflict in each loop + AscendC::SyncAll(); + CopyIn(newProgress); + Compute(); + AscendC::SyncAll(); + CopyOut(newProgress); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::DataCopyParams params; + params.blockCount = tiling->tileM; + params.blockLen = tiling->tileN * sizeof(float) / BLOCK_SIZE; + params.srcStride = (tiling->n - tiling->tileN) * sizeof(float) / BLOCK_SIZE; + params.dstStride = 0; + AscendC::DataCopy(xLocal, xGm[progress * tiling->tileM * tiling->n], params); + inQueueX.EnQue(xLocal); + } + __aicore__ inline void Compute() + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + constexpr float scale = 2.0; + AscendC::Adds(zLocal, xLocal, scale, tiling->tileM * tiling->tileN); + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + AscendC::DataCopyParams params; + params.blockCount = tiling->tileM; + params.blockLen = tiling->tileN * sizeof(float) / BLOCK_SIZE; + params.srcStride = 0; + params.dstStride = (tiling->n - tiling->tileN) * sizeof(float) / BLOCK_SIZE; + AscendC::DataCopy(zGm[progress * tiling->tileM * tiling->n], zLocal, params); + outQueueZ.FreeTensor(zLocal); + } + +private: + static constexpr int32_t BUFFER_NUM = 2; + static constexpr int32_t BLOCK_SIZE = 32; + + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue outQueueZ; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor zGm; + AddsCustomTilingData *tiling; +}; +#endif // ADDS_CUSTOM_V2_H \ No newline at end of file diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v3.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v3.h new file mode 100644 index 0000000000000000000000000000000000000000..caecdef5e8d0103a85c688c616ec0166d0fa90e6 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v3.h @@ -0,0 +1,89 @@ +/** + * @file adds_custom_v3.h + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef ADDS_CUSTOM_V3_H +#define ADDS_CUSTOM_V3_H +#include "kernel_operator.h" +#include "adds_custom_tiling.h" + +using AscendC::TPosition; +class KernelAddsV3 { +public: + __aicore__ inline KernelAddsV3() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR z, AddsCustomTilingData *tilingPtr) + { + tiling = tilingPtr; + // change the tile method from column split to row split + xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * tiling->tileM * tiling->n); + zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * tiling->tileM * tiling->n); + // the gm address conflict happens when multi cores visit the same addr range(512Bytes) + // we disable the L2 cache mode to highlight the influence of the gm address conflict + xGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE); + zGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE); + pipe.InitBuffer(inQueueX, BUFFER_NUM, tiling->tileM * tiling->tileN * sizeof(float)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, tiling->tileM * tiling->tileN * sizeof(float)); + } + __aicore__ inline void Process() + { + for (int32_t i = 0; i < tiling->loopOneCore; i++) { + // the following two SyncAll in this case are unnecessary actually, + // we just used them to highlight the influence of gm address conflict in each loop + AscendC::SyncAll(); + CopyIn(i); + Compute(); + AscendC::SyncAll(); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::DataCopyParams params; + params.blockCount = tiling->tileM; + params.blockLen = tiling->tileN * sizeof(float) / BLOCK_SIZE; + params.srcStride = (tiling->n - tiling->tileN) * sizeof(float) / BLOCK_SIZE; + params.dstStride = 0; + AscendC::DataCopy(xLocal, xGm[progress * tiling->tileN], params); + inQueueX.EnQue(xLocal); + } + __aicore__ inline void Compute() + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + constexpr float scale = 2.0; + AscendC::Adds(zLocal, xLocal, scale, tiling->tileM * tiling->tileN); + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + AscendC::DataCopyParams params; + params.blockCount = tiling->tileM; + params.blockLen = tiling->tileN * sizeof(float) / BLOCK_SIZE; + params.srcStride = 0; + params.dstStride = (tiling->n - tiling->tileN) * sizeof(float) / BLOCK_SIZE; + AscendC::DataCopy(zGm[progress * tiling->tileN], zLocal, params); + outQueueZ.FreeTensor(zLocal); + } + +private: + static constexpr int32_t BUFFER_NUM = 2; + static constexpr int32_t BLOCK_SIZE = 32; + + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue outQueueZ; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor zGm; + AddsCustomTilingData *tiling; +}; +#endif // ADDS_CUSTOM_V3_H \ No newline at end of file diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md b/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md index 34c96391e766ab9fc59131b5b86a9abf5af1967f..03fdd24fc60abb479f43f9409677221f14390e45 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md @@ -1 +1,161 @@ -MATA地址冲突(待补充) \ No newline at end of file +## 概述 + +本样例基于AddsCustom算子工程,介绍了同地址冲突的影响以及两种解决方法。 + +## 目录结构介绍 + +``` +├── 15_mata_address_conflict // 同地址冲突样例工程目录 +│ ├── AclNNInvocation // 通过单算子API调用的方式调用AddsCustom算子 +│ ├── AddsCustom // AddsCustom算子工程 +│ ├── AddsCustom.json // AddsCustom算子的原型定义json文件 +│ └── install.sh // 脚本,调用msOpGen生成自定义算子工程,并编译 +``` + +## 算子描述 + +Adds算子实现了一个Tensor与标量值2.0相加,返回相加结果的功能。对应的数学表达式为: + +``` +z = x + 2.0 +``` + +本样例主要介绍数据搬运中的同地址冲突对搬运效率的影响,在global memory的数据访问中,数据访问请求(读/写)在芯片内部会按照512 Bytes对齐进行地址转换,同一时刻如果多核的数据访问请求在转换后落在连续的512 Bytes范围内,出于数据一致性的要求,芯片会对落入同一个512Bytes范围内的请求进行串行处理,导致搬运效率降低,即发生了同地址访问现象。 + +当前算子执行机制保证用户kernel入参(包括workspace/tiling)的地址是512 Bytes对齐的,因此用户只需要根据地址的偏移量即可判断两个地址是否会落入连续的512 Bytes范围内。 + +## 算子规格描述 + + + + + + + + + + + +
算子类型(OpType)Adds
算子输入nameshapedata typeformat
x8192 * 128floatND
算子输出z8192 * 128floatND
核函数名adds_custom
+ +## 支持的产品型号 + +本样例支持如下产品型号: + +- Atlas A2训练系列产品/Atlas 800I A2推理产品 + +## 算子工程介绍 + +其中,算子工程目录AddsCustom包含算子的实现文件,如下所示: + +``` +├── AddsCustom // AddsCustom自定义算子工程 +│ ├── op_host // host侧实现文件 +│ └── op_kernel // kernel侧实现文件 +``` + +CANN软件包中提供了工程创建工具msOpGen,AddsCustom算子工程可通过AddsCustom.json自动创建,自定义算子工程具体请参考[Ascend C算子开发](https://hiascend.com/document/redirect/CannCommunityOpdevAscendC)>工程化算子开发>创建算子工程 章节。 + +创建完自定义算子工程后,开发者重点需要完成算子host和kernel文件的功能开发。为简化样例运行流程,本样例已在AddsCustom目录中准备好了必要的算子实现,install.sh脚本会创建一个CustomOp目录,并将算子实现文件复制到对应目录下,再编译算子。 + +备注:CustomOp目录为生成目录,每次执行install.sh脚本都会删除该目录并重新生成,切勿在该目录下编码算子,会存在丢失风险。 + +## 编译运行样例算子 + +针对自定义算子工程,编译运行包含如下步骤: + +- 调用msOpGen工具生成自定义算子工程; +- 完成算子host和kernel实现; +- 编译自定义算子工程生成自定义算子包; +- 安装自定义算子包到自定义算子库中; +- 调用执行自定义算子; + +详细操作如下所示。 + +### 1. 获取源码包 + +编译运行此样例前,请参考[准备:获取样例代码](../README.md#codeready)获取源码包。 + +### 2. 生成自定义算子工程,复制host和kernel实现并编译算子 + +- 切换到msOpGen脚本install.sh所在目录 + + ```bash + # 若开发者以git命令行方式clone了master分支代码,并切换目录 + cd ${git_clone_path}/samples/operator/ascendc/4_best_practices/15_mata_address_conflict + ``` +- 调用脚本,生成自定义算子工程,复制host和kernel实现并编译算子 + + - 方式一:配置环境变量运行脚本 + 请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量命令。 + - 默认路径,root用户安装CANN软件包 + + ```bash + export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + ``` + - 默认路径,非root用户安装CANN软件包 + + ```bash + export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + ``` + - 指定路径install_path,安装CANN软件包 + + ```bash + export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest + ``` + + 运行install.sh脚本 + + ```bash + bash install.sh -v [SOC_VERSION] + ``` + - 方式二:指定命令行安装路径来运行脚本 + ```bash + bash install.sh -v [SOC_VERSION] -i [ASCEND_INSTALL_PATH] + ``` + + 参数说明: + + - SOC_VERSION:昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2训练系列产品/Atlas 800I A2推理产品 + - ASCEND_INSTALL_PATH:CANN软件包安装路径 + + 脚本运行成功后,会在当前目录下创建CustomOp目录,编译完成后,会在CustomOp/build_out中,生成自定义算子安装包custom_opp_\_\.run,例如“custom_opp_ubuntu_x86_64.run”。 + + 备注:如果要使用dump调试功能,需要移除op_host内的Atlas 训练系列产品、Atlas 200/500 A2 推理产品的配置项。 + +### 3. 部署自定义算子包 + +- 部署自定义算子包前,请确保存在自定义算子包默认部署路径环境变量ASCEND_OPP_PATH + + ```bash + echo $ASCEND_OPP_PATH + # 输出示例 /usr/local/Ascend/ascend-toolkit/latest/opp + + # 若没有,则需导出CANN环境变量 + source [ASCEND_INSTALL_PATH]/bin/setenv.bash + # 例如 source /usr/local/Ascend/ascend-toolkit/latest/bin/setenv.bash + ``` + + 参数说明: + + - ASCEND_INSTALL_PATH:CANN软件包安装路径,一般和上一步中指定的路径保持一致 +- 在自定义算子安装包所在路径下,执行如下命令安装自定义算子包 + + ```bash + cd CustomOp/build_out + ./custom_opp__.run + ``` + + 命令执行成功后,自定义算子包中的相关文件将部署至opp算子库环境变量ASCEND_OPP_PATH指向的的vendors/customize目录中。 + +### 4. 调用执行算子工程 + +- [单算子API调用AddsCustom算子工程](./AclNNInvocation/README.md) + +## 更新说明 + + +| 时间 | 更新事项 | +| ---------- | -------- | +| 2025/07/03 | 新增样例 | diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/install.sh b/operator/ascendc/4_best_practices/15_mata_address_conflict/install.sh new file mode 100755 index 0000000000000000000000000000000000000000..24a0c35a2c68371bd6ce85cf562b215eb27b07ff --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/install.sh @@ -0,0 +1,58 @@ +#!/bin/bash +SHORT=v:,i:, +LONG=soc-version:,install-path:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" + +while :; do + case "$1" in + -v | --soc-version) + SOC_VERSION="$2" + shift 2 + ;; + -i | --install-path) + ASCEND_INSTALL_PATH="$2" + shift 2 + ;; + --) + shift + break + ;; + *) + echo "[ERROR] Unexpected option: $1" + break + ;; + esac +done + +VERSION_LIST="Ascend910B1 Ascend910B2 Ascend910B3 Ascend910B4" +if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then + echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]" + exit -1 +fi + +if [ -n "$ASCEND_INSTALL_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH +elif [ -n "$ASCEND_HOME_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH +else + if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then + _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + else + _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + fi +fi +source $_ASCEND_INSTALL_PATH/bin/setenv.bash +export ASCEND_HOME_PATH=$_ASCEND_INSTALL_PATH + +OP_NAME=AddsCustom +# Generate the op framework +rm -rf CustomOp && msopgen gen -i $OP_NAME.json -c ai_core-${SOC_VERSION} -lan cpp -out CustomOp +# Copy op implementation files to CustomOp +rm -rf CustomOp/op_host/*.cpp +rm -rf CustomOp/op_kernel/*.h && rm -rf CustomOp/op_kernel/*.cpp +cp -rf $OP_NAME/op_kernel CustomOp/ +cp -rf $OP_NAME/op_host CustomOp/ + +# Build CustomOp project +(cd CustomOp && bash build.sh) \ No newline at end of file diff --git a/operator/ascendc/4_best_practices/README.md b/operator/ascendc/4_best_practices/README.md index f5379bbbf0341c433229ff3717bede4e0f4c4ccc..c40fe61a7e48c83d552db737f1cd0bfdb573a880 100644 --- a/operator/ascendc/4_best_practices/README.md +++ b/operator/ascendc/4_best_practices/README.md @@ -8,6 +8,7 @@ | ------------------------------- | ------------------------------------------ | ------------------------------------------ | | [4_bank_conflict](./4_bank_conflict) | 基于Ascend C的bank冲突性能优化样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 | | [6_group_matmul](./6_group_matmul) | 基于Ascend C的group matmul算子性能优化样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 | +| [15_mata_address_conflict](./15_mata_address_conflict) | 基于Ascend C的同地址冲突性能优化样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 | | [21_all_gather_matmul_custom](./21_all_gather_matmul_custom) | 基于Ascend C的AllGatherMatmul算子性能调优样例 | Atlas A2训练系列产品 | | [22_matmul_reduce_scatter_custom](./22_matmul_reduce_scatter_custom) | 基于Ascend C的MatmulReduceScatter算子性能调优样例 | Atlas A2训练系列产品 | | [23_matmul_all_reduce_custom](./23_matmul_all_reduce_custom) | 基于Ascend C的MatmulAllReduce算子性能调优样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 |