diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/README.md b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/README.md deleted file mode 100644 index ae31f00d758fcec8308ba9e10d2a5f7e3020a019..0000000000000000000000000000000000000000 --- a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/README.md +++ /dev/null @@ -1 +0,0 @@ -CACHE MISS优化 preload dcci(待补充) \ No newline at end of file diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/README.md b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d3e63bedf9e192e46766121accfa6842555c6044 --- /dev/null +++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/README.md @@ -0,0 +1,76 @@ +## 目录结构介绍 + +``` +├── AclNNInvocation //通过单算子API调用的方式调用AddCustom算子 +│ ├── inc // 头文件目录 +│ │ ├── common.h // 声明公共方法类,用于读取二进制文件 +│ │ ├── op_runner.h // 算子描述声明文件,包含算子输入/输出,算子类型以及输入描述与输出描述 +│ │ └── operator_desc.h // 算子运行相关信息声明文件,包含算子输入/输出个数,输入/输出大小等 +│ ├── input // 存放脚本生成的输入数据目录 +│ ├── scripts +│ │ ├── acl.json // acl配置文件 +│ │ ├── gen_data.py // 输入数据和真值数据生成脚本 +│ │ └── verify_result.py // 精度校验脚本 +│ ├── src +│ │ ├── CMakeLists.txt // 编译规则文件 +│ │ ├── common.cpp // 公共函数,读取二进制文件函数的实现文件 +│ │ ├── main.cpp // 单算子调用应用的入口 +│ │ ├── op_runner.cpp // 单算子调用主体流程实现文件 +│ │ └── operator_desc.cpp // 构造算子的输入与输出描述 +│ └── run.sh // 执行命令脚本 +``` + +## 代码实现介绍 + +完成自定义算子的开发部署后,可以通过单算子调用的方式来验证单算子的功能。src/main.cpp代码为单算子API执行方式。单算子API执行是基于C语言的API执行算子,无需提供单算子描述文件进行离线模型的转换,直接调用单算子API接口。 + +自定义算子编译部署后,会自动生成单算子API,可以直接在应用程序中调用。算子API的形式一般定义为“两段式接口”,形如: + +```cpp +// 获取算子使用的workspace空间大小 +aclnnStatus aclnnAddCustomGetWorkspaceSize( + const aclTensor *x, + const aclTensor *y, + int64_t caseId, + const aclTensor *out, + uint64_t *workspaceSize, + aclOpExecutor **executor); +// 执行算子 +aclnnStatus aclnnAddCustom( + void *workspace, + uint64_t workspaceSize, + aclOpExecutor *executor, + 
aclrtStream stream); +``` + +其中aclnnAddCustomGetWorkspaceSize为第一段接口,主要用于计算本次API调用计算过程中需要多少的workspace内存。获取到本次API计算需要的workspace大小之后,按照workspaceSize大小申请Device侧内存,然后调用第二段接口aclnnAddCustom执行计算。具体参考[单算子API调用](https://hiascend.com/document/redirect/CannCommunityAscendCInVorkSingleOp)章节。 + +## 运行样例算子 + +### 1. 编译算子工程 + +运行此样例前,请参考[编译算子工程](../README.md#operatorcompile)完成前期准备。 + +### 2. 单算子API调用样例运行 + +- 进入到样例目录 + + 以命令行方式下载样例代码,master分支为例。 + + ```bash + cd ${git_clone_path}/samples/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation + ``` +- 样例执行 + + 样例执行过程中会自动生成测试数据,然后编译与运行单算子API调用样例,最后检验运行结果。具体过程可参见run.sh脚本。 + + ```bash + bash run.sh + ``` + +## 更新说明 + + +| 时间 | 更新事项 | +| ---------- | ------------ | +| 2025/07/14 | 新增本readme | diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/inc/common.h b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/inc/common.h new file mode 100644 index 0000000000000000000000000000000000000000..fadb5c80868bd563952c2080171859de0ab3120c --- /dev/null +++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/inc/common.h @@ -0,0 +1,45 @@ +/** + * @file common.h + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef COMMON_H +#define COMMON_H + +#include +#include +#include +#include +#include + +#include "acl/acl.h" + +#define SUCCESS 0 +#define FAILED 1 + +#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) +#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) +#define ERROR_LOG(fmt, args...) 
fprintf(stderr, "[ERROR] " fmt "\n", ##args) + +/** + * @brief Read data from file + * @param [in] filePath: file path + * @param [out] fileSize: file size + * @return read result + */ +bool ReadFile(const std::string &filePath, size_t fileSize, void *buffer, size_t bufferSize); + +/** + * @brief Write data to file + * @param [in] filePath: file path + * @param [in] buffer: data to write to file + * @param [in] size: size to write + * @return write result + */ +bool WriteFile(const std::string &filePath, const void *buffer, size_t size); + +#endif // COMMON_H diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/inc/op_runner.h b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/inc/op_runner.h new file mode 100644 index 0000000000000000000000000000000000000000..7b98d5730b5d14d0ee2b2bd8eb5fd1e7e06ca2ec --- /dev/null +++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/inc/op_runner.h @@ -0,0 +1,188 @@ +/** + * @file op_runner.h + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#ifndef OP_RUNNER_H +#define OP_RUNNER_H + +#include "acl/acl.h" +#include "aclnn/acl_meta.h" +#include "common.h" +#include "operator_desc.h" + +/** + * Op Runner + */ +class OpRunner { +public: + /** + * @brief Constructor + * @param [in] opDesc: op description + */ + explicit OpRunner(OperatorDesc *opDesc); + + /** + * @brief Destructor + */ + virtual ~OpRunner(); + + /** + * @brief Init op runner + */ + bool Init(); + + /** + * @brief Get number of inputs + * @return number of inputs + */ + const size_t NumInputs(); + + /** + * @brief Get number of outputs + * @return number of outputs + */ + const size_t NumOutputs(); + + /** + * @brief Get input size by index + * @param [in] index: input index + * @return size of the input + */ + const size_t GetInputSize(size_t index) const; + const size_t GetInputNumDims(size_t index) const; + aclDataType GetInputDataType(size_t index) const; + aclFormat GetInputFormat(size_t index) const; + + /** + * @brief Get output size by index + * @param [in] index: output index + * @return size of the output + */ + size_t GetOutputSize(size_t index) const; + const size_t GetOutputNumDims(size_t index) const; + aclDataType GetOutputDataType(size_t index) const; + aclFormat GetOutputFormat(size_t index) const; + + /** + * @brief Get input element count by index + * @param i[in] ndex: input index + * @return element count of the input + */ + size_t GetInputElementCount(size_t index) const; + + /** + * @brief Get output element count by index + * @param [in] index: output index + * @return element count of the output + */ + size_t GetOutputElementCount(size_t index) const; + + /** + * @brief Get input shape by index + * @param [in] index: input index + * @return shape of the output + */ + std::vector GetInputShape(size_t index) const; + + /** + * @brief Get output shape by index + * @param [in] index: output index + * @return shape of the output + */ + std::vector GetOutputShape(size_t index) const; + + /** + * @brief Get input 
buffer(host memory) by index + * @tparam T: data type + * @param [in] index: input index + * @return host address of the input + */ + template T *GetInputBuffer(size_t index) + { + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return nullptr; + } + return reinterpret_cast(hostInputs_[index]); + } + + /** + * @brief Get output buffer(host memory) by index + * @tparam T: data type + * @param [in] index: output index + * @return host address of the output + */ + template const T *GetOutputBuffer(size_t index) + { + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return nullptr; + } + + return reinterpret_cast(hostOutputs_[index]); + } + + /** + * @brief Print readable input by index + * @param [in] index: input index + * @param [in] elementsPerRow: number of elements per row + */ + void PrintInput(size_t index, size_t elementsPerRow = 16); + + /** + * @brief Print readable output by index + * @param [in] index: output index + * @param [in] elementsPerRow: number of elements per row + */ + void PrintOutput(size_t index, size_t elementsPerRow = 16); + + /** + * @brief Compile static op + * @return compile result + */ + bool CompileStaticOp(); + + /** + * @brief Compile dynamic op + * @return compile result + */ + bool CompileDynamicOp(); + + /** + * @brief Run op + * @return run result + */ + bool RunOp(int64_t caseId); + + /** + * @brief Get case index + * @return case index by user input + */ + int64_t GetCaseId(); + +private: + size_t numInputs_; + size_t numOutputs_; + void *workspace_; + int64_t caseId_; + + std::vector inputBuffers_; + std::vector outputBuffers_; + + std::vector devInputs_; + std::vector devOutputs_; + + std::vector hostInputs_; + std::vector hostOutputs_; + + std::vector inputTensor_; + std::vector outputTensor_; + OperatorDesc *opDesc_; +}; + +#endif // OP_RUNNER_H diff --git 
a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/inc/operator_desc.h b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/inc/operator_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..cf02d7cecc6fcdc05fdc9e62f7005d06345476e4 --- /dev/null +++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/inc/operator_desc.h @@ -0,0 +1,57 @@ +/** + * @file operator_desc.h + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef OPERATOR_DESC_H +#define OPERATOR_DESC_H + +#include +#include + +#include "acl/acl.h" + +/** + * Op description + */ +struct OperatorDesc { + /** + * Constructor + */ + explicit OperatorDesc(); + + /** + * Destructor + */ + virtual ~OperatorDesc(); + + /** + * Add an input tensor description + * @param [in] dataType: data type + * @param [in] numDims: number of dims + * @param [in] dims: dims + * @param [in] format: format + * @return OperatorDesc + */ + OperatorDesc &AddInputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format); + + /** + * Add an output tensor description + * @param [in] dataType: data type + * @param [in] numDims: number of dims + * @param [in] dims: dims + * @param [in] format: format + * @return OperatorDesc + */ + OperatorDesc &AddOutputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format); + + std::string opType; + std::vector inputDesc; + std::vector outputDesc; +}; + +#endif // OPERATOR_DESC_H diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/run.sh b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/run.sh new file mode 100755 index 
0000000000000000000000000000000000000000..894fec61c4d01d3116221d42174d5c570ab51864 --- /dev/null +++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/run.sh @@ -0,0 +1,77 @@ +#!/bin/bash +CURRENT_DIR=$( + cd $(dirname ${BASH_SOURCE:-$0}) + pwd +) + +if [ -n "$ASCEND_INSTALL_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH +elif [ -n "$ASCEND_HOME_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH +else + if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then + _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + else + _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + fi +fi +source $_ASCEND_INSTALL_PATH/bin/setenv.bash +export DDK_PATH=$_ASCEND_INSTALL_PATH +export NPU_HOST_LIB=$_ASCEND_INSTALL_PATH/$(arch)-$(uname -s | tr '[:upper:]' '[:lower:]')/devlib + +function main { + # 1. 清除遗留生成文件和日志文件 + rm -rf $HOME/ascend/log/* + rm -rf ./input && mkdir -p ./input + rm -rf ./output && mkdir -p ./output + + # 2. 生成输入数据和真值数据 + cd $CURRENT_DIR + python3 scripts/gen_data.py + if [ $? -ne 0 ]; then + echo "ERROR: generate input data failed!" + return 1 + fi + echo "INFO: generate input data success!" + + # 3. 编译可执行文件 + cd $CURRENT_DIR + rm -rf build + mkdir -p build + cd build + cmake ../src -DCMAKE_SKIP_RPATH=TRUE + if [ $? -ne 0 ]; then + echo "ERROR: cmake failed!" + return 1 + fi + echo "INFO: cmake success!" + make + if [ $? -ne 0 ]; then + echo "ERROR: make failed!" + return 1 + fi + echo "INFO: make success!" + + # 4. 运行可执行文件 + export LD_LIBRARY_PATH=$_ASCEND_INSTALL_PATH/opp/vendors/customize/op_api/lib:$LD_LIBRARY_PATH + export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH} + cd $CURRENT_DIR/output + echo "INFO: execute op!" + msprof --application="./execute_add_op" --ai-core=on --l2=on --output=./prof + if [ $? -ne 0 ]; then + echo "ERROR: acl executable run failed! please check your project!" + return 1 + fi + echo "INFO: acl executable run success!" + + # 5. 
精度比对 + cd $CURRENT_DIR + python3 scripts/verify_result.py output/output_z_1.bin output/golden.bin + python3 scripts/verify_result.py output/output_z_2.bin output/golden.bin + if [ $? -ne 0 ]; then + echo "ERROR: verify result failed!" + return 1 + fi +} + +main diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/scripts/acl.json b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/scripts/acl.json new file mode 100644 index 0000000000000000000000000000000000000000..9e26dfeeb6e641a33dae4961196235bdb965b21b --- /dev/null +++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/scripts/acl.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/scripts/gen_data.py b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/scripts/gen_data.py new file mode 100644 index 0000000000000000000000000000000000000000..17b3d7119d6aa14a64464339e0b2b0457bf61a64 --- /dev/null +++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/scripts/gen_data.py @@ -0,0 +1,28 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+# =============================================================================== + +import numpy as np + + +def gen_golden_data_simple(): + row = 5120 + col = 5120 + input_x = np.random.uniform(1, 10, [row, col]).astype(np.float32) + input_y = np.random.uniform(1, 10, [row, col * 3]).astype(np.float32) + y_blocks = np.split(input_y, 3, axis=1) + result_blocks = [input_x + block for block in y_blocks] + golden = np.hstack(result_blocks) + input_x.tofile("./input/input_x.bin") + input_y.tofile("./input/input_y.bin") + golden.tofile("./output/golden.bin") + + +if __name__ == "__main__": + gen_golden_data_simple() diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/scripts/verify_result.py b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/scripts/verify_result.py new file mode 100644 index 0000000000000000000000000000000000000000..a5019f30fdf1e34188f6f777e5ef3e4aad3491c2 --- /dev/null +++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/scripts/verify_result.py @@ -0,0 +1,53 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+# =============================================================================== + +import sys +import numpy as np + +# for float32 +relative_tol = 1e-4 +absolute_tol = 1e-5 +error_tol = 1e-4 + + +def verify_result(output, golden): + output = np.fromfile(output, dtype=np.float32).reshape(-1) + golden = np.fromfile(golden, dtype=np.float32).reshape(-1) + different_element_results = np.isclose(output, + golden, + rtol=relative_tol, + atol=absolute_tol, + equal_nan=True) + different_element_indexes = np.where(different_element_results == False)[0] + for index in range(len(different_element_indexes)): + real_index = different_element_indexes[index] + golden_data = golden[real_index] + output_data = output[real_index] + print( + "data index: %06d, expected: %-.9f, actual: %-.9f, rdiff: %-.6f" % + (real_index, golden_data, output_data, + abs(output_data - golden_data) / golden_data)) + if index == 100: + break + error_ratio = float(different_element_indexes.size) / golden.size + print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol)) + return error_ratio <= error_tol + + +if __name__ == '__main__': + try: + res = verify_result(sys.argv[1], sys.argv[2]) + if not res: + raise ValueError("[ERROR] result error") + else: + print("test pass") + except Exception as e: + print(e) + sys.exit(1) diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/CMakeLists.txt b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..32bed518d88a80a72e7892f2901b7adfdb61eea1 --- /dev/null +++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/CMakeLists.txt @@ -0,0 +1,65 @@ +# Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. 
+ +# CMake lowest version requirement +cmake_minimum_required(VERSION 3.5.1) + +# project information +project(acl_execute_add) + +# Compile options +add_compile_options(-std=c++11) + +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "../output") +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "../output") + +set(INC_PATH $ENV{DDK_PATH}) + +if (NOT DEFINED ENV{DDK_PATH}) + set(INC_PATH "/usr/local/Ascend/ascend-toolkit/latest") + message(STATUS "set default INC_PATH: ${INC_PATH}") +else () + message(STATUS "env INC_PATH: ${INC_PATH}") +endif() + +set(CUST_PKG_PATH "${INC_PATH}/opp/vendors/customize/op_api") + +set(LIB_PATH $ENV{NPU_HOST_LIB}) + +# Dynamic libraries in the stub directory can only be used for compilation +if (NOT DEFINED ENV{NPU_HOST_LIB}) + string(TOLOWER "${CMAKE_SYSTEM_NAME}" SYSTEM_NAME_LOWER) + set(LIB_PATH "/usr/local/Ascend/ascend-toolkit/latest/${CMAKE_SYSTEM_PROCESSOR}-${SYSTEM_NAME_LOWER}/devlib") + message(STATUS "set default LIB_PATH: ${LIB_PATH}") +else () + message(STATUS "env LIB_PATH: ${LIB_PATH}") +endif() + +# Header path +include_directories( + ../inc + ${INC_PATH}/include + ${CUST_PKG_PATH}/include +) + +# add host lib path +link_directories( + ${LIB_PATH} + ${CUST_PKG_PATH}/lib +) + +add_executable(execute_add_op + operator_desc.cpp + op_runner.cpp + main.cpp + common.cpp +) + +target_link_libraries(execute_add_op + ascendcl + cust_opapi + acl_op_compiler + nnopbase + stdc++ +) + +install(TARGETS execute_add_op DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/common.cpp b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/common.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d58716122d1defa71729cd257b878e7056ae8d14 --- /dev/null +++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/common.cpp @@ -0,0 +1,80 @@ +/** + * @file common.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. 
All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "common.h" + +#include +#include +#include + +#include + +extern bool g_isDevice; + +bool ReadFile(const std::string &filePath, size_t fileSize, void *buffer, size_t bufferSize) +{ + struct stat sBuf; + int fileStatus = stat(filePath.data(), &sBuf); + if (fileStatus == -1) { + ERROR_LOG("failed to get file %s", filePath.c_str()); + return false; + } + if (S_ISREG(sBuf.st_mode) == 0) { + ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); + return false; + } + + std::ifstream file; + file.open(filePath, std::ios::binary); + if (!file.is_open()) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + std::filebuf *buf = file.rdbuf(); + size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); + if (size == 0) { + ERROR_LOG("file size is 0"); + file.close(); + return false; + } + if (size > bufferSize) { + ERROR_LOG("file size is larger than buffer size"); + file.close(); + return false; + } + buf->pubseekpos(0, std::ios::in); + buf->sgetn(static_cast(buffer), size); + fileSize = size; + file.close(); + return true; +} + +bool WriteFile(const std::string &filePath, const void *buffer, size_t size) +{ + if (buffer == nullptr) { + ERROR_LOG("Write file failed. buffer is nullptr"); + return false; + } + + int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); + if (fd < 0) { + ERROR_LOG("Open file failed. 
path = %s", filePath.c_str()); + return false; + } + + size_t writeSize = write(fd, buffer, size); + (void)close(fd); + if (writeSize != size) { + ERROR_LOG("Write file Failed."); + return false; + } + + return true; +} diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/main.cpp b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d727b0a291cdd672c684bfd8ead027c5de92a97f --- /dev/null +++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/main.cpp @@ -0,0 +1,163 @@ +/** + * @file main.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include +#include +#include +#include +#include +#include "acl/acl.h" +#include "common.h" +#include "op_runner.h" + +bool g_isDevice = false; +int deviceId = 0; + +OperatorDesc CreateOpDesc() +{ + // define operator + constexpr uint32_t ROW = 5120; + constexpr uint32_t COL = 5120; + std::vector shapeX{ROW, COL}; + std::vector shapeY{ROW, COL*3}; + std::vector shapeZ{ROW, COL*3}; + aclDataType dataType = ACL_FLOAT; + aclFormat format = ACL_FORMAT_ND; + OperatorDesc opDesc; + opDesc.AddInputTensorDesc(dataType, shapeX.size(), shapeX.data(), format); + opDesc.AddInputTensorDesc(dataType, shapeY.size(), shapeY.data(), format); + opDesc.AddOutputTensorDesc(dataType, shapeZ.size(), shapeZ.data(), format); + return opDesc; +} + +bool SetInputData(OpRunner &runner) +{ + size_t fileSize = 0; + ReadFile("../input/input_x.bin", fileSize, runner.GetInputBuffer(0), runner.GetInputSize(0)); + ReadFile("../input/input_y.bin", fileSize, runner.GetInputBuffer(1), runner.GetInputSize(1)); + INFO_LOG("Set input success"); + return true; +} + +bool 
ProcessOutputData(OpRunner &runner) +{ + int64_t caseId = runner.GetCaseId(); + WriteFile("../output/output_z_" + std::to_string(caseId) + ".bin", runner.GetOutputBuffer(0), + runner.GetOutputSize(0)); + INFO_LOG("Write output success"); + return true; +} + +void DestroyResource() +{ + bool flag = false; + if (aclrtResetDevice(deviceId) != ACL_SUCCESS) { + ERROR_LOG("Reset device %d failed", deviceId); + flag = true; + } + INFO_LOG("Reset Device success"); + if (aclFinalize() != ACL_SUCCESS) { + ERROR_LOG("Finalize acl failed"); + flag = true; + } + if (flag) { + ERROR_LOG("Destroy resource failed"); + } else { + INFO_LOG("Destroy resource success"); + } +} + +bool InitResource() +{ + std::string output = "../output"; + + // acl.json is dump or profiling config file + if (aclInit("../scripts/acl.json") != ACL_SUCCESS) { + ERROR_LOG("acl init failed"); + return false; + } + + if (aclrtSetDevice(deviceId) != ACL_SUCCESS) { + ERROR_LOG("Set device failed. deviceId is %d", deviceId); + (void)aclFinalize(); + return false; + } + INFO_LOG("Set device[%d] success", deviceId); + + // runMode is ACL_HOST which represents app is running in host + // runMode is ACL_DEVICE which represents app is running in device + aclrtRunMode runMode; + if (aclrtGetRunMode(&runMode) != ACL_SUCCESS) { + ERROR_LOG("Get run mode failed"); + DestroyResource(); + return false; + } + g_isDevice = (runMode == ACL_DEVICE); + INFO_LOG("Get RunMode[%d] success", runMode); + + return true; +} + +bool RunOp(int64_t caseId) +{ + // create op desc + OperatorDesc opDesc = CreateOpDesc(); + + // create Runner + OpRunner opRunner(&opDesc); + if (!opRunner.Init()) { + ERROR_LOG("Init OpRunner failed"); + return false; + } + + // Load inputs + if (!SetInputData(opRunner)) { + ERROR_LOG("Set input data failed"); + return false; + } + + // Run op + if (!opRunner.RunOp(caseId)) { + ERROR_LOG("Run op failed"); + return false; + } + + // process output data + if (!ProcessOutputData(opRunner)) { + 
ERROR_LOG("Process output data failed"); + return false; + } + + INFO_LOG("Run op success"); + return true; +} + +int main(int argc, char **argv) +{ + if (!InitResource()) { + ERROR_LOG("Init resource failed"); + return FAILED; + } + INFO_LOG("Init resource success"); + + int64_t caseId = 1; + if (!RunOp(caseId)) { + DestroyResource(); + return FAILED; + } + + caseId = 2; + if (!RunOp(caseId)) { + DestroyResource(); + return FAILED; + } + + DestroyResource(); + return SUCCESS; +} diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/op_runner.cpp b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/op_runner.cpp new file mode 100644 index 0000000000000000000000000000000000000000..36d197bc5ad9e6028a8068616a0f7eb2210dfc77 --- /dev/null +++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/op_runner.cpp @@ -0,0 +1,462 @@ +/** + * @file op_runner.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#include "op_runner.h" + +#include +#include + +#include "acl/acl_op_compiler.h" +#include "aclnn_add_custom.h" +#include "common.h" + +using namespace std; + +extern bool g_isDevice; + +OpRunner::OpRunner(OperatorDesc *opDesc) : opDesc_(opDesc) +{ + numInputs_ = opDesc->inputDesc.size(); + numOutputs_ = opDesc->outputDesc.size(); + workspace_ = nullptr; +} + +OpRunner::~OpRunner() +{ + if (workspace_ != nullptr) { + (void)aclrtFree(workspace_); + } + for (size_t i = 0; i < numInputs_; ++i) { + (void)aclDestroyTensor(inputTensor_[i]); + (void)aclDestroyDataBuffer(inputBuffers_[i]); + (void)aclrtFree(devInputs_[i]); + if (g_isDevice) { + (void)aclrtFree(hostInputs_[i]); + } else { + (void)aclrtFreeHost(hostInputs_[i]); + } + } + + for (size_t i = 0; i < numOutputs_; ++i) { + (void)aclDestroyTensor(outputTensor_[i]); + (void)aclDestroyDataBuffer(outputBuffers_[i]); + (void)aclrtFree(devOutputs_[i]); + if (g_isDevice) { + (void)aclrtFree(hostOutputs_[i]); + } else { + (void)aclrtFreeHost(hostOutputs_[i]); + } + } +} + +bool OpRunner::Init() +{ + for (size_t i = 0; i < numInputs_; ++i) { + auto size = GetInputSize(i); + void *devMem = nullptr; + if (aclrtMalloc(&devMem, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + devInputs_.emplace_back(devMem); + inputBuffers_.emplace_back(aclCreateDataBuffer(devMem, size)); + + void *hostInput = nullptr; + if (g_isDevice) { + if (aclrtMalloc(&hostInput, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + } else { + if (aclrtMallocHost(&hostInput, size) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + } + if (hostInput == nullptr) { + ERROR_LOG("Malloc memory for input[%zu] failed", i); + return false; + } + hostInputs_.emplace_back(hostInput); + + aclTensor *inputTensor = + 
aclCreateTensor(GetInputShape(i).data(), GetInputNumDims(i), GetInputDataType(i), nullptr, 0, + GetInputFormat(i), GetInputShape(i).data(), GetInputNumDims(i), devInputs_[i]); + if (inputTensor == nullptr) { + ERROR_LOG("Create Tensor for input[%zu] failed", i); + return false; + } + inputTensor_.emplace_back(inputTensor); + } + + for (size_t i = 0; i < numOutputs_; ++i) { + auto size = GetOutputSize(i); + void *devMem = nullptr; + if (aclrtMalloc(&devMem, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } + devOutputs_.emplace_back(devMem); + outputBuffers_.emplace_back(aclCreateDataBuffer(devMem, size)); + + void *hostOutput = nullptr; + if (g_isDevice) { + if (aclrtMalloc(&hostOutput, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } + } else { + if (aclrtMallocHost(&hostOutput, size) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } + } + if (hostOutput == nullptr) { + ERROR_LOG("Malloc host memory for output[%zu] failed", i); + return false; + } + hostOutputs_.emplace_back(hostOutput); + + aclTensor *outputTensor = + aclCreateTensor(GetOutputShape(i).data(), GetOutputNumDims(i), GetOutputDataType(i), nullptr, 0, + GetOutputFormat(i), GetOutputShape(i).data(), GetOutputNumDims(i), devOutputs_[i]); + if (outputTensor == nullptr) { + ERROR_LOG("Create Tensor for output[%zu] failed", i); + return false; + } + outputTensor_.emplace_back(outputTensor); + } + + return true; +} + +const size_t OpRunner::NumInputs() +{ + return numInputs_; +} + +const size_t OpRunner::NumOutputs() +{ + return numOutputs_; +} + +const size_t OpRunner::GetInputSize(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. 
index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + + return aclGetTensorDescSize(opDesc_->inputDesc[index]); +} + +const size_t OpRunner::GetInputNumDims(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + + return aclGetTensorDescNumDims(opDesc_->inputDesc[index]); +} + +aclDataType OpRunner::GetInputDataType(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ACL_DT_UNDEFINED; + } + + return aclGetTensorDescType(opDesc_->inputDesc[index]); +} + +aclFormat OpRunner::GetInputFormat(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ACL_FORMAT_UNDEFINED; + } + + return aclGetTensorDescFormat(opDesc_->inputDesc[index]); +} + +std::vector OpRunner::GetInputShape(size_t index) const +{ + std::vector ret; + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ret; + } + + auto desc = opDesc_->inputDesc[index]; + for (size_t i = 0; i < aclGetTensorDescNumDims(desc); ++i) { + int64_t dimSize; + if (aclGetTensorDescDimV2(desc, i, &dimSize) != ACL_SUCCESS) { + ERROR_LOG("get dims from tensor desc failed. dims index = %zu", i); + ret.clear(); + return ret; + } + ret.emplace_back(dimSize); + } + + return ret; +} + +size_t OpRunner::GetOutputSize(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + + return aclGetTensorDescSize(opDesc_->outputDesc[index]); +} + +const size_t OpRunner::GetOutputNumDims(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. 
index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + + return aclGetTensorDescNumDims(opDesc_->outputDesc[index]); +} + +aclDataType OpRunner::GetOutputDataType(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ACL_DT_UNDEFINED; + } + + return aclGetTensorDescType(opDesc_->outputDesc[index]); +} + +aclFormat OpRunner::GetOutputFormat(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ACL_FORMAT_UNDEFINED; + } + + return aclGetTensorDescFormat(opDesc_->outputDesc[index]); +} + +std::vector OpRunner::GetOutputShape(size_t index) const +{ + std::vector ret; + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ret; + } + + auto desc = opDesc_->outputDesc[index]; + for (size_t i = 0; i < aclGetTensorDescNumDims(desc); ++i) { + int64_t dimSize; + if (aclGetTensorDescDimV2(desc, i, &dimSize) != ACL_SUCCESS) { + ERROR_LOG("get dims from tensor desc failed. dims index = %zu", i); + ret.clear(); + return ret; + } + ret.emplace_back(dimSize); + } + return ret; +} + +size_t OpRunner::GetInputElementCount(size_t index) const +{ + if (index >= opDesc_->inputDesc.size()) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + + return aclGetTensorDescElementCount(opDesc_->inputDesc[index]); +} + +size_t OpRunner::GetOutputElementCount(size_t index) const +{ + if (index >= opDesc_->outputDesc.size()) { + ERROR_LOG("index out of range. 
index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + + return aclGetTensorDescElementCount(opDesc_->outputDesc[index]); +} + +bool OpRunner::RunOp(int64_t caseId) +{ + caseId_ = caseId; + for (size_t i = 0; i < numInputs_; ++i) { + auto size = GetInputSize(i); + aclrtMemcpyKind kind = ACL_MEMCPY_HOST_TO_DEVICE; + if (g_isDevice) { + kind = ACL_MEMCPY_DEVICE_TO_DEVICE; + } + if (aclrtMemcpy(devInputs_[i], size, hostInputs_[i], size, kind) != ACL_SUCCESS) { + ERROR_LOG("Copy input[%zu] failed", i); + return false; + } + INFO_LOG("Copy input[%zu] success", i); + } + + aclrtStream stream = nullptr; + if (aclrtCreateStream(&stream) != ACL_SUCCESS) { + ERROR_LOG("Create stream failed"); + return false; + } + INFO_LOG("Create stream success"); + + size_t workspaceSize = 0; + aclOpExecutor *handle = nullptr; + auto ret = aclnnAddCustomGetWorkspaceSize(inputTensor_[0], inputTensor_[1], caseId, outputTensor_[0], &workspaceSize, &handle); + if (ret != ACL_SUCCESS) { + (void)aclrtDestroyStream(stream); + ERROR_LOG("Get Operator Workspace failed. error code is %d", static_cast(ret)); + return false; + } + INFO_LOG("Execute aclnnAddCustomGetWorkspaceSize success, workspace size %lu", workspaceSize); + + if (workspaceSize != 0) { + if (aclrtMalloc(&workspace_, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory failed"); + } + } + + ret = aclnnAddCustom(workspace_, workspaceSize, handle, stream); + if (ret != ACL_SUCCESS) { + (void)aclrtDestroyStream(stream); + ERROR_LOG("Execute Operator failed. error code is %d", static_cast(ret)); + return false; + } + INFO_LOG("Execute aclnnAddCustom success"); + + // The unit of 5000 is ms. + ret = aclrtSynchronizeStreamWithTimeout(stream, 5000); + if (ret != SUCCESS) { + ERROR_LOG("Synchronize stream failed. 
error code is %d", static_cast(ret)); + (void)aclrtDestroyStream(stream); + return false; + } + INFO_LOG("Synchronize stream success"); + + for (size_t i = 0; i < numOutputs_; ++i) { + auto size = GetOutputSize(i); + aclrtMemcpyKind kind = ACL_MEMCPY_DEVICE_TO_HOST; + if (g_isDevice) { + kind = ACL_MEMCPY_DEVICE_TO_DEVICE; + } + if (aclrtMemcpy(hostOutputs_[i], size, devOutputs_[i], size, kind) != ACL_SUCCESS) { + INFO_LOG("Copy output[%zu] success", i); + (void)aclrtDestroyStream(stream); + return false; + } + INFO_LOG("Copy output[%zu] success", i); + } + + (void)aclrtDestroyStream(stream); + return true; +} + +int64_t OpRunner::GetCaseId() +{ + return caseId_; +} + +template void DoPrintData(const T *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << data[i]; + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void DoPrintFp16Data(const aclFloat16 *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << std::setprecision(4) << aclFloat16ToFloat(data[i]); + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void PrintData(const void *data, size_t count, aclDataType dataType, size_t elementsPerRow) +{ + if (data == nullptr) { + ERROR_LOG("Print data failed. 
data is nullptr"); + return; + } + + switch (dataType) { + case ACL_BOOL: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT8: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT8: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT16: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT16: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT32: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT32: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT64: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT64: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_FLOAT16: + DoPrintFp16Data(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_FLOAT: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_DOUBLE: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + default: + ERROR_LOG("Unsupported type: %d", dataType); + } +} + +void OpRunner::PrintInput(size_t index, size_t numElementsPerRow) +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numInputs_); + return; + } + + auto desc = opDesc_->inputDesc[index]; + PrintData(hostInputs_[index], GetInputElementCount(index), aclGetTensorDescType(desc), numElementsPerRow); +} + +void OpRunner::PrintOutput(size_t index, size_t numElementsPerRow) +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. 
index = %zu, numOutputs = %zu", index, numOutputs_); + return; + } + + auto desc = opDesc_->outputDesc[index]; + PrintData(hostOutputs_[index], GetOutputElementCount(index), aclGetTensorDescType(desc), numElementsPerRow); +} diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/operator_desc.cpp b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/operator_desc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..90e0ac343405dfabff6c53b178190403fc8287d5 --- /dev/null +++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/operator_desc.cpp @@ -0,0 +1,51 @@ +/** + * @file operator_desc.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "operator_desc.h" + +#include "common.h" + +using namespace std; + +OperatorDesc::OperatorDesc() {} + +OperatorDesc::~OperatorDesc() +{ + for (auto *desc : inputDesc) { + aclDestroyTensorDesc(desc); + } + + for (auto *desc : outputDesc) { + aclDestroyTensorDesc(desc); + } +} + +OperatorDesc &OperatorDesc::AddInputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format) +{ + aclTensorDesc *desc = aclCreateTensorDesc(dataType, numDims, dims, format); + if (desc == nullptr) { + ERROR_LOG("create tensor failed"); + return *this; + } + inputDesc.emplace_back(desc); + return *this; +} + +OperatorDesc &OperatorDesc::AddOutputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, + aclFormat format) +{ + aclTensorDesc *desc = aclCreateTensorDesc(dataType, numDims, dims, format); + if (desc == nullptr) { + ERROR_LOG("create tensor failed"); + return *this; + } + + outputDesc.emplace_back(desc); + return *this; +} diff --git 
a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom.json b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom.json new file mode 100644 index 0000000000000000000000000000000000000000..b76e8928f31971ccc84e27c013fd15e2e7ffc98f --- /dev/null +++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom.json @@ -0,0 +1,47 @@ +[ + { + "op": "AddCustom", + "language": "cpp", + "input_desc": [ + { + "name": "x", + "param_type": "required", + "format": [ + "ND" + ], + "type": [ + "float" + ] + }, + { + "name": "y", + "param_type": "required", + "format": [ + "ND" + ], + "type": [ + "float" + ] + } + ], + "output_desc": [ + { + "name": "z", + "param_type": "required", + "format": [ + "ND" + ], + "type": [ + "float" + ] + } + ], + "attr": [ + { + "name": "case_id", + "type": "int", + "value": 1 + } + ] + } +] \ No newline at end of file diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_host/add_custom.cpp b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_host/add_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b9cb652e0ea32523bbe96de6050110d3a5698c59 --- /dev/null +++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_host/add_custom.cpp @@ -0,0 +1,49 @@ +/** + * @file add_custom.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#include "../op_kernel/add_custom_tiling.h" +#include "register/op_def_registry.h" + +namespace optiling { +static ge::graphStatus TilingFunc(gert::TilingContext *context) +{ + constexpr uint32_t BLOCK_DIM = 40; + context->SetBlockDim(BLOCK_DIM); + + // set tiling_key + auto attrs = context->GetAttrs(); + const int64_t *caseId = attrs->GetInt(0); + context->SetTilingKey(*caseId); + + AddCustomTilingData *tiling = context->GetTilingData(); + // x shape is [5120, 5120], y shape is [5120, 15360], so we set outer loop to 3 + tiling->loopOuter = 3U; + + // set workspace size + size_t *currentWorkspace = context->GetWorkspaceSizes(1); + currentWorkspace[0] = 0; + + return ge::GRAPH_SUCCESS; +} +} // namespace optiling + +namespace ops { +class AddCustom : public OpDef { +public: + explicit AddCustom(const char *name) : OpDef(name) + { + this->Input("x").ParamType(REQUIRED).DataType({ge::DT_FLOAT}).Format({ge::FORMAT_ND}); + this->Input("y").ParamType(REQUIRED).DataType({ge::DT_FLOAT}).Format({ge::FORMAT_ND}); + this->Output("z").ParamType(REQUIRED).DataType({ge::DT_FLOAT}).Format({ge::FORMAT_ND}); + this->AICore().SetTiling(optiling::TilingFunc).AddConfig("ascend910b"); + this->Attr("case_id").Int(1); + } +}; +OP_ADD(AddCustom); +} // namespace ops diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom.cpp b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..895e6444fd9256003965765b27d655415a20d7f1 --- /dev/null +++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom.cpp @@ -0,0 +1,28 @@ +/** + * @file add_custom.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#include "kernel_operator.h" +#include "add_custom_v1.h" +#include "add_custom_v2.h" + +extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling) +{ + REGISTER_TILING_DEFAULT(AddCustomTilingData); + GET_TILING_DATA(tilingData, tiling); + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); + if (TILING_KEY_IS(1UL)) { + KernelAddV1 op; + op.Init(x, y, z, &tilingData); + op.Process(); + } else if (TILING_KEY_IS(2UL)) { + KernelAddV2 op; + op.Init(x, y, z, &tilingData); + op.Process(); + } +} diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom_tiling.h b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom_tiling.h new file mode 100644 index 0000000000000000000000000000000000000000..d865aba897d9775e1adc79843ba903cde3ed8f55 --- /dev/null +++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom_tiling.h @@ -0,0 +1,18 @@ +/** + * @file add_custom_tiling.h + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef ADD_CUSTOM_TILING_H +#define ADD_CUSTOM_TILING_H +#include + +class AddCustomTilingData { +public: + uint32_t loopOuter; +}; +#endif // ADD_CUSTOM_TILING_H diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h new file mode 100644 index 0000000000000000000000000000000000000000..086bca4f0e5ca4baa66ffe9d8953f5273b962de8 --- /dev/null +++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h @@ -0,0 +1,102 @@ +/** + * @file add_custom_v1.h + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. 
All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef ADD_CUSTOM_V1_H +#define ADD_CUSTOM_V1_H +#include "kernel_operator.h" +#include "add_custom_tiling.h" + +using AscendC::TPosition; +class KernelAddV1 { +public: + __aicore__ inline KernelAddV1() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData *tilingPtr) + { + tiling = tilingPtr; + xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * TILE_N); + yGm.SetGlobalBuffer((__gm__ float *)y + AscendC::GetBlockIdx() * TILE_N); + zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * TILE_N); + pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); + } + __aicore__ inline void Process() + { + for (uint32_t i = 0; i < tiling->loopOuter; i++) { + for (uint32_t j = 0; j < M_A / TILE_M; j++) { + CopyIn(i, j); + Compute(); + CopyOut(i, j); + } + } + } + +private: + __aicore__ inline void CopyIn(uint32_t progressOuter, uint32_t progressInner) + { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); + AscendC::DataCopyParams paramsX; + paramsX.blockCount = TILE_M; + paramsX.blockLen = TILE_N * sizeof(float) / BLOCK_SIZE; + paramsX.srcStride = (N_A - TILE_N) * sizeof(float) / BLOCK_SIZE; + paramsX.dstStride = 0; + AscendC::DataCopy(xLocal, xGm[progressInner * TILE_M * N_A], paramsX); + + AscendC::DataCopyParams paramsY; + paramsY.blockCount = TILE_M; + paramsY.blockLen = TILE_N * sizeof(float) / BLOCK_SIZE; + paramsY.srcStride = (N_B - TILE_N) * sizeof(float) / BLOCK_SIZE; + paramsY.dstStride = 0; + AscendC::DataCopy(yLocal, yGm[progressOuter * N_A + progressInner * TILE_M * 
N_B], paramsY); + inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute() + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = inQueueY.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + AscendC::Add(zLocal, xLocal, yLocal, TILE_M * TILE_N); + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void CopyOut(int32_t progressOuter, int32_t progressInner) + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + AscendC::DataCopyParams paramsZ; + paramsZ.blockCount = TILE_M; + paramsZ.blockLen = TILE_N * sizeof(float) / BLOCK_SIZE; + paramsZ.srcStride = 0; + paramsZ.dstStride = (N_B - TILE_N) * sizeof(float) / BLOCK_SIZE; + AscendC::DataCopy(zGm[progressOuter * N_A + progressInner * TILE_M * N_B], zLocal, paramsZ); + outQueueZ.FreeTensor(zLocal); + } + +private: + static constexpr int32_t BUFFER_NUM = 2; + static constexpr int32_t BLOCK_SIZE = 32; + static constexpr uint32_t M_A = 5120U; + static constexpr uint32_t N_A = M_A; + static constexpr uint32_t M_B = M_A; + static constexpr uint32_t N_B = N_A * 3U; + static constexpr uint32_t TILE_M = 64U; + static constexpr uint32_t TILE_N = 128U; + + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue inQueueY; + AscendC::TQue outQueueZ; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; + AddCustomTilingData *tiling; +}; +#endif // ADD_CUSTOM_V1_H \ No newline at end of file diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h new file mode 100644 index 0000000000000000000000000000000000000000..1f790e84d3ffd56fcb110e3a2080b2291e6b3bd1 --- /dev/null +++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h @@ -0,0 +1,105 @@ +/** + * @file add_custom_v2.h + * + * 
Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef ADD_CUSTOM_V2_H +#define ADD_CUSTOM_V2_H +#include "kernel_operator.h" +#include "add_custom_tiling.h" + +using AscendC::TPosition; +class KernelAddV2 { +public: + __aicore__ inline KernelAddV2() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData *tilingPtr) + { + tiling = tilingPtr; + xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * TILE_N); + yGm.SetGlobalBuffer((__gm__ float *)y + AscendC::GetBlockIdx() * TILE_N); + zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * TILE_N); + // disable the l2 cache mode of y and z + yGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE); + zGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE); + pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); + } + __aicore__ inline void Process() + { + for (uint32_t i = 0; i < tiling->loopOuter; i++) { + for (uint32_t j = 0; j < M_A / TILE_M; j++) { + CopyIn(i, j); + Compute(); + CopyOut(i, j); + } + } + } + +private: + __aicore__ inline void CopyIn(uint32_t progressOuter, uint32_t progressInner) + { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); + AscendC::DataCopyParams paramsX; + paramsX.blockCount = TILE_M; + paramsX.blockLen = TILE_N * sizeof(float) / BLOCK_SIZE; + paramsX.srcStride = (N_A - TILE_N) * sizeof(float) / BLOCK_SIZE; + paramsX.dstStride = 0; + AscendC::DataCopy(xLocal, xGm[progressInner * TILE_M * N_A], paramsX); + + AscendC::DataCopyParams paramsY; + paramsY.blockCount = TILE_M; + 
paramsY.blockLen = TILE_N * sizeof(float) / BLOCK_SIZE; + paramsY.srcStride = (N_B - TILE_N) * sizeof(float) / BLOCK_SIZE; + paramsY.dstStride = 0; + AscendC::DataCopy(yLocal, yGm[progressOuter * N_A + progressInner * TILE_M * N_B], paramsY); + inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute() + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = inQueueY.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + AscendC::Add(zLocal, xLocal, yLocal, TILE_M * TILE_N); + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void CopyOut(int32_t progressOuter, int32_t progressInner) + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + AscendC::DataCopyParams paramsZ; + paramsZ.blockCount = TILE_M; + paramsZ.blockLen = TILE_N * sizeof(float) / BLOCK_SIZE; + paramsZ.srcStride = 0; + paramsZ.dstStride = (N_B - TILE_N) * sizeof(float) / BLOCK_SIZE; + AscendC::DataCopy(zGm[progressOuter * N_A + progressInner * TILE_M * N_B], zLocal, paramsZ); + outQueueZ.FreeTensor(zLocal); + } + +private: + static constexpr int32_t BUFFER_NUM = 2; + static constexpr int32_t BLOCK_SIZE = 32; + static constexpr uint32_t M_A = 5120U; + static constexpr uint32_t N_A = M_A; + static constexpr uint32_t M_B = M_A; + static constexpr uint32_t N_B = N_A * 3U; + static constexpr uint32_t TILE_M = 64U; + static constexpr uint32_t TILE_N = 128U; + + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue inQueueY; + AscendC::TQue outQueueZ; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; + AddCustomTilingData *tiling; +}; +#endif // ADD_CUSTOM_V2_H \ No newline at end of file diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/README.md b/operator/ascendc/4_best_practices/12_l2_cache_bypass/README.md new file mode 100644 index 
0000000000000000000000000000000000000000..22f239d00875855e803f5bc6f8714366d25969b7 --- /dev/null +++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/README.md @@ -0,0 +1,162 @@ +## 概述 + +本样例基于AddCustom算子工程,介绍了设置L2 CacheMode的方法以及其影响场景。 + +## 目录结构介绍 + +``` +├── l2_cache_bypass // L2 CacheMode样例工程目录 +│ ├── AclNNInvocation // 通过单算子API调用的方式调用AddCustom算子 +│ ├── AddCustom // AddCustom算子工程 +│ ├── AddCustom.json // AddCustom算子的原型定义json文件 +│ └── install.sh // 脚本,调用msOpGen生成自定义算子工程,并编译 +``` + +## 算子描述 + +Add算子实现了两个Shape不相同的Tensor相加,返回相加结果的功能。对应的数学表达式为: + +``` +z = x + y +``` + +本样例主要介绍数据搬运中设置合理CacheMode对搬运效率的影响,在Global Memory的数据访问中,如果数据只需要访问一次,后续不需要重复读取,那么这种场景下可以设置Global Memory的CacheMode为CACHE_MODE_DISABLED,在这种模式下数据访问将不经过L2 Cache,避免影响需要重复访问的数据,从而提升数据访问效率。 + +本样例中共有2个实现版本: +add_custom_v1.h:基础版本,从列方向切分,每个核计算5120×128的数据量,共有40个核参与计算。 +add_custom_v2.h:在add_custom_v1基础上,设置y/z的CacheMode为CACHE_MODE_DISABLED,避免替换已进入Cache的x数据,影响搬运效率。 + +## 算子规格描述 + + + + + + + + + + + + +
算子类型(OpType)Add
算子输入nameshapedata typeformat
x5120 * 5120floatND
y5120 * 15360floatND
算子输出z5120 * 15360floatND
核函数名add_custom
+ +## 支持的产品型号 + +本样例支持如下产品型号: + +- Atlas A2训练系列产品/Atlas 800I A2推理产品 + +## 算子工程介绍 + +其中,算子工程目录AddCustom包含算子的实现文件,如下所示: + +``` +├── AddCustom // AddCustom自定义算子工程 +│ ├── op_host // host侧实现文件 +│ └── op_kernel // kernel侧实现文件 +``` + +CANN软件包中提供了工程创建工具msOpGen,AddCustom算子工程可通过AddCustom.json自动创建,自定义算子工程具体请参考[Ascend C算子开发](https://hiascend.com/document/redirect/CannCommunityOpdevAscendC)>工程化算子开发>创建算子工程 章节。 + +创建完自定义算子工程后,开发者重点需要完成算子host和kernel文件的功能开发。为简化样例运行流程,本样例已在AddCustom目录中准备好了必要的算子实现,install.sh脚本会创建一个CustomOp目录,并将算子实现文件复制到对应目录下,再编译算子。 + +备注:CustomOp目录为生成目录,每次执行install.sh脚本都会删除该目录并重新生成,切勿在该目录下编码算子,会存在丢失风险。 + +## 编译运行样例算子 + +针对自定义算子工程,编译运行包含如下步骤: + +- 调用msOpGen工具生成自定义算子工程; +- 完成算子host和kernel实现; +- 编译自定义算子工程生成自定义算子包; +- 安装自定义算子包到自定义算子库中; +- 调用执行自定义算子; + +详细操作如下所示。 + +### 1. 获取源码包 + +编译运行此样例前,请参考[准备:获取样例代码](../README.md#codeready)获取源码包。 + +### 2. 生成自定义算子工程,复制host和kernel实现并编译算子 + +- 切换到msOpGen脚本install.sh所在目录 + + ```bash + # 若开发者以git命令行方式clone了master分支代码,并切换目录 + cd ${git_clone_path}/samples/operator/ascendc/4_best_practices/12_l2_cache_bypass/ + ``` +- 调用脚本,生成自定义算子工程,复制host和kernel实现并编译算子 + + - 方式一:配置环境变量运行脚本 + 请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量命令。 + - 默认路径,root用户安装CANN软件包 + + ```bash + export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + ``` + - 默认路径,非root用户安装CANN软件包 + + ```bash + export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + ``` + - 指定路径install_path,安装CANN软件包 + + ```bash + export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest + ``` + + 运行install.sh脚本 + + ```bash + bash install.sh -v [SOC_VERSION] + ``` + - 方式二:指定命令行安装路径来运行脚本 + ```bash + bash install.sh -v [SOC_VERSION] -i [ASCEND_INSTALL_PATH] + ``` + + 参数说明: + + - SOC_VERSION:昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2训练系列产品/Atlas 800I A2推理产品 + - ASCEND_INSTALL_PATH:CANN软件包安装路径 + + 
脚本运行成功后,会在当前目录下创建CustomOp目录,编译完成后,会在CustomOp/build_out中,生成自定义算子安装包custom_opp_\_\.run,例如“custom_opp_ubuntu_x86_64.run”。 + +### 3. 部署自定义算子包 + +- 部署自定义算子包前,请确保存在自定义算子包默认部署路径环境变量ASCEND_OPP_PATH + + ```bash + echo $ASCEND_OPP_PATH + # 输出示例 /usr/local/Ascend/ascend-toolkit/latest/opp + + # 若没有,则需导出CANN环境变量 + source [ASCEND_INSTALL_PATH]/bin/setenv.bash + # 例如 source /usr/local/Ascend/ascend-toolkit/latest/bin/setenv.bash + ``` + + 参数说明: + + - ASCEND_INSTALL_PATH:CANN软件包安装路径,一般和上一步中指定的路径保持一致 +- 在自定义算子安装包所在路径下,执行如下命令安装自定义算子包 + + ```bash + cd CustomOp/build_out + ./custom_opp__.run + ``` + + 命令执行成功后,自定义算子包中的相关文件将部署至opp算子库环境变量ASCEND_OPP_PATH指向的的vendors/customize目录中。 + +### 4. 调用执行算子工程 + +- [单算子API调用AddCustom算子工程](./AclNNInvocation/README.md) + +## 更新说明 + + +| 时间 | 更新事项 | +| ---------- | -------- | +| 2025/07/14 | 新增样例 | diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/install.sh b/operator/ascendc/4_best_practices/12_l2_cache_bypass/install.sh new file mode 100755 index 0000000000000000000000000000000000000000..09c8bf0aadb22bccd2340b5c7b83007bd7d241d0 --- /dev/null +++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/install.sh @@ -0,0 +1,59 @@ +#!/bin/bash +SHORT=v:,i:, +LONG=soc-version:,install-path:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" + +while :; do + case "$1" in + -v | --soc-version) + SOC_VERSION="$2" + shift 2 + ;; + -i | --install-path) + ASCEND_INSTALL_PATH="$2" + shift 2 + ;; + --) + shift + break + ;; + *) + echo "[ERROR] Unexpected option: $1" + break + ;; + esac +done + +# only support Ascend910B2 since different soc version have different cache size +VERSION_LIST="Ascend910B2" +if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then + echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]" + exit -1 +fi + +if [ -n "$ASCEND_INSTALL_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH +elif [ -n "$ASCEND_HOME_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH +else + if [ -d 
"$HOME/Ascend/ascend-toolkit/latest" ]; then + _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + else + _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + fi +fi +source $_ASCEND_INSTALL_PATH/bin/setenv.bash +export ASCEND_HOME_PATH=$_ASCEND_INSTALL_PATH + +OP_NAME=AddCustom +# Generate the op framework +rm -rf CustomOp && msopgen gen -i $OP_NAME.json -c ai_core-${SOC_VERSION} -lan cpp -out CustomOp +# Copy op implementation files to CustomOp +rm -rf CustomOp/op_host/*.cpp +rm -rf CustomOp/op_kernel/*.h && rm -rf CustomOp/op_kernel/*.cpp +cp -rf $OP_NAME/op_kernel CustomOp/ +cp -rf $OP_NAME/op_host CustomOp/ + +# Build CustomOp project +(cd CustomOp && bash build.sh) \ No newline at end of file diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md b/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md index 1ebba2146be3fdca307d6766fce43003eac15e9a..bd20372abb09ef2fd3b24d7b6b688a3733d9fa6c 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md @@ -21,10 +21,11 @@ z = x + 2.0 ``` 本样例主要介绍数据搬运中的同地址冲突对搬运效率的影响,在Global Memory的数据访问中,数据访问请求(读/写)在AI 处理器内部会按照512 Bytes对齐进行地址转换,同一时刻如果多核的数据访问请求在转换后落在连续的512 Bytes范围内,出于数据一致性的要求,AI 处理器会对落入同一个512Bytes范围内的请求进行串行处理,导致搬运效率降低,即发生了同地址访问现象。 -本样例中共有3个实现版本: -adds_custom_v1.h:基础实现版本,每个核的计算顺序一致,存在同地址冲突,带宽效率较差 -adds_custom_v2.h:通过调整每个核的计算顺序,避免发生同地址冲突 -adds_custom_v3.h:通过调整切分顺序,避免发生同地址冲突 + +本样例中共有3个实现版本: +adds_custom_v1.h:基础实现版本,每个核的计算顺序一致,存在同地址冲突,带宽效率较差。 +adds_custom_v2.h:通过调整每个核的计算顺序,避免发生同地址冲突。 +adds_custom_v3.h:通过调整切分顺序,避免发生同地址冲突。 当前算子执行机制保证用户kernel入参(包括workspace/tiling)的地址是512 Bytes对齐的,因此用户只需要根据地址的偏移量即可判断两个地址是否会落入连续的512 Bytes范围内。 diff --git a/operator/ascendc/4_best_practices/README.md b/operator/ascendc/4_best_practices/README.md index c40fe61a7e48c83d552db737f1cd0bfdb573a880..926e4a6ef365a8036c64b460faf92ed9b581d7a0 100644 --- a/operator/ascendc/4_best_practices/README.md +++ 
b/operator/ascendc/4_best_practices/README.md @@ -8,6 +8,7 @@ | ------------------------------- | ------------------------------------------ | ------------------------------------------ | | [4_bank_conflict](./4_bank_conflict) | 基于Ascend C的bank冲突性能优化样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 | | [6_group_matmul](./6_group_matmul) | 基于Ascend C的group matmul算子性能优化样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 | +| [12_l2_cache_bypass](./12_l2_cache_bypass) | 基于Ascend C的L2 CacheMode算子性能优化样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 | | [15_mata_address_conflict](./15_mata_address_conflict) | 基于Ascend C的同地址冲突性能优化样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 | | [21_all_gather_matmul_custom](./21_all_gather_matmul_custom) | 基于Ascend C的AllGatherMatmul算子性能调优样例 | Atlas A2训练系列产品 | | [22_matmul_reduce_scatter_custom](./22_matmul_reduce_scatter_custom) | 基于Ascend C的MatmulReduceScatter算子性能调优样例 | Atlas A2训练系列产品 | @@ -45,6 +46,8 @@ ## 更新说明 | 时间 | 更新事项 | | ---------- | -------------------------------------------- | +| 2025/07/14 | 新增12_l2_cache_bypass样例 | +| 2025/07/03 | 新增15_mata_address_conflict样例 | | 2025/07/01 | 新增4_bank_conflict样例 | | 2024/12/19 | 新增23_matmul_all_reduce_custom样例 | | 2024/12/19 | 新增22_matmul_reduce_scatter_custom样例 |