From b7c98567b3ab7c59a1fef80b0ff5935132d93454 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Mon, 14 Jul 2025 14:13:32 +0800 Subject: [PATCH 01/19] add l2 cache bypass cases --- .../l2_cache_bypass/AclNNInvocation/README.md | 75 +++ .../AclNNInvocation/inc/common.h | 45 ++ .../AclNNInvocation/inc/op_runner.h | 188 +++++++ .../AclNNInvocation/inc/operator_desc.h | 57 +++ .../AclNNInvocation/input/.keep | 0 .../l2_cache_bypass/AclNNInvocation/run.sh | 78 +++ .../AclNNInvocation/scripts/acl.json | 1 + .../AclNNInvocation/scripts/gen_data.py | 28 ++ .../AclNNInvocation/scripts/verify_result.py | 53 ++ .../AclNNInvocation/src/CMakeLists.txt | 65 +++ .../AclNNInvocation/src/common.cpp | 80 +++ .../AclNNInvocation/src/main.cpp | 169 +++++++ .../AclNNInvocation/src/op_runner.cpp | 462 ++++++++++++++++++ .../AclNNInvocation/src/operator_desc.cpp | 51 ++ .../l2_cache_bypass/AddCustom.json | 47 ++ .../AddCustom/op_host/add_custom.cpp | 61 +++ .../AddCustom/op_kernel/add_custom.cpp | 27 + .../AddCustom/op_kernel/add_custom_tiling.h | 24 + .../AddCustom/op_kernel/add_custom_v1.h | 109 +++++ .../l2_cache_bypass/README.md | 161 ++++++ .../l2_cache_bypass/install.sh | 59 +++ 21 files changed, 1840 insertions(+) create mode 100644 operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/README.md create mode 100644 operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/inc/common.h create mode 100644 operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/inc/op_runner.h create mode 100644 operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/inc/operator_desc.h create mode 100644 operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/input/.keep create mode 100755 operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/run.sh create mode 100644 
operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/scripts/acl.json create mode 100644 operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/scripts/gen_data.py create mode 100644 operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/scripts/verify_result.py create mode 100644 operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/src/CMakeLists.txt create mode 100644 operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/src/common.cpp create mode 100644 operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/src/main.cpp create mode 100644 operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/src/op_runner.cpp create mode 100644 operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/src/operator_desc.cpp create mode 100644 operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom.json create mode 100644 operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_host/add_custom.cpp create mode 100644 operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom.cpp create mode 100644 operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_tiling.h create mode 100644 operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h create mode 100644 operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/README.md create mode 100755 operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/install.sh diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/README.md 
b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/README.md new file mode 100644 index 000000000..5c1ffb4d2 --- /dev/null +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/README.md @@ -0,0 +1,75 @@ +## 目录结构介绍 + +``` +├── AclNNInvocation //通过单算子API调用的方式调用AddsCustom算子 +│ ├── inc // 头文件目录 +│ │ ├── common.h // 声明公共方法类,用于读取二进制文件 +│ │ ├── op_runner.h // 算子描述声明文件,包含算子输入/输出,算子类型以及输入描述与输出描述 +│ │ └── operator_desc.h // 算子运行相关信息声明文件,包含算子输入/输出个数,输入/输出大小等 +│ ├── input // 存放脚本生成的输入数据目录 +│ ├── scripts +│ │ ├── acl.json // acl配置文件 +│ │ ├── gen_data.py // 输入数据和真值数据生成脚本 +│ │ └── verify_result.py // 精度校验脚本 +│ ├── src +│ │ ├── CMakeLists.txt // 编译规则文件 +│ │ ├── common.cpp // 公共函数,读取二进制文件函数的实现文件 +│ │ ├── main.cpp // 单算子调用应用的入口 +│ │ ├── op_runner.cpp // 单算子调用主体流程实现文件 +│ │ └── operator_desc.cpp // 构造算子的输入与输出描述 +│ └── run.sh // 执行命令脚本 +``` + +## 代码实现介绍 + +完成自定义算子的开发部署后,可以通过单算子调用的方式来验证单算子的功能。src/main.cpp代码为单算子API执行方式。单算子API执行是基于C语言的API执行算子,无需提供单算子描述文件进行离线模型的转换,直接调用单算子API接口。 + +自定义算子编译部署后,会自动生成单算子API,可以直接在应用程序中调用。算子API的形式一般定义为“两段式接口”,形如: + +```cpp + // 获取算子使用的workspace空间大小 + aclnnStatus aclnnAddsCustomGetWorkspaceSize( + const aclTensor *x, + int64_t caseId, + const aclTensor *out, + uint64_t *workspaceSize, + aclOpExecutor **executor); + // 执行算子 + aclnnStatus aclnnAddsCustom( + void *workspace, + uint64_t workspaceSize, + aclOpExecutor *executor, + aclrtStream stream); +``` + +其中aclnnAddsCustomGetWorkspaceSize为第一段接口,主要用于计算本次API调用计算过程中需要多少的workspace内存。获取到本次API计算需要的workspace大小之后,按照workspaceSize大小申请Device侧内存,然后调用第二段接口aclnnAddsCustom执行计算。具体参考[单算子API调用](https://hiascend.com/document/redirect/CannCommunityAscendCInVorkSingleOp)章节。 + +## 运行样例算子 + +### 1. 编译算子工程 + +运行此样例前,请参考[编译算子工程](../README.md#operatorcompile)完成前期准备。 + +### 2. 
单算子API调用样例运行 + +- 进入到样例目录 + + 以命令行方式下载样例代码，master分支为例。 + + ```bash + cd ${git_clone_path}/samples/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation + ``` +- 样例执行 + + 样例执行过程中会自动生成测试数据，然后编译与运行单算子API调用样例，最后检验运行结果。具体过程可参见run.sh脚本。 + + ```bash + bash run.sh + ``` + +## 更新说明 + + +| 时间 | 更新事项 | +| ---------- | ------------ | +| 2025/07/03 | 新增本readme | diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/inc/common.h b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/inc/common.h new file mode 100644 index 000000000..fadb5c808 --- /dev/null +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/inc/common.h @@ -0,0 +1,45 @@ +/** + * @file common.h + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef COMMON_H +#define COMMON_H + +#include +#include +#include +#include +#include + +#include "acl/acl.h" + +#define SUCCESS 0 +#define FAILED 1 + +#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) +#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) +#define ERROR_LOG(fmt, args...) 
fprintf(stderr, "[ERROR] " fmt "\n", ##args) + +/** + * @brief Read data from file + * @param [in] filePath: file path + * @param [out] fileSize: file size + * @return read result + */ +bool ReadFile(const std::string &filePath, size_t fileSize, void *buffer, size_t bufferSize); + +/** + * @brief Write data to file + * @param [in] filePath: file path + * @param [in] buffer: data to write to file + * @param [in] size: size to write + * @return write result + */ +bool WriteFile(const std::string &filePath, const void *buffer, size_t size); + +#endif // COMMON_H diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/inc/op_runner.h b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/inc/op_runner.h new file mode 100644 index 000000000..7b98d5730 --- /dev/null +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/inc/op_runner.h @@ -0,0 +1,188 @@ +/** + * @file op_runner.h + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#ifndef OP_RUNNER_H +#define OP_RUNNER_H + +#include "acl/acl.h" +#include "aclnn/acl_meta.h" +#include "common.h" +#include "operator_desc.h" + +/** + * Op Runner + */ +class OpRunner { +public: + /** + * @brief Constructor + * @param [in] opDesc: op description + */ + explicit OpRunner(OperatorDesc *opDesc); + + /** + * @brief Destructor + */ + virtual ~OpRunner(); + + /** + * @brief Init op runner + */ + bool Init(); + + /** + * @brief Get number of inputs + * @return number of inputs + */ + const size_t NumInputs(); + + /** + * @brief Get number of outputs + * @return number of outputs + */ + const size_t NumOutputs(); + + /** + * @brief Get input size by index + * @param [in] index: input index + * @return size of the input + */ + const size_t GetInputSize(size_t index) const; + const size_t GetInputNumDims(size_t index) const; + aclDataType GetInputDataType(size_t index) const; + aclFormat GetInputFormat(size_t index) const; + + /** + * @brief Get output size by index + * @param [in] index: output index + * @return size of the output + */ + size_t GetOutputSize(size_t index) const; + const size_t GetOutputNumDims(size_t index) const; + aclDataType GetOutputDataType(size_t index) const; + aclFormat GetOutputFormat(size_t index) const; + + /** + * @brief Get input element count by index + * @param i[in] ndex: input index + * @return element count of the input + */ + size_t GetInputElementCount(size_t index) const; + + /** + * @brief Get output element count by index + * @param [in] index: output index + * @return element count of the output + */ + size_t GetOutputElementCount(size_t index) const; + + /** + * @brief Get input shape by index + * @param [in] index: input index + * @return shape of the output + */ + std::vector GetInputShape(size_t index) const; + + /** + * @brief Get output shape by index + * @param [in] index: output index + * @return shape of the output + */ + std::vector GetOutputShape(size_t index) const; + + /** + * @brief Get input 
buffer(host memory) by index + * @tparam T: data type + * @param [in] index: input index + * @return host address of the input + */ + template T *GetInputBuffer(size_t index) + { + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return nullptr; + } + return reinterpret_cast(hostInputs_[index]); + } + + /** + * @brief Get output buffer(host memory) by index + * @tparam T: data type + * @param [in] index: output index + * @return host address of the output + */ + template const T *GetOutputBuffer(size_t index) + { + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return nullptr; + } + + return reinterpret_cast(hostOutputs_[index]); + } + + /** + * @brief Print readable input by index + * @param [in] index: input index + * @param [in] elementsPerRow: number of elements per row + */ + void PrintInput(size_t index, size_t elementsPerRow = 16); + + /** + * @brief Print readable output by index + * @param [in] index: output index + * @param [in] elementsPerRow: number of elements per row + */ + void PrintOutput(size_t index, size_t elementsPerRow = 16); + + /** + * @brief Compile static op + * @return compile result + */ + bool CompileStaticOp(); + + /** + * @brief Compile dynamic op + * @return compile result + */ + bool CompileDynamicOp(); + + /** + * @brief Run op + * @return run result + */ + bool RunOp(int64_t caseId); + + /** + * @brief Get case index + * @return case index by user input + */ + int64_t GetCaseId(); + +private: + size_t numInputs_; + size_t numOutputs_; + void *workspace_; + int64_t caseId_; + + std::vector inputBuffers_; + std::vector outputBuffers_; + + std::vector devInputs_; + std::vector devOutputs_; + + std::vector hostInputs_; + std::vector hostOutputs_; + + std::vector inputTensor_; + std::vector outputTensor_; + OperatorDesc *opDesc_; +}; + +#endif // OP_RUNNER_H diff --git 
a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/inc/operator_desc.h b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/inc/operator_desc.h new file mode 100644 index 000000000..cf02d7cec --- /dev/null +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/inc/operator_desc.h @@ -0,0 +1,57 @@ +/** + * @file operator_desc.h + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef OPERATOR_DESC_H +#define OPERATOR_DESC_H + +#include +#include + +#include "acl/acl.h" + +/** + * Op description + */ +struct OperatorDesc { + /** + * Constructor + */ + explicit OperatorDesc(); + + /** + * Destructor + */ + virtual ~OperatorDesc(); + + /** + * Add an input tensor description + * @param [in] dataType: data type + * @param [in] numDims: number of dims + * @param [in] dims: dims + * @param [in] format: format + * @return OperatorDesc + */ + OperatorDesc &AddInputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format); + + /** + * Add an output tensor description + * @param [in] dataType: data type + * @param [in] numDims: number of dims + * @param [in] dims: dims + * @param [in] format: format + * @return OperatorDesc + */ + OperatorDesc &AddOutputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format); + + std::string opType; + std::vector inputDesc; + std::vector outputDesc; +}; + +#endif // OPERATOR_DESC_H diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/input/.keep b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/input/.keep new file mode 100644 index 
000000000..e69de29bb diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/run.sh b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/run.sh new file mode 100755 index 000000000..057249612 --- /dev/null +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/run.sh @@ -0,0 +1,78 @@ +#!/bin/bash +CURRENT_DIR=$( + cd $(dirname ${BASH_SOURCE:-$0}) + pwd +) + +if [ -n "$ASCEND_INSTALL_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH +elif [ -n "$ASCEND_HOME_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH +else + if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then + _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + else + _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + fi +fi +source $_ASCEND_INSTALL_PATH/bin/setenv.bash +export DDK_PATH=$_ASCEND_INSTALL_PATH +export NPU_HOST_LIB=$_ASCEND_INSTALL_PATH/$(arch)-$(uname -s | tr '[:upper:]' '[:lower:]')/devlib + +function main { + # 1. 清除遗留生成文件和日志文件 + rm -rf $HOME/ascend/log/* + rm -rf ./input && mkdir -p ./input + rm -rf ./output && mkdir -p ./output + + # 2. 生成输入数据和真值数据 + cd $CURRENT_DIR + python3 scripts/gen_data.py + if [ $? -ne 0 ]; then + echo "ERROR: generate input data failed!" + return 1 + fi + echo "INFO: generate input data success!" + + # 3. 编译可执行文件 + cd $CURRENT_DIR + rm -rf build + mkdir -p build + cd build + cmake ../src -DCMAKE_SKIP_RPATH=TRUE + if [ $? -ne 0 ]; then + echo "ERROR: cmake failed!" + return 1 + fi + echo "INFO: cmake success!" + make + if [ $? -ne 0 ]; then + echo "ERROR: make failed!" + return 1 + fi + echo "INFO: make success!" + + # 4. 运行可执行文件 + export LD_LIBRARY_PATH=$_ASCEND_INSTALL_PATH/opp/vendors/customize/op_api/lib:$LD_LIBRARY_PATH + export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH} + cd $CURRENT_DIR/output + echo "INFO: execute op!" + msprof op --launch-count=3 --output=./prof ./execute_add_op + if [ $? 
-ne 0 ]; then + echo "ERROR: acl executable run failed! please check your project!" + return 1 + fi + echo "INFO: acl executable run success!" + + # 5. 精度比对 + cd $CURRENT_DIR + python3 scripts/verify_result.py output/output_z_1.bin output/golden.bin + # python3 scripts/verify_result.py output/output_z_2.bin output/golden.bin + # python3 scripts/verify_result.py output/output_z_3.bin output/golden.bin + if [ $? -ne 0 ]; then + echo "ERROR: verify result failed!" + return 1 + fi +} + +main diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/scripts/acl.json b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/scripts/acl.json new file mode 100644 index 000000000..9e26dfeeb --- /dev/null +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/scripts/acl.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/scripts/gen_data.py b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/scripts/gen_data.py new file mode 100644 index 000000000..cc13250b7 --- /dev/null +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/scripts/gen_data.py @@ -0,0 +1,28 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+# =============================================================================== + +import numpy as np + + +def gen_golden_data_simple(): + row = 5120 + col = 5120 + input_x = np.random.uniform(1, 10, [row, col]).astype(np.float32) + input_y = np.random.uniform(1, 20, [row, col * 3]).astype(np.float32) + y_blocks = np.split(input_y, 3, axis=1) + result_blocks = [input_x + block for block in y_blocks] + golden = np.hstack(result_blocks) + input_x.tofile("./input/input_x.bin") + input_y.tofile("./input/input_y.bin") + golden.tofile("./output/golden.bin") + + +if __name__ == "__main__": + gen_golden_data_simple() diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/scripts/verify_result.py b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/scripts/verify_result.py new file mode 100644 index 000000000..a5019f30f --- /dev/null +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/scripts/verify_result.py @@ -0,0 +1,53 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+# =============================================================================== + +import sys +import numpy as np + +# for float32 +relative_tol = 1e-4 +absolute_tol = 1e-5 +error_tol = 1e-4 + + +def verify_result(output, golden): + output = np.fromfile(output, dtype=np.float32).reshape(-1) + golden = np.fromfile(golden, dtype=np.float32).reshape(-1) + different_element_results = np.isclose(output, + golden, + rtol=relative_tol, + atol=absolute_tol, + equal_nan=True) + different_element_indexes = np.where(different_element_results == False)[0] + for index in range(len(different_element_indexes)): + real_index = different_element_indexes[index] + golden_data = golden[real_index] + output_data = output[real_index] + print( + "data index: %06d, expected: %-.9f, actual: %-.9f, rdiff: %-.6f" % + (real_index, golden_data, output_data, + abs(output_data - golden_data) / golden_data)) + if index == 100: + break + error_ratio = float(different_element_indexes.size) / golden.size + print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol)) + return error_ratio <= error_tol + + +if __name__ == '__main__': + try: + res = verify_result(sys.argv[1], sys.argv[2]) + if not res: + raise ValueError("[ERROR] result error") + else: + print("test pass") + except Exception as e: + print(e) + sys.exit(1) diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/src/CMakeLists.txt b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/src/CMakeLists.txt new file mode 100644 index 000000000..32bed518d --- /dev/null +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/src/CMakeLists.txt @@ -0,0 +1,65 @@ +# Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. 
+ +# CMake lowest version requirement +cmake_minimum_required(VERSION 3.5.1) + +# project information +project(acl_execute_add) + +# Compile options +add_compile_options(-std=c++11) + +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "../output") +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "../output") + +set(INC_PATH $ENV{DDK_PATH}) + +if (NOT DEFINED ENV{DDK_PATH}) + set(INC_PATH "/usr/local/Ascend/ascend-toolkit/latest") + message(STATUS "set default INC_PATH: ${INC_PATH}") +else () + message(STATUS "env INC_PATH: ${INC_PATH}") +endif() + +set(CUST_PKG_PATH "${INC_PATH}/opp/vendors/customize/op_api") + +set(LIB_PATH $ENV{NPU_HOST_LIB}) + +# Dynamic libraries in the stub directory can only be used for compilation +if (NOT DEFINED ENV{NPU_HOST_LIB}) + string(TOLOWER "${CMAKE_SYSTEM_NAME}" SYSTEM_NAME_LOWER) + set(LIB_PATH "/usr/local/Ascend/ascend-toolkit/latest/${CMAKE_SYSTEM_PROCESSOR}-${SYSTEM_NAME_LOWER}/devlib") + message(STATUS "set default LIB_PATH: ${LIB_PATH}") +else () + message(STATUS "env LIB_PATH: ${LIB_PATH}") +endif() + +# Header path +include_directories( + ../inc + ${INC_PATH}/include + ${CUST_PKG_PATH}/include +) + +# add host lib path +link_directories( + ${LIB_PATH} + ${CUST_PKG_PATH}/lib +) + +add_executable(execute_add_op + operator_desc.cpp + op_runner.cpp + main.cpp + common.cpp +) + +target_link_libraries(execute_add_op + ascendcl + cust_opapi + acl_op_compiler + nnopbase + stdc++ +) + +install(TARGETS execute_add_op DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/src/common.cpp b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/src/common.cpp new file mode 100644 index 000000000..d58716122 --- /dev/null +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/src/common.cpp @@ -0,0 +1,80 @@ +/** + * @file common.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. 
All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "common.h" + +#include +#include +#include + +#include + +extern bool g_isDevice; + +bool ReadFile(const std::string &filePath, size_t fileSize, void *buffer, size_t bufferSize) +{ + struct stat sBuf; + int fileStatus = stat(filePath.data(), &sBuf); + if (fileStatus == -1) { + ERROR_LOG("failed to get file %s", filePath.c_str()); + return false; + } + if (S_ISREG(sBuf.st_mode) == 0) { + ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); + return false; + } + + std::ifstream file; + file.open(filePath, std::ios::binary); + if (!file.is_open()) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + std::filebuf *buf = file.rdbuf(); + size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); + if (size == 0) { + ERROR_LOG("file size is 0"); + file.close(); + return false; + } + if (size > bufferSize) { + ERROR_LOG("file size is larger than buffer size"); + file.close(); + return false; + } + buf->pubseekpos(0, std::ios::in); + buf->sgetn(static_cast(buffer), size); + fileSize = size; + file.close(); + return true; +} + +bool WriteFile(const std::string &filePath, const void *buffer, size_t size) +{ + if (buffer == nullptr) { + ERROR_LOG("Write file failed. buffer is nullptr"); + return false; + } + + int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); + if (fd < 0) { + ERROR_LOG("Open file failed. 
path = %s", filePath.c_str()); + return false; + } + + size_t writeSize = write(fd, buffer, size); + (void)close(fd); + if (writeSize != size) { + ERROR_LOG("Write file Failed."); + return false; + } + + return true; +} diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/src/main.cpp b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/src/main.cpp new file mode 100644 index 000000000..38fce310c --- /dev/null +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/src/main.cpp @@ -0,0 +1,169 @@ +/** + * @file main.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include +#include +#include +#include +#include +#include "acl/acl.h" +#include "common.h" +#include "op_runner.h" + +bool g_isDevice = false; +int deviceId = 0; + +OperatorDesc CreateOpDesc() +{ + // define operator + constexpr uint32_t ROW = 5120; + constexpr uint32_t COL = 5120; + std::vector shapeX{ROW, COL}; + std::vector shapeY{ROW, COL*3}; + std::vector shapeZ{ROW, COL*3}; + aclDataType dataType = ACL_FLOAT; + aclFormat format = ACL_FORMAT_ND; + OperatorDesc opDesc; + opDesc.AddInputTensorDesc(dataType, shapeX.size(), shapeX.data(), format); + opDesc.AddInputTensorDesc(dataType, shapeY.size(), shapeY.data(), format); + opDesc.AddOutputTensorDesc(dataType, shapeZ.size(), shapeZ.data(), format); + return opDesc; +} + +bool SetInputData(OpRunner &runner) +{ + size_t fileSize = 0; + ReadFile("../input/input_x.bin", fileSize, runner.GetInputBuffer(0), runner.GetInputSize(0)); + ReadFile("../input/input_y.bin", fileSize, runner.GetInputBuffer(1), runner.GetInputSize(1)); + INFO_LOG("Set input success"); + return true; +} + +bool 
ProcessOutputData(OpRunner &runner) +{ + int64_t caseId = runner.GetCaseId(); + WriteFile("../output/output_z_" + std::to_string(caseId) + ".bin", runner.GetOutputBuffer(0), + runner.GetOutputSize(0)); + INFO_LOG("Write output success"); + return true; +} + +void DestroyResource() +{ + bool flag = false; + if (aclrtResetDevice(deviceId) != ACL_SUCCESS) { + ERROR_LOG("Reset device %d failed", deviceId); + flag = true; + } + INFO_LOG("Reset Device success"); + if (aclFinalize() != ACL_SUCCESS) { + ERROR_LOG("Finalize acl failed"); + flag = true; + } + if (flag) { + ERROR_LOG("Destroy resource failed"); + } else { + INFO_LOG("Destroy resource success"); + } +} + +bool InitResource() +{ + std::string output = "../output"; + + // acl.json is dump or profiling config file + if (aclInit("../scripts/acl.json") != ACL_SUCCESS) { + ERROR_LOG("acl init failed"); + return false; + } + + if (aclrtSetDevice(deviceId) != ACL_SUCCESS) { + ERROR_LOG("Set device failed. deviceId is %d", deviceId); + (void)aclFinalize(); + return false; + } + INFO_LOG("Set device[%d] success", deviceId); + + // runMode is ACL_HOST which represents app is running in host + // runMode is ACL_DEVICE which represents app is running in device + aclrtRunMode runMode; + if (aclrtGetRunMode(&runMode) != ACL_SUCCESS) { + ERROR_LOG("Get run mode failed"); + DestroyResource(); + return false; + } + g_isDevice = (runMode == ACL_DEVICE); + INFO_LOG("Get RunMode[%d] success", runMode); + + return true; +} + +bool RunOp(int64_t caseId) +{ + // create op desc + OperatorDesc opDesc = CreateOpDesc(); + + // create Runner + OpRunner opRunner(&opDesc); + if (!opRunner.Init()) { + ERROR_LOG("Init OpRunner failed"); + return false; + } + + // Load inputs + if (!SetInputData(opRunner)) { + ERROR_LOG("Set input data failed"); + return false; + } + + // Run op + if (!opRunner.RunOp(caseId)) { + ERROR_LOG("Run op failed"); + return false; + } + + // process output data + if (!ProcessOutputData(opRunner)) { + 
ERROR_LOG("Process output data failed"); + return false; + } + + INFO_LOG("Run op success"); + return true; +} + +int main(int argc, char **argv) +{ + if (!InitResource()) { + ERROR_LOG("Init resource failed"); + return FAILED; + } + INFO_LOG("Init resource success"); + + int64_t caseId = 1; + if (!RunOp(caseId)) { + DestroyResource(); + return FAILED; + } + + caseId = 2; + if (!RunOp(caseId)) { + DestroyResource(); + return FAILED; + } + + caseId = 3; + if (!RunOp(caseId)) { + DestroyResource(); + return FAILED; + } + + DestroyResource(); + return SUCCESS; +} diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/src/op_runner.cpp b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/src/op_runner.cpp new file mode 100644 index 000000000..36d197bc5 --- /dev/null +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/src/op_runner.cpp @@ -0,0 +1,462 @@ +/** + * @file op_runner.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#include "op_runner.h" + +#include +#include + +#include "acl/acl_op_compiler.h" +#include "aclnn_add_custom.h" +#include "common.h" + +using namespace std; + +extern bool g_isDevice; + +OpRunner::OpRunner(OperatorDesc *opDesc) : opDesc_(opDesc) +{ + numInputs_ = opDesc->inputDesc.size(); + numOutputs_ = opDesc->outputDesc.size(); + workspace_ = nullptr; +} + +OpRunner::~OpRunner() +{ + if (workspace_ != nullptr) { + (void)aclrtFree(workspace_); + } + for (size_t i = 0; i < numInputs_; ++i) { + (void)aclDestroyTensor(inputTensor_[i]); + (void)aclDestroyDataBuffer(inputBuffers_[i]); + (void)aclrtFree(devInputs_[i]); + if (g_isDevice) { + (void)aclrtFree(hostInputs_[i]); + } else { + (void)aclrtFreeHost(hostInputs_[i]); + } + } + + for (size_t i = 0; i < numOutputs_; ++i) { + (void)aclDestroyTensor(outputTensor_[i]); + (void)aclDestroyDataBuffer(outputBuffers_[i]); + (void)aclrtFree(devOutputs_[i]); + if (g_isDevice) { + (void)aclrtFree(hostOutputs_[i]); + } else { + (void)aclrtFreeHost(hostOutputs_[i]); + } + } +} + +bool OpRunner::Init() +{ + for (size_t i = 0; i < numInputs_; ++i) { + auto size = GetInputSize(i); + void *devMem = nullptr; + if (aclrtMalloc(&devMem, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + devInputs_.emplace_back(devMem); + inputBuffers_.emplace_back(aclCreateDataBuffer(devMem, size)); + + void *hostInput = nullptr; + if (g_isDevice) { + if (aclrtMalloc(&hostInput, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + } else { + if (aclrtMallocHost(&hostInput, size) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + } + if (hostInput == nullptr) { + ERROR_LOG("Malloc memory for input[%zu] failed", i); + return false; + } + hostInputs_.emplace_back(hostInput); + + aclTensor *inputTensor = + 
aclCreateTensor(GetInputShape(i).data(), GetInputNumDims(i), GetInputDataType(i), nullptr, 0, + GetInputFormat(i), GetInputShape(i).data(), GetInputNumDims(i), devInputs_[i]); + if (inputTensor == nullptr) { + ERROR_LOG("Create Tensor for input[%zu] failed", i); + return false; + } + inputTensor_.emplace_back(inputTensor); + } + + for (size_t i = 0; i < numOutputs_; ++i) { + auto size = GetOutputSize(i); + void *devMem = nullptr; + if (aclrtMalloc(&devMem, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } + devOutputs_.emplace_back(devMem); + outputBuffers_.emplace_back(aclCreateDataBuffer(devMem, size)); + + void *hostOutput = nullptr; + if (g_isDevice) { + if (aclrtMalloc(&hostOutput, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } + } else { + if (aclrtMallocHost(&hostOutput, size) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } + } + if (hostOutput == nullptr) { + ERROR_LOG("Malloc host memory for output[%zu] failed", i); + return false; + } + hostOutputs_.emplace_back(hostOutput); + + aclTensor *outputTensor = + aclCreateTensor(GetOutputShape(i).data(), GetOutputNumDims(i), GetOutputDataType(i), nullptr, 0, + GetOutputFormat(i), GetOutputShape(i).data(), GetOutputNumDims(i), devOutputs_[i]); + if (outputTensor == nullptr) { + ERROR_LOG("Create Tensor for output[%zu] failed", i); + return false; + } + outputTensor_.emplace_back(outputTensor); + } + + return true; +} + +const size_t OpRunner::NumInputs() +{ + return numInputs_; +} + +const size_t OpRunner::NumOutputs() +{ + return numOutputs_; +} + +const size_t OpRunner::GetInputSize(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. 
index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + + return aclGetTensorDescSize(opDesc_->inputDesc[index]); +} + +const size_t OpRunner::GetInputNumDims(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + + return aclGetTensorDescNumDims(opDesc_->inputDesc[index]); +} + +aclDataType OpRunner::GetInputDataType(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ACL_DT_UNDEFINED; + } + + return aclGetTensorDescType(opDesc_->inputDesc[index]); +} + +aclFormat OpRunner::GetInputFormat(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ACL_FORMAT_UNDEFINED; + } + + return aclGetTensorDescFormat(opDesc_->inputDesc[index]); +} + +std::vector OpRunner::GetInputShape(size_t index) const +{ + std::vector ret; + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ret; + } + + auto desc = opDesc_->inputDesc[index]; + for (size_t i = 0; i < aclGetTensorDescNumDims(desc); ++i) { + int64_t dimSize; + if (aclGetTensorDescDimV2(desc, i, &dimSize) != ACL_SUCCESS) { + ERROR_LOG("get dims from tensor desc failed. dims index = %zu", i); + ret.clear(); + return ret; + } + ret.emplace_back(dimSize); + } + + return ret; +} + +size_t OpRunner::GetOutputSize(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + + return aclGetTensorDescSize(opDesc_->outputDesc[index]); +} + +const size_t OpRunner::GetOutputNumDims(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. 
index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + + return aclGetTensorDescNumDims(opDesc_->outputDesc[index]); +} + +aclDataType OpRunner::GetOutputDataType(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ACL_DT_UNDEFINED; + } + + return aclGetTensorDescType(opDesc_->outputDesc[index]); +} + +aclFormat OpRunner::GetOutputFormat(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ACL_FORMAT_UNDEFINED; + } + + return aclGetTensorDescFormat(opDesc_->outputDesc[index]); +} + +std::vector OpRunner::GetOutputShape(size_t index) const +{ + std::vector ret; + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ret; + } + + auto desc = opDesc_->outputDesc[index]; + for (size_t i = 0; i < aclGetTensorDescNumDims(desc); ++i) { + int64_t dimSize; + if (aclGetTensorDescDimV2(desc, i, &dimSize) != ACL_SUCCESS) { + ERROR_LOG("get dims from tensor desc failed. dims index = %zu", i); + ret.clear(); + return ret; + } + ret.emplace_back(dimSize); + } + return ret; +} + +size_t OpRunner::GetInputElementCount(size_t index) const +{ + if (index >= opDesc_->inputDesc.size()) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + + return aclGetTensorDescElementCount(opDesc_->inputDesc[index]); +} + +size_t OpRunner::GetOutputElementCount(size_t index) const +{ + if (index >= opDesc_->outputDesc.size()) { + ERROR_LOG("index out of range. 
index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + + return aclGetTensorDescElementCount(opDesc_->outputDesc[index]); +} + +bool OpRunner::RunOp(int64_t caseId) +{ + caseId_ = caseId; + for (size_t i = 0; i < numInputs_; ++i) { + auto size = GetInputSize(i); + aclrtMemcpyKind kind = ACL_MEMCPY_HOST_TO_DEVICE; + if (g_isDevice) { + kind = ACL_MEMCPY_DEVICE_TO_DEVICE; + } + if (aclrtMemcpy(devInputs_[i], size, hostInputs_[i], size, kind) != ACL_SUCCESS) { + ERROR_LOG("Copy input[%zu] failed", i); + return false; + } + INFO_LOG("Copy input[%zu] success", i); + } + + aclrtStream stream = nullptr; + if (aclrtCreateStream(&stream) != ACL_SUCCESS) { + ERROR_LOG("Create stream failed"); + return false; + } + INFO_LOG("Create stream success"); + + size_t workspaceSize = 0; + aclOpExecutor *handle = nullptr; + auto ret = aclnnAddCustomGetWorkspaceSize(inputTensor_[0], inputTensor_[1], caseId, outputTensor_[0], &workspaceSize, &handle); + if (ret != ACL_SUCCESS) { + (void)aclrtDestroyStream(stream); + ERROR_LOG("Get Operator Workspace failed. error code is %d", static_cast(ret)); + return false; + } + INFO_LOG("Execute aclnnAddCustomGetWorkspaceSize success, workspace size %lu", workspaceSize); + + if (workspaceSize != 0) { + if (aclrtMalloc(&workspace_, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory failed"); + (void)aclrtDestroyStream(stream); + return false; + } + } + + ret = aclnnAddCustom(workspace_, workspaceSize, handle, stream); + if (ret != ACL_SUCCESS) { + (void)aclrtDestroyStream(stream); + ERROR_LOG("Execute Operator failed. error code is %d", static_cast(ret)); + return false; + } + INFO_LOG("Execute aclnnAddCustom success"); + + // The unit of 5000 is ms. + ret = aclrtSynchronizeStreamWithTimeout(stream, 5000); + if (ret != SUCCESS) { + ERROR_LOG("Synchronize stream failed. 
error code is %d", static_cast(ret)); + (void)aclrtDestroyStream(stream); + return false; + } + INFO_LOG("Synchronize stream success"); + + for (size_t i = 0; i < numOutputs_; ++i) { + auto size = GetOutputSize(i); + aclrtMemcpyKind kind = ACL_MEMCPY_DEVICE_TO_HOST; + if (g_isDevice) { + kind = ACL_MEMCPY_DEVICE_TO_DEVICE; + } + if (aclrtMemcpy(hostOutputs_[i], size, devOutputs_[i], size, kind) != ACL_SUCCESS) { + ERROR_LOG("Copy output[%zu] failed", i); + (void)aclrtDestroyStream(stream); + return false; + } + INFO_LOG("Copy output[%zu] success", i); + } + + (void)aclrtDestroyStream(stream); + return true; +} + +int64_t OpRunner::GetCaseId() +{ + return caseId_; +} + +template void DoPrintData(const T *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << data[i]; + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void DoPrintFp16Data(const aclFloat16 *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << std::setprecision(4) << aclFloat16ToFloat(data[i]); + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void PrintData(const void *data, size_t count, aclDataType dataType, size_t elementsPerRow) +{ + if (data == nullptr) { + ERROR_LOG("Print data failed. 
data is nullptr"); + return; + } + + switch (dataType) { + case ACL_BOOL: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT8: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT8: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT16: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT16: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT32: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT32: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT64: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT64: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_FLOAT16: + DoPrintFp16Data(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_FLOAT: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_DOUBLE: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + default: + ERROR_LOG("Unsupported type: %d", dataType); + } +} + +void OpRunner::PrintInput(size_t index, size_t numElementsPerRow) +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return; + } + + auto desc = opDesc_->inputDesc[index]; + PrintData(hostInputs_[index], GetInputElementCount(index), aclGetTensorDescType(desc), numElementsPerRow); +} + +void OpRunner::PrintOutput(size_t index, size_t numElementsPerRow) +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. 
index = %zu, numOutputs = %zu", index, numOutputs_); + return; + } + + auto desc = opDesc_->outputDesc[index]; + PrintData(hostOutputs_[index], GetOutputElementCount(index), aclGetTensorDescType(desc), numElementsPerRow); +} diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/src/operator_desc.cpp b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/src/operator_desc.cpp new file mode 100644 index 000000000..90e0ac343 --- /dev/null +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/src/operator_desc.cpp @@ -0,0 +1,51 @@ +/** + * @file operator_desc.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "operator_desc.h" + +#include "common.h" + +using namespace std; + +OperatorDesc::OperatorDesc() {} + +OperatorDesc::~OperatorDesc() +{ + for (auto *desc : inputDesc) { + aclDestroyTensorDesc(desc); + } + + for (auto *desc : outputDesc) { + aclDestroyTensorDesc(desc); + } +} + +OperatorDesc &OperatorDesc::AddInputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format) +{ + aclTensorDesc *desc = aclCreateTensorDesc(dataType, numDims, dims, format); + if (desc == nullptr) { + ERROR_LOG("create tensor failed"); + return *this; + } + inputDesc.emplace_back(desc); + return *this; +} + +OperatorDesc &OperatorDesc::AddOutputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, + aclFormat format) +{ + aclTensorDesc *desc = aclCreateTensorDesc(dataType, numDims, dims, format); + if (desc == nullptr) { + ERROR_LOG("create tensor failed"); + return *this; + } + + outputDesc.emplace_back(desc); + return *this; +} diff --git 
a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom.json b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom.json new file mode 100644 index 000000000..b76e8928f --- /dev/null +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom.json @@ -0,0 +1,47 @@ +[ + { + "op": "AddCustom", + "language": "cpp", + "input_desc": [ + { + "name": "x", + "param_type": "required", + "format": [ + "ND" + ], + "type": [ + "float" + ] + }, + { + "name": "y", + "param_type": "required", + "format": [ + "ND" + ], + "type": [ + "float" + ] + } + ], + "output_desc": [ + { + "name": "z", + "param_type": "required", + "format": [ + "ND" + ], + "type": [ + "float" + ] + } + ], + "attr": [ + { + "name": "case_id", + "type": "int", + "value": 1 + } + ] + } +] \ No newline at end of file diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_host/add_custom.cpp b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_host/add_custom.cpp new file mode 100644 index 000000000..b7e19fd43 --- /dev/null +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_host/add_custom.cpp @@ -0,0 +1,61 @@ +/** + * @file add_custom.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#include "../op_kernel/add_custom_tiling.h" +#include "register/op_def_registry.h" + +namespace optiling { +static ge::graphStatus TilingFunc(gert::TilingContext *context) +{ + constexpr uint32_t BLOCK_DIM = 40; + context->SetBlockDim(BLOCK_DIM); + + // set tiling_key + auto attrs = context->GetAttrs(); + const int64_t *caseId = attrs->GetInt(0); + context->SetTilingKey(*caseId); + + AddCustomTilingData *tiling = context->GetTilingData(); + constexpr uint32_t M_A = 128 * BLOCK_DIM; + constexpr uint32_t N_A = M_A; + constexpr uint32_t M_B = M_A; + constexpr uint32_t N_B = M_A * 3; + constexpr uint32_t TILE_M = 64; + constexpr uint32_t TILE_N = 128; + constexpr uint32_t LOOP_ONE_CORE = M_A / TILE_M; + tiling->mA = M_A; + tiling->nA = N_A; + tiling->mB = M_B; + tiling->nB = N_B; + tiling->tileM = TILE_M; + tiling->tileN = TILE_N; + tiling->loopOneCore = LOOP_ONE_CORE; + + // set workspace size + size_t *currentWorkspace = context->GetWorkspaceSizes(1); + currentWorkspace[0] = 0; + + return ge::GRAPH_SUCCESS; +} +} // namespace optiling + +namespace ops { +class AddCustom : public OpDef { +public: + explicit AddCustom(const char *name) : OpDef(name) + { + this->Input("x").ParamType(REQUIRED).DataType({ge::DT_FLOAT}).Format({ge::FORMAT_ND}); + this->Input("y").ParamType(REQUIRED).DataType({ge::DT_FLOAT}).Format({ge::FORMAT_ND}); + this->Output("z").ParamType(REQUIRED).DataType({ge::DT_FLOAT}).Format({ge::FORMAT_ND}); + this->AICore().SetTiling(optiling::TilingFunc).AddConfig("ascend910b"); + this->Attr("case_id").Int(1); + } +}; +OP_ADD(AddCustom); +} // namespace ops diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom.cpp b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom.cpp new file mode 100644 index 000000000..c2111913d --- /dev/null +++ 
b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom.cpp @@ -0,0 +1,27 @@ +/** + * @file add_custom.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "kernel_operator.h" +#include "add_custom_v1.h" + +extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling) +{ + REGISTER_TILING_DEFAULT(AddCustomTilingData); + GET_TILING_DATA(tilingData, tiling); + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); + if (TILING_KEY_IS(1UL)) { + KernelAddV1 op; + op.Init(x, y, z, &tilingData); + op.Process(); + } else if (TILING_KEY_IS(2UL)) { + // KernelAddsV2 op; + // op.Init(x, z, &tilingData); + // op.Process(); + } +} diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_tiling.h b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_tiling.h new file mode 100644 index 000000000..a034c0dba --- /dev/null +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_tiling.h @@ -0,0 +1,24 @@ +/** + * @file add_custom_tiling.h + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#ifndef ADD_CUSTOM_TILING_H +#define ADD_CUSTOM_TILING_H +#include + +class AddCustomTilingData { +public: + uint32_t mA; + uint32_t nA; + uint32_t mB; + uint32_t nB; + uint32_t tileM; + uint32_t tileN; + uint32_t loopOneCore; +}; +#endif // ADD_CUSTOM_TILING_H diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h new file mode 100644 index 000000000..f092b9c12 --- /dev/null +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h @@ -0,0 +1,109 @@ +/** + * @file add_custom_v1.h + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#ifndef ADD_CUSTOM_V1_H +#define ADD_CUSTOM_V1_H +#include "kernel_operator.h" +#include "add_custom_tiling.h" + +static constexpr uint32_t M_A = 128*40; +static constexpr uint32_t N_A = M_A; +static constexpr uint32_t M_B = M_A; +static constexpr uint32_t N_B = N_A * 3; +static constexpr uint32_t TILE_M = 64; +static constexpr uint32_t TILE_N = 128; + +using AscendC::TPosition; +class KernelAddV1 { +public: + __aicore__ inline KernelAddV1() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData *tilingPtr) + { + tiling = tilingPtr; + xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * TILE_N); + yGm.SetGlobalBuffer((__gm__ float *)y + AscendC::GetBlockIdx() * TILE_N); + zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * TILE_N); + // // the gm address conflict happens when multi cores visit the same addr range(512Bytes) + // // we disable the L2 cache mode to highlight the influence of the gm address conflict + // xGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE); + // zGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE); + pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); + } + __aicore__ inline void Process() + { + for (int i = 0; i < 3; i++){ + // 最外层循环,因为B大小时A的3倍 + for (int j = 0; j < M_A / TILE_M; j++) { + // 内层以tile块切分 + CopyIn(i, j); + Compute(); + CopyOut(i, j); + } + } + } + +private: + __aicore__ inline void CopyIn(int32_t progressOuter, int32_t progressInner) + { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); + AscendC::DataCopyParams paramsX; + paramsX.blockCount = TILE_M; + paramsX.blockLen = TILE_N * sizeof(float) / BLOCK_SIZE; + paramsX.srcStride = (N_A - TILE_N) * sizeof(float) / BLOCK_SIZE; + paramsX.dstStride = 0; + 
AscendC::DataCopy(xLocal, xGm[progressInner * TILE_M * N_A], paramsX); + + AscendC::DataCopyParams paramsY; + paramsY.blockCount = TILE_M; + paramsY.blockLen = TILE_N * sizeof(float) / BLOCK_SIZE; + paramsY.srcStride = (N_B - TILE_N) * sizeof(float) / BLOCK_SIZE; + paramsY.dstStride = 0; + AscendC::DataCopy(yLocal, yGm[progressOuter * N_A + progressInner * TILE_M * N_B], paramsY); + inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute() + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = inQueueY.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + AscendC::Add(zLocal, xLocal, yLocal, TILE_M * TILE_N); + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void CopyOut(int32_t progressOuter, int32_t progressInner) + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + AscendC::DataCopyParams paramsZ; + paramsZ.blockCount = TILE_M; + paramsZ.blockLen = TILE_N * sizeof(float) / BLOCK_SIZE; + paramsZ.srcStride = 0; + paramsZ.dstStride = (N_B - TILE_N) * sizeof(float) / BLOCK_SIZE; + AscendC::DataCopy(zGm[progressOuter * N_A + progressInner * TILE_M * N_B], zLocal, paramsZ); + outQueueZ.FreeTensor(zLocal); + } + +private: + static constexpr int32_t BUFFER_NUM = 2; + static constexpr int32_t BLOCK_SIZE = 32; + + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue inQueueY; + AscendC::TQue outQueueZ; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; + AddCustomTilingData *tiling; +}; +#endif // ADD_CUSTOM_V1_H \ No newline at end of file diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/README.md b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/README.md new file mode 100644 index 000000000..03fdd24fc --- /dev/null +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/README.md @@ -0,0 
+1,161 @@ +## 概述 + +本样例基于AddsCustom算子工程,介绍了同地址冲突的影响以及两种解决方法。 + +## 目录结构介绍 + +``` +├── 15_mata_address_conflict // 同地址冲突样例工程目录 +│ ├── AclNNInvocation // 通过单算子API调用的方式调用AddsCustom算子 +│ ├── AddsCustom // AddsCustom算子工程 +│ ├── AddsCustom.json // AddsCustom算子的原型定义json文件 +│ └── install.sh // 脚本,调用msOpGen生成自定义算子工程,并编译 +``` + +## 算子描述 + +Adds算子实现了一个Tensor与标量值2.0相加,返回相加结果的功能。对应的数学表达式为: + +``` +z = x + 2.0 +``` + +本样例主要介绍数据搬运中的同地址冲突对搬运效率的影响,在global memory的数据访问中,数据访问请求(读/写)在芯片内部会按照512 Bytes对齐进行地址转换,同一时刻如果多核的数据访问请求在转换后落在连续的512 Bytes范围内,出于数据一致性的要求,芯片会对落入同一个512Bytes范围内的请求进行串行处理,导致搬运效率降低,即发生了同地址访问现象。 + +当前算子执行机制保证用户kernel入参(包括workspace/tiling)的地址是512 Bytes对齐的,因此用户只需要根据地址的偏移量即可判断两个地址是否会落入连续的512 Bytes范围内。 + +## 算子规格描述 + + + + + + + + + + + +
算子类型(OpType)Adds
算子输入nameshapedata typeformat
x8192 * 128floatND
算子输出z8192 * 128floatND
核函数名adds_custom
+ +## 支持的产品型号 + +本样例支持如下产品型号: + +- Atlas A2训练系列产品/Atlas 800I A2推理产品 + +## 算子工程介绍 + +其中,算子工程目录AddsCustom包含算子的实现文件,如下所示: + +``` +├── AddsCustom // AddsCustom自定义算子工程 +│ ├── op_host // host侧实现文件 +│ └── op_kernel // kernel侧实现文件 +``` + +CANN软件包中提供了工程创建工具msOpGen,AddsCustom算子工程可通过AddsCustom.json自动创建,自定义算子工程具体请参考[Ascend C算子开发](https://hiascend.com/document/redirect/CannCommunityOpdevAscendC)>工程化算子开发>创建算子工程 章节。 + +创建完自定义算子工程后,开发者重点需要完成算子host和kernel文件的功能开发。为简化样例运行流程,本样例已在AddsCustom目录中准备好了必要的算子实现,install.sh脚本会创建一个CustomOp目录,并将算子实现文件复制到对应目录下,再编译算子。 + +备注:CustomOp目录为生成目录,每次执行install.sh脚本都会删除该目录并重新生成,切勿在该目录下编码算子,会存在丢失风险。 + +## 编译运行样例算子 + +针对自定义算子工程,编译运行包含如下步骤: + +- 调用msOpGen工具生成自定义算子工程; +- 完成算子host和kernel实现; +- 编译自定义算子工程生成自定义算子包; +- 安装自定义算子包到自定义算子库中; +- 调用执行自定义算子; + +详细操作如下所示。 + +### 1. 获取源码包 + +编译运行此样例前,请参考[准备:获取样例代码](../README.md#codeready)获取源码包。 + +### 2. 生成自定义算子工程,复制host和kernel实现并编译算子 + +- 切换到msOpGen脚本install.sh所在目录 + + ```bash + # 若开发者以git命令行方式clone了master分支代码,并切换目录 + cd ${git_clone_path}/samples/operator/ascendc/4_best_practices/15_mata_address_conflict + ``` +- 调用脚本,生成自定义算子工程,复制host和kernel实现并编译算子 + + - 方式一:配置环境变量运行脚本 + 请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量命令。 + - 默认路径,root用户安装CANN软件包 + + ```bash + export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + ``` + - 默认路径,非root用户安装CANN软件包 + + ```bash + export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + ``` + - 指定路径install_path,安装CANN软件包 + + ```bash + export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest + ``` + + 运行install.sh脚本 + + ```bash + bash install.sh -v [SOC_VERSION] + ``` + - 方式二:指定命令行安装路径来运行脚本 + ```bash + bash install.sh -v [SOC_VERSION] -i [ASCEND_INSTALL_PATH] + ``` + + 参数说明: + + - SOC_VERSION:昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2训练系列产品/Atlas 800I A2推理产品 + - ASCEND_INSTALL_PATH:CANN软件包安装路径 + + 
脚本运行成功后,会在当前目录下创建CustomOp目录,编译完成后,会在CustomOp/build_out中,生成自定义算子安装包custom_opp_\_\.run,例如“custom_opp_ubuntu_x86_64.run”。 + + 备注:如果要使用dump调试功能,需要移除op_host内的Atlas 训练系列产品、Atlas 200/500 A2 推理产品的配置项。 + +### 3. 部署自定义算子包 + +- 部署自定义算子包前,请确保存在自定义算子包默认部署路径环境变量ASCEND_OPP_PATH + + ```bash + echo $ASCEND_OPP_PATH + # 输出示例 /usr/local/Ascend/ascend-toolkit/latest/opp + + # 若没有,则需导出CANN环境变量 + source [ASCEND_INSTALL_PATH]/bin/setenv.bash + # 例如 source /usr/local/Ascend/ascend-toolkit/latest/bin/setenv.bash + ``` + + 参数说明: + + - ASCEND_INSTALL_PATH:CANN软件包安装路径,一般和上一步中指定的路径保持一致 +- 在自定义算子安装包所在路径下,执行如下命令安装自定义算子包 + + ```bash + cd CustomOp/build_out + ./custom_opp__.run + ``` + + 命令执行成功后,自定义算子包中的相关文件将部署至opp算子库环境变量ASCEND_OPP_PATH指向的的vendors/customize目录中。 + +### 4. 调用执行算子工程 + +- [单算子API调用AddsCustom算子工程](./AclNNInvocation/README.md) + +## 更新说明 + + +| 时间 | 更新事项 | +| ---------- | -------- | +| 2025/07/03 | 新增样例 | diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/install.sh b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/install.sh new file mode 100755 index 000000000..09c8bf0aa --- /dev/null +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/install.sh @@ -0,0 +1,59 @@ +#!/bin/bash +SHORT=v:,i:, +LONG=soc-version:,install-path:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" + +while :; do + case "$1" in + -v | --soc-version) + SOC_VERSION="$2" + shift 2 + ;; + -i | --install-path) + ASCEND_INSTALL_PATH="$2" + shift 2 + ;; + --) + shift + break + ;; + *) + echo "[ERROR] Unexpected option: $1" + break + ;; + esac +done + +# only support Ascend910B2 since different soc version have different cache size +VERSION_LIST="Ascend910B2" +if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then + echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]" + exit -1 +fi + +if [ -n "$ASCEND_INSTALL_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH +elif [ -n 
"$ASCEND_HOME_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH +else + if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then + _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + else + _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + fi +fi +source $_ASCEND_INSTALL_PATH/bin/setenv.bash +export ASCEND_HOME_PATH=$_ASCEND_INSTALL_PATH + +OP_NAME=AddCustom +# Generate the op framework +rm -rf CustomOp && msopgen gen -i $OP_NAME.json -c ai_core-${SOC_VERSION} -lan cpp -out CustomOp +# Copy op implementation files to CustomOp +rm -rf CustomOp/op_host/*.cpp +rm -rf CustomOp/op_kernel/*.h && rm -rf CustomOp/op_kernel/*.cpp +cp -rf $OP_NAME/op_kernel CustomOp/ +cp -rf $OP_NAME/op_host CustomOp/ + +# Build CustomOp project +(cd CustomOp && bash build.sh) \ No newline at end of file -- Gitee From 548639e6a2fd127f26781cc18633426725c4dd8e Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Mon, 14 Jul 2025 15:42:00 +0800 Subject: [PATCH 02/19] add case1 and case2 --- .../AclNNInvocation/input/.keep | 0 .../l2_cache_bypass/AclNNInvocation/run.sh | 5 +- .../AclNNInvocation/scripts/gen_data.py | 4 +- .../AclNNInvocation/src/main.cpp | 6 - .../AddCustom/op_host/add_custom.cpp | 15 +-- .../AddCustom/op_kernel/add_custom.cpp | 7 +- .../AddCustom/op_kernel/add_custom_tiling.h | 8 +- .../AddCustom/op_kernel/add_custom_v1.h | 22 ++-- .../AddCustom/op_kernel/add_custom_v2.h | 106 ++++++++++++++++++ 9 files changed, 124 insertions(+), 49 deletions(-) delete mode 100644 operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/input/.keep create mode 100644 operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/input/.keep b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/input/.keep deleted file mode 100644 index 
e69de29bb..000000000 diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/run.sh b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/run.sh index 057249612..abd856721 100755 --- a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/run.sh +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/run.sh @@ -57,7 +57,7 @@ function main { export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH} cd $CURRENT_DIR/output echo "INFO: execute op!" - msprof op --launch-count=3 --output=./prof ./execute_add_op + msprof --application="./execute_add_op" --ai-core=on --l2=on ./execute_add_op if [ $? -ne 0 ]; then echo "ERROR: acl executable run failed! please check your project!" return 1 @@ -67,8 +67,7 @@ function main { # 5. 精度比对 cd $CURRENT_DIR python3 scripts/verify_result.py output/output_z_1.bin output/golden.bin - # python3 scripts/verify_result.py output/output_z_2.bin output/golden.bin - # python3 scripts/verify_result.py output/output_z_3.bin output/golden.bin + python3 scripts/verify_result.py output/output_z_2.bin output/golden.bin if [ $? -ne 0 ]; then echo "ERROR: verify result failed!" 
return 1 diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/scripts/gen_data.py b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/scripts/gen_data.py index cc13250b7..17b3d7119 100644 --- a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/scripts/gen_data.py +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/scripts/gen_data.py @@ -15,12 +15,12 @@ def gen_golden_data_simple(): row = 5120 col = 5120 input_x = np.random.uniform(1, 10, [row, col]).astype(np.float32) - input_y = np.random.uniform(1, 20, [row, col * 3]).astype(np.float32) + input_y = np.random.uniform(1, 10, [row, col * 3]).astype(np.float32) y_blocks = np.split(input_y, 3, axis=1) result_blocks = [input_x + block for block in y_blocks] golden = np.hstack(result_blocks) input_x.tofile("./input/input_x.bin") - input_x.tofile("./input/input_y.bin") + input_y.tofile("./input/input_y.bin") golden.tofile("./output/golden.bin") diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/src/main.cpp b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/src/main.cpp index 38fce310c..d727b0a29 100644 --- a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/src/main.cpp +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/src/main.cpp @@ -158,12 +158,6 @@ int main(int argc, char **argv) return FAILED; } - caseId = 3; - if (!RunOp(caseId)) { - DestroyResource(); - return FAILED; - } - DestroyResource(); return SUCCESS; } diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_host/add_custom.cpp b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_host/add_custom.cpp index 
b7e19fd43..5e098faed 100644 --- a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_host/add_custom.cpp +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_host/add_custom.cpp @@ -22,20 +22,7 @@ static ge::graphStatus TilingFunc(gert::TilingContext *context) context->SetTilingKey(*caseId); AddCustomTilingData *tiling = context->GetTilingData(); - constexpr uint32_t M_A = 128 * BLOCK_DIM; - constexpr uint32_t N_A = M_A; - constexpr uint32_t M_B = M_A; - constexpr uint32_t N_B = M_A * 3; - constexpr uint32_t TILE_M = 64; - constexpr uint32_t TILE_N = 128; - constexpr uint32_t LOOP_ONE_CORE = M_A / TILE_M; - tiling->mA = M_A; - tiling->nA = N_A; - tiling->mB = M_B; - tiling->nB = N_B; - tiling->tileM = TILE_M; - tiling->tileN = TILE_N; - tiling->loopOneCore = LOOP_ONE_CORE; + tiling->loopOuter = 3; // set workspace size size_t *currentWorkspace = context->GetWorkspaceSizes(1); diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom.cpp b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom.cpp index c2111913d..895e6444f 100644 --- a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom.cpp +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom.cpp @@ -9,6 +9,7 @@ */ #include "kernel_operator.h" #include "add_custom_v1.h" +#include "add_custom_v2.h" extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling) { @@ -20,8 +21,8 @@ extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z op.Init(x, y, z, &tilingData); op.Process(); } else if (TILING_KEY_IS(2UL)) { - // KernelAddsV2 op; - // op.Init(x, z, &tilingData); - // op.Process(); + KernelAddV2 op; + op.Init(x, y, z, &tilingData); + 
op.Process(); } } diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_tiling.h b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_tiling.h index a034c0dba..d865aba89 100644 --- a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_tiling.h +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_tiling.h @@ -13,12 +13,6 @@ class AddCustomTilingData { public: - uint32_t mA; - uint32_t nA; - uint32_t mB; - uint32_t nB; - uint32_t tileM; - uint32_t tileN; - uint32_t loopOneCore; + uint32_t loopOuter; }; #endif // ADD_CUSTOM_TILING_H diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h index f092b9c12..d4641ed64 100644 --- a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h @@ -12,13 +12,6 @@ #include "kernel_operator.h" #include "add_custom_tiling.h" -static constexpr uint32_t M_A = 128*40; -static constexpr uint32_t N_A = M_A; -static constexpr uint32_t M_B = M_A; -static constexpr uint32_t N_B = N_A * 3; -static constexpr uint32_t TILE_M = 64; -static constexpr uint32_t TILE_N = 128; - using AscendC::TPosition; class KernelAddV1 { public: @@ -29,20 +22,14 @@ public: xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * TILE_N); yGm.SetGlobalBuffer((__gm__ float *)y + AscendC::GetBlockIdx() * TILE_N); zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * TILE_N); - // // the gm address conflict happens when multi cores visit the same addr range(512Bytes) 
- // // we disable the L2 cache mode to highlight the influence of the gm address conflict - // xGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE); - // zGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE); pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); } __aicore__ inline void Process() { - for (int i = 0; i < 3; i++){ - // 最外层循环,因为B大小时A的3倍 + for (int i = 0; i < tiling->loopOuter; i++){ for (int j = 0; j < M_A / TILE_M; j++) { - // 内层以tile块切分 CopyIn(i, j); Compute(); CopyOut(i, j); @@ -96,6 +83,13 @@ private: private: static constexpr int32_t BUFFER_NUM = 2; static constexpr int32_t BLOCK_SIZE = 32; + static constexpr uint32_t M_A = 128*40; + static constexpr uint32_t N_A = M_A; + static constexpr uint32_t M_B = M_A; + static constexpr uint32_t N_B = N_A * 3; + static constexpr uint32_t TILE_M = 64; + static constexpr uint32_t TILE_N = 128; + AscendC::TPipe pipe; AscendC::TQue inQueueX; diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h new file mode 100644 index 000000000..eb21f429d --- /dev/null +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h @@ -0,0 +1,106 @@ +/** + * @file add_custom_v2.h + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#ifndef ADD_CUSTOM_V2_H +#define ADD_CUSTOM_V2_H +#include "kernel_operator.h" +#include "add_custom_tiling.h" + +using AscendC::TPosition; +class KernelAddV2 { +public: + __aicore__ inline KernelAddV2() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData *tilingPtr) + { + tiling = tilingPtr; + xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * TILE_N); + yGm.SetGlobalBuffer((__gm__ float *)y + AscendC::GetBlockIdx() * TILE_N); + zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * TILE_N); + // disable the l2 cache mode of y and z + yGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE); + zGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE); + pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); + } + __aicore__ inline void Process() + { + for (int i = 0; i < tiling->loopOuter; i++){ + for (int j = 0; j < M_A / TILE_M; j++) { + CopyIn(i, j); + Compute(); + CopyOut(i, j); + } + } + } + +private: + __aicore__ inline void CopyIn(int32_t progressOuter, int32_t progressInner) + { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); + AscendC::DataCopyParams paramsX; + paramsX.blockCount = TILE_M; + paramsX.blockLen = TILE_N * sizeof(float) / BLOCK_SIZE; + paramsX.srcStride = (N_A - TILE_N) * sizeof(float) / BLOCK_SIZE; + paramsX.dstStride = 0; + AscendC::DataCopy(xLocal, xGm[progressInner * TILE_M * N_A], paramsX); + + AscendC::DataCopyParams paramsY; + paramsY.blockCount = TILE_M; + paramsY.blockLen = TILE_N * sizeof(float) / BLOCK_SIZE; + paramsY.srcStride = (N_B - TILE_N) * sizeof(float) / BLOCK_SIZE; + paramsY.dstStride = 0; + AscendC::DataCopy(yLocal, yGm[progressOuter * N_A + progressInner * TILE_M * N_B], paramsY); + inQueueX.EnQue(xLocal); + 
inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute() + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = inQueueY.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + AscendC::Add(zLocal, xLocal, yLocal, TILE_M * TILE_N); + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void CopyOut(int32_t progressOuter, int32_t progressInner) + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + AscendC::DataCopyParams paramsZ; + paramsZ.blockCount = TILE_M; + paramsZ.blockLen = TILE_N * sizeof(float) / BLOCK_SIZE; + paramsZ.srcStride = 0; + paramsZ.dstStride = (N_B - TILE_N) * sizeof(float) / BLOCK_SIZE; + AscendC::DataCopy(zGm[progressOuter * N_A + progressInner * TILE_M * N_B], zLocal, paramsZ); + outQueueZ.FreeTensor(zLocal); + } + +private: + static constexpr int32_t BUFFER_NUM = 2; + static constexpr int32_t BLOCK_SIZE = 32; + static constexpr uint32_t M_A = 128*40; + static constexpr uint32_t N_A = M_A; + static constexpr uint32_t M_B = M_A; + static constexpr uint32_t N_B = N_A * 3; + static constexpr uint32_t TILE_M = 64; + static constexpr uint32_t TILE_N = 128; + + + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue inQueueY; + AscendC::TQue outQueueZ; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; + AddCustomTilingData *tiling; +}; +#endif // ADD_CUSTOM_V2_H \ No newline at end of file -- Gitee From 0f1a70088699072820f9928a37a627e051146834 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Mon, 14 Jul 2025 15:58:04 +0800 Subject: [PATCH 03/19] fix code style --- .../l2_cache_bypass/AclNNInvocation/run.sh | 2 +- .../l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h | 4 ++-- .../l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git 
a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/run.sh b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/run.sh index abd856721..894fec61c 100755 --- a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/run.sh +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/run.sh @@ -57,7 +57,7 @@ function main { export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH} cd $CURRENT_DIR/output echo "INFO: execute op!" - msprof --application="./execute_add_op" --ai-core=on --l2=on ./execute_add_op + msprof --application="./execute_add_op" --ai-core=on --l2=on --output=./prof if [ $? -ne 0 ]; then echo "ERROR: acl executable run failed! please check your project!" return 1 diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h index d4641ed64..5111e4c4c 100644 --- a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h @@ -28,7 +28,7 @@ public: } __aicore__ inline void Process() { - for (int i = 0; i < tiling->loopOuter; i++){ + for (int i = 0; i < tiling->loopOuter; i++) { for (int j = 0; j < M_A / TILE_M; j++) { CopyIn(i, j); Compute(); @@ -83,7 +83,7 @@ private: private: static constexpr int32_t BUFFER_NUM = 2; static constexpr int32_t BLOCK_SIZE = 32; - static constexpr uint32_t M_A = 128*40; + static constexpr uint32_t M_A = 128 * 40; static constexpr uint32_t N_A = M_A; static constexpr uint32_t M_B = M_A; static constexpr uint32_t N_B = N_A * 3; diff --git 
a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h index eb21f429d..0eee75412 100644 --- a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h @@ -31,7 +31,7 @@ public: } __aicore__ inline void Process() { - for (int i = 0; i < tiling->loopOuter; i++){ + for (int i = 0; i < tiling->loopOuter; i++) { for (int j = 0; j < M_A / TILE_M; j++) { CopyIn(i, j); Compute(); @@ -86,7 +86,7 @@ private: private: static constexpr int32_t BUFFER_NUM = 2; static constexpr int32_t BLOCK_SIZE = 32; - static constexpr uint32_t M_A = 128*40; + static constexpr uint32_t M_A = 128 * 40; static constexpr uint32_t N_A = M_A; static constexpr uint32_t M_B = M_A; static constexpr uint32_t N_B = N_A * 3; -- Gitee From 2ae37e32701483ed5ed82ad5b3eefcfcfe544852 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Mon, 14 Jul 2025 16:51:02 +0800 Subject: [PATCH 04/19] fix readme --- .../12_cachemiss_preload_dcci/README.md | 1 - .../l2_cache_bypass/AclNNInvocation/README.md | 35 +++++++------- .../l2_cache_bypass/README.md | 48 +++++++++---------- 3 files changed, 42 insertions(+), 42 deletions(-) delete mode 100644 operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/README.md diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/README.md b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/README.md deleted file mode 100644 index ae31f00d7..000000000 --- a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/README.md +++ /dev/null @@ -1 +0,0 @@ -CACHE MISS优化 preload dcci(待补充) \ No newline at end of file diff --git 
a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/README.md b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/README.md index 5c1ffb4d2..d9b357205 100644 --- a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/README.md +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/README.md @@ -1,7 +1,7 @@ ## 目录结构介绍 ``` -├── AclNNInvocation //通过单算子API调用的方式调用AddsCustom算子 +├── AclNNInvocation //通过单算子API调用的方式调用AddCustom算子 │ ├── inc // 头文件目录 │ │ ├── common.h // 声明公共方法类,用于读取二进制文件 │ │ ├── op_runner.h // 算子描述声明文件,包含算子输入/输出,算子类型以及输入描述与输出描述 @@ -27,22 +27,23 @@ 自定义算子编译部署后,会自动生成单算子API,可以直接在应用程序中调用。算子API的形式一般定义为“两段式接口”,形如: ```cpp - // 获取算子使用的workspace空间大小 - aclnnStatus aclnnAddsCustomGetWorkspaceSize( - const aclTensor *x, - int64_t caseId, - const aclTensor *out, - uint64_t *workspaceSize, - aclOpExecutor **executor); - // 执行算子 - aclnnStatus aclnnAddsCustom( - void *workspace, - uint64_t workspaceSize, - aclOpExecutor *executor, - aclrtStream stream); +// 获取算子使用的workspace空间大小 +aclnnStatus aclnnAddCustomGetWorkspaceSize( + const aclTensor *x, + const aclTensor *y, + int64_t caseId, + const aclTensor *out, + uint64_t *workspaceSize, + aclOpExecutor **executor); +// 执行算子 +aclnnStatus aclnnAddCustom( + void *workspace, + uint64_t workspaceSize, + aclOpExecutor *executor, + aclrtStream stream); ``` -其中aclnnAddsCustomGetWorkspaceSize为第一段接口,主要用于计算本次API调用计算过程中需要多少的workspace内存。获取到本次API计算需要的workspace大小之后,按照workspaceSize大小申请Device侧内存,然后调用第二段接口aclnnAddsCustom执行计算。具体参考[单算子API调用](https://hiascend.com/document/redirect/CannCommunityAscendCInVorkSingleOp)章节。 +其中aclnnAddCustomGetWorkspaceSize为第一段接口,主要用于计算本次API调用计算过程中需要多少的workspace内存。获取到本次API计算需要的workspace大小之后,按照workspaceSize大小申请Device侧内存,然后调用第二段接口aclnnAddCustom执行计算。具体参考[单算子API调用](https://hiascend.com/document/redirect/CannCommunityAscendCInVorkSingleOp)章节。 ## 运行样例算子 @@ -57,7 
+58,7 @@ 以命令行方式下载样例代码,master分支为例。 ```bash - cd ${git_clone_path}/samples/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation + cd ${git_clone_path}/samples/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation ``` - 样例执行 @@ -72,4 +73,4 @@ | 时间 | 更新事项 | | ---------- | ------------ | -| 2025/07/03 | 新增本readme | +| 2025/07/14 | 新增本readme | diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/README.md b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/README.md index 03fdd24fc..872679a07 100644 --- a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/README.md +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/README.md @@ -1,41 +1,43 @@ ## 概述 -本样例基于AddsCustom算子工程,介绍了同地址冲突的影响以及两种解决方法。 +本样例基于AddCustom算子工程,介绍了设置L2 CacheMode的方法以及其影响场景。 ## 目录结构介绍 ``` -├── 15_mata_address_conflict // 同地址冲突样例工程目录 -│ ├── AclNNInvocation // 通过单算子API调用的方式调用AddsCustom算子 -│ ├── AddsCustom // AddsCustom算子工程 -│ ├── AddsCustom.json // AddsCustom算子的原型定义json文件 +├── l2_cache_bypass // L2 CacheMode样例工程目录 +│ ├── AclNNInvocation // 通过单算子API调用的方式调用AddCustom算子 +│ ├── AddCustom // AddCustom算子工程 +│ ├── AddCustom.json // AddCustom算子的原型定义json文件 │ └── install.sh // 脚本,调用msOpGen生成自定义算子工程,并编译 ``` ## 算子描述 -Adds算子实现了一个Tensor与标量值2.0相加,返回相加结果的功能。对应的数学表达式为: +Add算子实现了两个Shape不相同的Tensor相加,返回相加结果的功能。对应的数学表达式为: ``` -z = x + 2.0 +z = x + y ``` -本样例主要介绍数据搬运中的同地址冲突对搬运效率的影响,在global memory的数据访问中,数据访问请求(读/写)在芯片内部会按照512 Bytes对齐进行地址转换,同一时刻如果多核的数据访问请求在转换后落在连续的512 Bytes范围内,出于数据一致性的要求,芯片会对落入同一个512Bytes范围内的请求进行串行处理,导致搬运效率降低,即发生了同地址访问现象。 - -当前算子执行机制保证用户kernel入参(包括workspace/tiling)的地址是512 Bytes对齐的,因此用户只需要根据地址的偏移量即可判断两个地址是否会落入连续的512 Bytes范围内。 +本样例主要介绍数据搬运中设置合理CacheMode对搬运效率的影响,在Global Memory的数据访问中,如果数据只需要访问一次,后续不需要重复读取,那么这种场景下可以设置Global Memory的CacheMode为CACHE_MODE_DISABLED,在这种模式下数据访问将不经过L2 Cache,避免影响需要重复访问的数据,从而提升数据访问效率。 +本样例中共有2个实现版本: 
+add_custom_v1.h:基础实现版本,从列方向切分,每个核计算5120×128的数据量,共有40个核参与计算; +add_custom_v2.h:在基础实现版本,设置了y/z的eCacheMod为CACHE_MODE_DISABLED; ## 算子规格描述 - + - - + + + - + - +
算子类型(OpType)Adds
算子类型(OpType)Add
算子输入nameshapedata typeformat
x8192 * 128floatND
算子输入nameshapedata typeformat
x5120 * 5120floatND
y5120 * 15360floatND
算子输出z8192 * 128floatND
算子输出z5120 * 15360floatND
核函数名adds_custom
核函数名add_custom
## 支持的产品型号 @@ -46,17 +48,17 @@ z = x + 2.0 ## 算子工程介绍 -其中,算子工程目录AddsCustom包含算子的实现文件,如下所示: +其中,算子工程目录AddCustom包含算子的实现文件,如下所示: ``` -├── AddsCustom // AddsCustom自定义算子工程 +├── AddCustom // AddCustom自定义算子工程 │ ├── op_host // host侧实现文件 │ └── op_kernel // kernel侧实现文件 ``` -CANN软件包中提供了工程创建工具msOpGen,AddsCustom算子工程可通过AddsCustom.json自动创建,自定义算子工程具体请参考[Ascend C算子开发](https://hiascend.com/document/redirect/CannCommunityOpdevAscendC)>工程化算子开发>创建算子工程 章节。 +CANN软件包中提供了工程创建工具msOpGen,AddCustom算子工程可通过AddCustom.json自动创建,自定义算子工程具体请参考[Ascend C算子开发](https://hiascend.com/document/redirect/CannCommunityOpdevAscendC)>工程化算子开发>创建算子工程 章节。 -创建完自定义算子工程后,开发者重点需要完成算子host和kernel文件的功能开发。为简化样例运行流程,本样例已在AddsCustom目录中准备好了必要的算子实现,install.sh脚本会创建一个CustomOp目录,并将算子实现文件复制到对应目录下,再编译算子。 +创建完自定义算子工程后,开发者重点需要完成算子host和kernel文件的功能开发。为简化样例运行流程,本样例已在AddCustom目录中准备好了必要的算子实现,install.sh脚本会创建一个CustomOp目录,并将算子实现文件复制到对应目录下,再编译算子。 备注:CustomOp目录为生成目录,每次执行install.sh脚本都会删除该目录并重新生成,切勿在该目录下编码算子,会存在丢失风险。 @@ -82,7 +84,7 @@ CANN软件包中提供了工程创建工具msOpGen,AddsCustom算子工程可 ```bash # 若开发者以git命令行方式clone了master分支代码,并切换目录 - cd ${git_clone_path}/samples/operator/ascendc/4_best_practices/15_mata_address_conflict + cd ${git_clone_path}/samples/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/ ``` - 调用脚本,生成自定义算子工程,复制host和kernel实现并编译算子 @@ -122,8 +124,6 @@ CANN软件包中提供了工程创建工具msOpGen,AddsCustom算子工程可 脚本运行成功后,会在当前目录下创建CustomOp目录,编译完成后,会在CustomOp/build_out中,生成自定义算子安装包custom_opp_\_\.run,例如“custom_opp_ubuntu_x86_64.run”。 - 备注:如果要使用dump调试功能,需要移除op_host内的Atlas 训练系列产品、Atlas 200/500 A2 推理产品的配置项。 - ### 3. 部署自定义算子包 - 部署自定义算子包前,请确保存在自定义算子包默认部署路径环境变量ASCEND_OPP_PATH @@ -151,11 +151,11 @@ CANN软件包中提供了工程创建工具msOpGen,AddsCustom算子工程可 ### 4. 
调用执行算子工程 -- [单算子API调用AddsCustom算子工程](./AclNNInvocation/README.md) +- [单算子API调用AddCustom算子工程](./AclNNInvocation/README.md) ## 更新说明 | 时间 | 更新事项 | | ---------- | -------- | -| 2025/07/03 | 新增样例 | +| 2025/07/14 | 新增样例 | -- Gitee From adfae8d3ac96831ace9dc187fc7f20cc83588952 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Mon, 14 Jul 2025 17:09:20 +0800 Subject: [PATCH 05/19] remove useless blank --- .../l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h | 1 - .../l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h | 1 - 2 files changed, 2 deletions(-) diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h index 5111e4c4c..7770313ea 100644 --- a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h @@ -90,7 +90,6 @@ private: static constexpr uint32_t TILE_M = 64; static constexpr uint32_t TILE_N = 128; - AscendC::TPipe pipe; AscendC::TQue inQueueX; AscendC::TQue inQueueY; diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h index 0eee75412..2fc194906 100644 --- a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h @@ -93,7 +93,6 @@ private: static constexpr uint32_t TILE_M = 64; static constexpr uint32_t TILE_N = 128; - AscendC::TPipe pipe; AscendC::TQue inQueueX; AscendC::TQue inQueueY; -- Gitee From c2c7e7dcc9956514c9fd9e40618e0741300cd3e3 Mon Sep 17 
00:00:00 2001 From: zhanghao0689 Date: Mon, 14 Jul 2025 17:14:04 +0800 Subject: [PATCH 06/19] fix read me --- .../12_cachemiss_preload_dcci/l2_cache_bypass/README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/README.md b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/README.md index 872679a07..05795de39 100644 --- a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/README.md +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/README.md @@ -21,9 +21,10 @@ z = x + y ``` 本样例主要介绍数据搬运中设置合理CacheMode对搬运效率的影响,在Global Memory的数据访问中,如果数据只需要访问一次,后续不需要重复读取,那么这种场景下可以设置Global Memory的CacheMode为CACHE_MODE_DISABLED,在这种模式下数据访问将不经过L2 Cache,避免影响需要重复访问的数据,从而提升数据访问效率。 + 本样例中共有2个实现版本: -add_custom_v1.h:基础实现版本,从列方向切分,每个核计算5120×128的数据量,共有40个核参与计算; -add_custom_v2.h:在基础实现版本,设置了y/z的eCacheMod为CACHE_MODE_DISABLED; +add_custom_v1.h:基础版本,从列方向切分,每个核计算5120×128的数据量,共有40个核参与计算; +add_custom_v2.h:在add_custom_v1基础上,设置y/z的eCacheMod为CACHE_MODE_DISABLED,避免替换x的Cache数据影响搬运效率; ## 算子规格描述 -- Gitee From 12309b8eceb1e9926f7c7fda755d3e1a2077c5ec Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Mon, 14 Jul 2025 17:16:03 +0800 Subject: [PATCH 07/19] fix readme --- .../12_cachemiss_preload_dcci/l2_cache_bypass/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/README.md b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/README.md index 05795de39..1306a2f5c 100644 --- a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/README.md +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/README.md @@ -26,6 +26,7 @@ z = x + y add_custom_v1.h:基础版本,从列方向切分,每个核计算5120×128的数据量,共有40个核参与计算; add_custom_v2.h:在add_custom_v1基础上,设置y/z的eCacheMod为CACHE_MODE_DISABLED,避免替换x的Cache数据影响搬运效率; + ## 
算子规格描述 -- Gitee From 487f3eef5c0b7d83abfeaf03373e05cf0e4b2a52 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Mon, 14 Jul 2025 17:19:13 +0800 Subject: [PATCH 08/19] fix read me --- .../12_cachemiss_preload_dcci/l2_cache_bypass/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/README.md b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/README.md index 1306a2f5c..9d599f87d 100644 --- a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/README.md +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/README.md @@ -23,7 +23,9 @@ z = x + y 本样例主要介绍数据搬运中设置合理CacheMode对搬运效率的影响,在Global Memory的数据访问中,如果数据只需要访问一次,后续不需要重复读取,那么这种场景下可以设置Global Memory的CacheMode为CACHE_MODE_DISABLED,在这种模式下数据访问将不经过L2 Cache,避免影响需要重复访问的数据,从而提升数据访问效率。 本样例中共有2个实现版本: + add_custom_v1.h:基础版本,从列方向切分,每个核计算5120×128的数据量,共有40个核参与计算; + add_custom_v2.h:在add_custom_v1基础上,设置y/z的eCacheMod为CACHE_MODE_DISABLED,避免替换x的Cache数据影响搬运效率; -- Gitee From bc65dc62ce19681d8c16c32b5c8e473875449503 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Mon, 14 Jul 2025 17:20:24 +0800 Subject: [PATCH 09/19] add blank --- .../12_cachemiss_preload_dcci/l2_cache_bypass/README.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/README.md b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/README.md index 9d599f87d..3663e3666 100644 --- a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/README.md +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/README.md @@ -22,10 +22,8 @@ z = x + y 本样例主要介绍数据搬运中设置合理CacheMode对搬运效率的影响,在Global Memory的数据访问中,如果数据只需要访问一次,后续不需要重复读取,那么这种场景下可以设置Global Memory的CacheMode为CACHE_MODE_DISABLED,在这种模式下数据访问将不经过L2 Cache,避免影响需要重复访问的数据,从而提升数据访问效率。 -本样例中共有2个实现版本: - 
-add_custom_v1.h:基础版本,从列方向切分,每个核计算5120×128的数据量,共有40个核参与计算; - +本样例中共有2个实现版本: +add_custom_v1.h:基础版本,从列方向切分,每个核计算5120×128的数据量,共有40个核参与计算; add_custom_v2.h:在add_custom_v1基础上,设置y/z的eCacheMod为CACHE_MODE_DISABLED,避免替换x的Cache数据影响搬运效率; -- Gitee From 0a4599386142e1e1c614e103cfede76b7579ac8b Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Mon, 14 Jul 2025 17:21:29 +0800 Subject: [PATCH 10/19] add blank --- .../4_best_practices/15_mata_address_conflict/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md b/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md index 1ebba2146..db262bb9a 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md @@ -23,8 +23,8 @@ z = x + 2.0 本样例主要介绍数据搬运中的同地址冲突对搬运效率的影响,在Global Memory的数据访问中,数据访问请求(读/写)在AI 处理器内部会按照512 Bytes对齐进行地址转换,同一时刻如果多核的数据访问请求在转换后落在连续的512 Bytes范围内,出于数据一致性的要求,AI 处理器会对落入同一个512Bytes范围内的请求进行串行处理,导致搬运效率降低,即发生了同地址访问现象。 本样例中共有3个实现版本: adds_custom_v1.h:基础实现版本,每个核的计算顺序一致,存在同地址冲突,带宽效率较差 -adds_custom_v2.h:通过调整每个核的计算顺序,避免发生同地址冲突 -adds_custom_v3.h:通过调整切分顺序,避免发生同地址冲突 +adds_custom_v2.h:通过调整每个核的计算顺序,避免发生同地址冲突 +adds_custom_v3.h:通过调整切分顺序,避免发生同地址冲突 当前算子执行机制保证用户kernel入参(包括workspace/tiling)的地址是512 Bytes对齐的,因此用户只需要根据地址的偏移量即可判断两个地址是否会落入连续的512 Bytes范围内。 -- Gitee From 54ee4ef99431f8d488c30d89feacd11e09166c18 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Mon, 14 Jul 2025 17:22:29 +0800 Subject: [PATCH 11/19] fix readme --- .../4_best_practices/15_mata_address_conflict/README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md b/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md index db262bb9a..d1a743254 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md +++ 
b/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md @@ -21,10 +21,12 @@ z = x + 2.0 ``` 本样例主要介绍数据搬运中的同地址冲突对搬运效率的影响,在Global Memory的数据访问中,数据访问请求(读/写)在AI 处理器内部会按照512 Bytes对齐进行地址转换,同一时刻如果多核的数据访问请求在转换后落在连续的512 Bytes范围内,出于数据一致性的要求,AI 处理器会对落入同一个512Bytes范围内的请求进行串行处理,导致搬运效率降低,即发生了同地址访问现象。 + + 本样例中共有3个实现版本: adds_custom_v1.h:基础实现版本,每个核的计算顺序一致,存在同地址冲突,带宽效率较差 -adds_custom_v2.h:通过调整每个核的计算顺序,避免发生同地址冲突 -adds_custom_v3.h:通过调整切分顺序,避免发生同地址冲突 +adds_custom_v2.h:通过调整每个核的计算顺序,避免发生同地址冲突 +adds_custom_v3.h:通过调整切分顺序,避免发生同地址冲突 当前算子执行机制保证用户kernel入参(包括workspace/tiling)的地址是512 Bytes对齐的,因此用户只需要根据地址的偏移量即可判断两个地址是否会落入连续的512 Bytes范围内。 -- Gitee From 302dc367b2b85a02035d00fac762350e1cfcc4ba Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Mon, 14 Jul 2025 17:23:41 +0800 Subject: [PATCH 12/19] fix typos --- .../4_best_practices/15_mata_address_conflict/README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md b/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md index d1a743254..b4ce4d2fa 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md @@ -22,11 +22,10 @@ z = x + 2.0 本样例主要介绍数据搬运中的同地址冲突对搬运效率的影响,在Global Memory的数据访问中,数据访问请求(读/写)在AI 处理器内部会按照512 Bytes对齐进行地址转换,同一时刻如果多核的数据访问请求在转换后落在连续的512 Bytes范围内,出于数据一致性的要求,AI 处理器会对落入同一个512Bytes范围内的请求进行串行处理,导致搬运效率降低,即发生了同地址访问现象。 - 本样例中共有3个实现版本: adds_custom_v1.h:基础实现版本,每个核的计算顺序一致,存在同地址冲突,带宽效率较差 -adds_custom_v2.h:通过调整每个核的计算顺序,避免发生同地址冲突 -adds_custom_v3.h:通过调整切分顺序,避免发生同地址冲突 +adds_custom_v2.h:通过调整每个核的计算顺序,避免发生同地址冲突 +adds_custom_v3.h:通过调整切分顺序,避免发生同地址冲突 当前算子执行机制保证用户kernel入参(包括workspace/tiling)的地址是512 Bytes对齐的,因此用户只需要根据地址的偏移量即可判断两个地址是否会落入连续的512 Bytes范围内。 -- Gitee From f99902ae838d0b63d8c659b2dad03027ae438981 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Mon, 14 Jul 2025 17:25:42 +0800 Subject: [PATCH 13/19] fix typos --- 
.../4_best_practices/15_mata_address_conflict/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md b/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md index b4ce4d2fa..738e314a3 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md @@ -22,10 +22,10 @@ z = x + 2.0 本样例主要介绍数据搬运中的同地址冲突对搬运效率的影响,在Global Memory的数据访问中,数据访问请求(读/写)在AI 处理器内部会按照512 Bytes对齐进行地址转换,同一时刻如果多核的数据访问请求在转换后落在连续的512 Bytes范围内,出于数据一致性的要求,AI 处理器会对落入同一个512Bytes范围内的请求进行串行处理,导致搬运效率降低,即发生了同地址访问现象。 -本样例中共有3个实现版本: -adds_custom_v1.h:基础实现版本,每个核的计算顺序一致,存在同地址冲突,带宽效率较差 -adds_custom_v2.h:通过调整每个核的计算顺序,避免发生同地址冲突 -adds_custom_v3.h:通过调整切分顺序,避免发生同地址冲突 +本样例中共有3个实现版本: +adds_custom_v1.h:基础实现版本,每个核的计算顺序一致,存在同地址冲突,带宽效率较差 +adds_custom_v2.h:通过调整每个核的计算顺序,避免发生同地址冲突 +adds_custom_v3.h:通过调整切分顺序,避免发生同地址冲突 当前算子执行机制保证用户kernel入参(包括workspace/tiling)的地址是512 Bytes对齐的,因此用户只需要根据地址的偏移量即可判断两个地址是否会落入连续的512 Bytes范围内。 -- Gitee From 641e577d73faa11c92ef6ae9581facc7f76c3387 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Tue, 15 Jul 2025 11:10:27 +0800 Subject: [PATCH 14/19] fix sc --- .../l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h | 6 +++--- .../l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h index 7770313ea..52e0661fe 100644 --- a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h @@ -28,8 +28,8 @@ public: } __aicore__ inline void Process() { - 
for (int i = 0; i < tiling->loopOuter; i++) { - for (int j = 0; j < M_A / TILE_M; j++) { + for (uint32_t i = 0; i < tiling->loopOuter; i++) { + for (uint32_t j = 0; j < M_A / TILE_M; j++) { CopyIn(i, j); Compute(); CopyOut(i, j); @@ -38,7 +38,7 @@ public: } private: - __aicore__ inline void CopyIn(int32_t progressOuter, int32_t progressInner) + __aicore__ inline void CopyIn(uint32_t progressOuter, uint32_t progressInner) { AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h index 2fc194906..e0bf8eba8 100644 --- a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h @@ -31,8 +31,8 @@ public: } __aicore__ inline void Process() { - for (int i = 0; i < tiling->loopOuter; i++) { - for (int j = 0; j < M_A / TILE_M; j++) { + for (uint32_t i = 0; i < tiling->loopOuter; i++) { + for (uint32_t j = 0; j < M_A / TILE_M; j++) { CopyIn(i, j); Compute(); CopyOut(i, j); @@ -41,7 +41,7 @@ public: } private: - __aicore__ inline void CopyIn(int32_t progressOuter, int32_t progressInner) + __aicore__ inline void CopyIn(uint32_t progressOuter, uint32_t progressInner) { AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); -- Gitee From 01d266b76dc0004020909e11e539ca0ca2014ed1 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Tue, 15 Jul 2025 11:47:56 +0800 Subject: [PATCH 15/19] fix sc --- .../l2_cache_bypass/AddCustom/op_host/add_custom.cpp | 3 ++- .../l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h | 8 ++++---- 
.../l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h | 8 ++++---- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_host/add_custom.cpp b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_host/add_custom.cpp index 5e098faed..b9cb652e0 100644 --- a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_host/add_custom.cpp +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_host/add_custom.cpp @@ -22,7 +22,8 @@ static ge::graphStatus TilingFunc(gert::TilingContext *context) context->SetTilingKey(*caseId); AddCustomTilingData *tiling = context->GetTilingData(); - tiling->loopOuter = 3; + // x shape is [5120, 5120], y shape is [5120, 15360], so we set outer loop to 3 + tiling->loopOuter = 3U; // set workspace size size_t *currentWorkspace = context->GetWorkspaceSizes(1); diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h index 52e0661fe..086bca4f0 100644 --- a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h @@ -83,12 +83,12 @@ private: private: static constexpr int32_t BUFFER_NUM = 2; static constexpr int32_t BLOCK_SIZE = 32; - static constexpr uint32_t M_A = 128 * 40; + static constexpr uint32_t M_A = 5120U; static constexpr uint32_t N_A = M_A; static constexpr uint32_t M_B = M_A; - static constexpr uint32_t N_B = N_A * 3; - static constexpr uint32_t TILE_M = 64; - static constexpr uint32_t TILE_N = 128; + static constexpr uint32_t N_B = N_A * 3U; + static constexpr uint32_t TILE_M = 64U; 
+ static constexpr uint32_t TILE_N = 128U; AscendC::TPipe pipe; AscendC::TQue inQueueX; diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h index e0bf8eba8..1f790e84d 100644 --- a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h @@ -86,12 +86,12 @@ private: private: static constexpr int32_t BUFFER_NUM = 2; static constexpr int32_t BLOCK_SIZE = 32; - static constexpr uint32_t M_A = 128 * 40; + static constexpr uint32_t M_A = 5120U; static constexpr uint32_t N_A = M_A; static constexpr uint32_t M_B = M_A; - static constexpr uint32_t N_B = N_A * 3; - static constexpr uint32_t TILE_M = 64; - static constexpr uint32_t TILE_N = 128; + static constexpr uint32_t N_B = N_A * 3U; + static constexpr uint32_t TILE_M = 64U; + static constexpr uint32_t TILE_N = 128U; AscendC::TPipe pipe; AscendC::TQue inQueueX; -- Gitee From 199d915983a779e9dc8c85a84f6b59ddf8d85a30 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Tue, 15 Jul 2025 19:11:07 +0800 Subject: [PATCH 16/19] fix read me --- .../12_cachemiss_preload_dcci/l2_cache_bypass/README.md | 7 +++---- .../4_best_practices/15_mata_address_conflict/README.md | 6 +++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/README.md b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/README.md index 3663e3666..f76d68d97 100644 --- a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/README.md +++ b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/README.md @@ -22,10 +22,9 @@ z = x + y 
本样例主要介绍数据搬运中设置合理CacheMode对搬运效率的影响,在Global Memory的数据访问中,如果数据只需要访问一次,后续不需要重复读取,那么这种场景下可以设置Global Memory的CacheMode为CACHE_MODE_DISABLED,在这种模式下数据访问将不经过L2 Cache,避免影响需要重复访问的数据,从而提升数据访问效率。 -本样例中共有2个实现版本: -add_custom_v1.h:基础版本,从列方向切分,每个核计算5120×128的数据量,共有40个核参与计算; -add_custom_v2.h:在add_custom_v1基础上,设置y/z的eCacheMod为CACHE_MODE_DISABLED,避免替换x的Cache数据影响搬运效率; - +本样例中共有2个实现版本: +add_custom_v1.h:基础版本,从列方向切分,每个核计算5120×128的数据量,共有40个核参与计算。 +add_custom_v2.h:在add_custom_v1基础上,设置y/z的CacheMode为CACHE_MODE_DISABLED,避免替换已进入Cache的x数据,影响搬运效率。 ## 算子规格描述 diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md b/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md index 738e314a3..bd20372ab 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md @@ -23,9 +23,9 @@ z = x + 2.0 本样例主要介绍数据搬运中的同地址冲突对搬运效率的影响,在Global Memory的数据访问中,数据访问请求(读/写)在AI 处理器内部会按照512 Bytes对齐进行地址转换,同一时刻如果多核的数据访问请求在转换后落在连续的512 Bytes范围内,出于数据一致性的要求,AI 处理器会对落入同一个512Bytes范围内的请求进行串行处理,导致搬运效率降低,即发生了同地址访问现象。 本样例中共有3个实现版本: -adds_custom_v1.h:基础实现版本,每个核的计算顺序一致,存在同地址冲突,带宽效率较差 -adds_custom_v2.h:通过调整每个核的计算顺序,避免发生同地址冲突 -adds_custom_v3.h:通过调整切分顺序,避免发生同地址冲突 +adds_custom_v1.h:基础实现版本,每个核的计算顺序一致,存在同地址冲突,带宽效率较差。 +adds_custom_v2.h:通过调整每个核的计算顺序,避免发生同地址冲突。 +adds_custom_v3.h:通过调整切分顺序,避免发生同地址冲突。 当前算子执行机制保证用户kernel入参(包括workspace/tiling)的地址是512 Bytes对齐的,因此用户只需要根据地址的偏移量即可判断两个地址是否会落入连续的512 Bytes范围内。 -- Gitee From e0299088684596d7c10f20f6cc0b7de4a05fedf6 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Tue, 15 Jul 2025 19:28:55 +0800 Subject: [PATCH 17/19] rename folder name --- .../AclNNInvocation/README.md | 0 .../AclNNInvocation/inc/common.h | 0 .../AclNNInvocation/inc/op_runner.h | 0 .../AclNNInvocation/inc/operator_desc.h | 0 .../AclNNInvocation/run.sh | 0 .../AclNNInvocation/scripts/acl.json | 0 .../AclNNInvocation/scripts/gen_data.py | 0 .../AclNNInvocation/scripts/verify_result.py | 0 
.../AclNNInvocation/src/CMakeLists.txt | 0 .../AclNNInvocation/src/common.cpp | 0 .../AclNNInvocation/src/main.cpp | 0 .../AclNNInvocation/src/op_runner.cpp | 0 .../AclNNInvocation/src/operator_desc.cpp | 0 .../l2_cache_bypass => 12_l2_cache_bypass}/AddCustom.json | 0 .../AddCustom/op_host/add_custom.cpp | 0 .../AddCustom/op_kernel/add_custom.cpp | 0 .../AddCustom/op_kernel/add_custom_tiling.h | 0 .../AddCustom/op_kernel/add_custom_v1.h | 0 .../AddCustom/op_kernel/add_custom_v2.h | 0 .../l2_cache_bypass => 12_l2_cache_bypass}/README.md | 2 +- .../l2_cache_bypass => 12_l2_cache_bypass}/install.sh | 0 operator/ascendc/4_best_practices/README.md | 3 +++ 22 files changed, 4 insertions(+), 1 deletion(-) rename operator/ascendc/4_best_practices/{12_cachemiss_preload_dcci/l2_cache_bypass => 12_l2_cache_bypass}/AclNNInvocation/README.md (100%) rename operator/ascendc/4_best_practices/{12_cachemiss_preload_dcci/l2_cache_bypass => 12_l2_cache_bypass}/AclNNInvocation/inc/common.h (100%) rename operator/ascendc/4_best_practices/{12_cachemiss_preload_dcci/l2_cache_bypass => 12_l2_cache_bypass}/AclNNInvocation/inc/op_runner.h (100%) rename operator/ascendc/4_best_practices/{12_cachemiss_preload_dcci/l2_cache_bypass => 12_l2_cache_bypass}/AclNNInvocation/inc/operator_desc.h (100%) rename operator/ascendc/4_best_practices/{12_cachemiss_preload_dcci/l2_cache_bypass => 12_l2_cache_bypass}/AclNNInvocation/run.sh (100%) rename operator/ascendc/4_best_practices/{12_cachemiss_preload_dcci/l2_cache_bypass => 12_l2_cache_bypass}/AclNNInvocation/scripts/acl.json (100%) rename operator/ascendc/4_best_practices/{12_cachemiss_preload_dcci/l2_cache_bypass => 12_l2_cache_bypass}/AclNNInvocation/scripts/gen_data.py (100%) rename operator/ascendc/4_best_practices/{12_cachemiss_preload_dcci/l2_cache_bypass => 12_l2_cache_bypass}/AclNNInvocation/scripts/verify_result.py (100%) rename operator/ascendc/4_best_practices/{12_cachemiss_preload_dcci/l2_cache_bypass => 
12_l2_cache_bypass}/AclNNInvocation/src/CMakeLists.txt (100%) rename operator/ascendc/4_best_practices/{12_cachemiss_preload_dcci/l2_cache_bypass => 12_l2_cache_bypass}/AclNNInvocation/src/common.cpp (100%) rename operator/ascendc/4_best_practices/{12_cachemiss_preload_dcci/l2_cache_bypass => 12_l2_cache_bypass}/AclNNInvocation/src/main.cpp (100%) rename operator/ascendc/4_best_practices/{12_cachemiss_preload_dcci/l2_cache_bypass => 12_l2_cache_bypass}/AclNNInvocation/src/op_runner.cpp (100%) rename operator/ascendc/4_best_practices/{12_cachemiss_preload_dcci/l2_cache_bypass => 12_l2_cache_bypass}/AclNNInvocation/src/operator_desc.cpp (100%) rename operator/ascendc/4_best_practices/{12_cachemiss_preload_dcci/l2_cache_bypass => 12_l2_cache_bypass}/AddCustom.json (100%) rename operator/ascendc/4_best_practices/{12_cachemiss_preload_dcci/l2_cache_bypass => 12_l2_cache_bypass}/AddCustom/op_host/add_custom.cpp (100%) rename operator/ascendc/4_best_practices/{12_cachemiss_preload_dcci/l2_cache_bypass => 12_l2_cache_bypass}/AddCustom/op_kernel/add_custom.cpp (100%) rename operator/ascendc/4_best_practices/{12_cachemiss_preload_dcci/l2_cache_bypass => 12_l2_cache_bypass}/AddCustom/op_kernel/add_custom_tiling.h (100%) rename operator/ascendc/4_best_practices/{12_cachemiss_preload_dcci/l2_cache_bypass => 12_l2_cache_bypass}/AddCustom/op_kernel/add_custom_v1.h (100%) rename operator/ascendc/4_best_practices/{12_cachemiss_preload_dcci/l2_cache_bypass => 12_l2_cache_bypass}/AddCustom/op_kernel/add_custom_v2.h (100%) rename operator/ascendc/4_best_practices/{12_cachemiss_preload_dcci/l2_cache_bypass => 12_l2_cache_bypass}/README.md (99%) rename operator/ascendc/4_best_practices/{12_cachemiss_preload_dcci/l2_cache_bypass => 12_l2_cache_bypass}/install.sh (100%) diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/README.md b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/README.md similarity index 100% 
rename from operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/README.md rename to operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/README.md diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/inc/common.h b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/inc/common.h similarity index 100% rename from operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/inc/common.h rename to operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/inc/common.h diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/inc/op_runner.h b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/inc/op_runner.h similarity index 100% rename from operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/inc/op_runner.h rename to operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/inc/op_runner.h diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/inc/operator_desc.h b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/inc/operator_desc.h similarity index 100% rename from operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/inc/operator_desc.h rename to operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/inc/operator_desc.h diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/run.sh b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/run.sh similarity index 100% rename from operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/run.sh rename to operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/run.sh diff --git 
a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/scripts/acl.json b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/scripts/acl.json similarity index 100% rename from operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/scripts/acl.json rename to operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/scripts/acl.json diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/scripts/gen_data.py b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/scripts/gen_data.py similarity index 100% rename from operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/scripts/gen_data.py rename to operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/scripts/gen_data.py diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/scripts/verify_result.py b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/scripts/verify_result.py similarity index 100% rename from operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/scripts/verify_result.py rename to operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/scripts/verify_result.py diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/src/CMakeLists.txt b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/CMakeLists.txt similarity index 100% rename from operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/src/CMakeLists.txt rename to operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/CMakeLists.txt diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/src/common.cpp 
b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/common.cpp similarity index 100% rename from operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/src/common.cpp rename to operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/common.cpp diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/src/main.cpp b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/main.cpp similarity index 100% rename from operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/src/main.cpp rename to operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/main.cpp diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/src/op_runner.cpp b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/op_runner.cpp similarity index 100% rename from operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/src/op_runner.cpp rename to operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/op_runner.cpp diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/src/operator_desc.cpp b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/operator_desc.cpp similarity index 100% rename from operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation/src/operator_desc.cpp rename to operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/operator_desc.cpp diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom.json b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom.json similarity index 100% rename from operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom.json rename to 
operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom.json diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_host/add_custom.cpp b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_host/add_custom.cpp similarity index 100% rename from operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_host/add_custom.cpp rename to operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_host/add_custom.cpp diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom.cpp b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom.cpp similarity index 100% rename from operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom.cpp rename to operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom.cpp diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_tiling.h b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom_tiling.h similarity index 100% rename from operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_tiling.h rename to operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom_tiling.h diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h similarity index 100% rename from operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h rename to operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h diff --git 
a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h similarity index 100% rename from operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h rename to operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/README.md b/operator/ascendc/4_best_practices/12_l2_cache_bypass/README.md similarity index 99% rename from operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/README.md rename to operator/ascendc/4_best_practices/12_l2_cache_bypass/README.md index f76d68d97..f40962836 100644 --- a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/README.md +++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/README.md @@ -22,7 +22,7 @@ z = x + y 本样例主要介绍数据搬运中设置合理CacheMode对搬运效率的影响,在Global Memory的数据访问中,如果数据只需要访问一次,后续不需要重复读取,那么这种场景下可以设置Global Memory的CacheMode为CACHE_MODE_DISABLED,在这种模式下数据访问将不经过L2 Cache,避免影响需要重复访问的数据,从而提升数据访问效率。 -本样例中共有2个实现版本: +本样例中共有2个实现版本: add_custom_v1.h:基础版本,从列方向切分,每个核计算5120×128的数据量,共有40个核参与计算。 add_custom_v2.h:在add_custom_v1基础上,设置y/z的CacheMode为CACHE_MODE_DISABLED,避免替换已进入Cache的x数据,影响搬运效率。 diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/install.sh b/operator/ascendc/4_best_practices/12_l2_cache_bypass/install.sh similarity index 100% rename from operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/install.sh rename to operator/ascendc/4_best_practices/12_l2_cache_bypass/install.sh diff --git a/operator/ascendc/4_best_practices/README.md b/operator/ascendc/4_best_practices/README.md index c40fe61a7..a7dbe1f9e 100644 --- a/operator/ascendc/4_best_practices/README.md +++ b/operator/ascendc/4_best_practices/README.md @@ -8,6 
+8,7 @@ | ------------------------------- | ------------------------------------------ | ------------------------------------------ | | [4_bank_conflict](./4_bank_conflict) | 基于Ascend C的bank冲突性能优化样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 | | [6_group_matmul](./6_group_matmul) | 基于Ascend C的group matmul算子性能优化样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 | +| [12_l2_cache_bypass](./12_l2_cache_bypass) | 基于Ascend C的L2 CacheMode算子性能优化样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 | | [15_mata_address_conflict](./15_mata_address_conflict) | 基于Ascend C的同地址冲突性能优化样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 | | [21_all_gather_matmul_custom](./21_all_gather_matmul_custom) | 基于Ascend C的AllGatherMatmul算子性能调优样例 | Atlas A2训练系列产品 | | [22_matmul_reduce_scatter_custom](./22_matmul_reduce_scatter_custom) | 基于Ascend C的MatmulReduceScatter算子性能调优样例 | Atlas A2训练系列产品 | @@ -45,6 +46,8 @@ ## 更新说明 | 时间 | 更新事项 | | ---------- | -------------------------------------------- | +| 2025/07/14 | 新增l2_cache_bypass样例 | +| 2025/07/03 | 新增15_mata_address_conflict样例 | | 2025/07/01 | 新增4_bank_conflict样例 | | 2024/12/19 | 新增23_matmul_all_reduce_custom样例 | | 2024/12/19 | 新增22_matmul_reduce_scatter_custom样例 | -- Gitee From d8ee3292c92edb4fffdabcf205bb9b9d48c4296f Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Tue, 15 Jul 2025 19:33:33 +0800 Subject: [PATCH 18/19] fix case name --- operator/ascendc/4_best_practices/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operator/ascendc/4_best_practices/README.md b/operator/ascendc/4_best_practices/README.md index a7dbe1f9e..926e4a6ef 100644 --- a/operator/ascendc/4_best_practices/README.md +++ b/operator/ascendc/4_best_practices/README.md @@ -46,7 +46,7 @@ ## 更新说明 | 时间 | 更新事项 | | ---------- | -------------------------------------------- | -| 2025/07/14 | 新增l2_cache_bypass样例 | +| 2025/07/14 | 新增12_l2_cache_bypass样例 | | 2025/07/03 | 新增15_mata_address_conflict样例 | | 2025/07/01 | 新增4_bank_conflict样例 | | 2024/12/19 | 新增23_matmul_all_reduce_custom样例 | -- Gitee From 
2438d102f075720c89b6173b7faf0e521dfcc4cd Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Tue, 15 Jul 2025 19:37:53 +0800 Subject: [PATCH 19/19] fix folder path --- .../12_l2_cache_bypass/AclNNInvocation/README.md | 2 +- operator/ascendc/4_best_practices/12_l2_cache_bypass/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/README.md b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/README.md index d9b357205..d3e63bedf 100644 --- a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/README.md +++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/README.md @@ -58,7 +58,7 @@ aclnnStatus aclnnAddCustom( 以命令行方式下载样例代码,master分支为例。 ```bash - cd ${git_clone_path}/samples/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/AclNNInvocation + cd ${git_clone_path}/samples/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation ``` - 样例执行 diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/README.md b/operator/ascendc/4_best_practices/12_l2_cache_bypass/README.md index f40962836..22f239d00 100644 --- a/operator/ascendc/4_best_practices/12_l2_cache_bypass/README.md +++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/README.md @@ -85,7 +85,7 @@ CANN软件包中提供了工程创建工具msOpGen,AddCustom算子工程可通 ```bash # 若开发者以git命令行方式clone了master分支代码,并切换目录 - cd ${git_clone_path}/samples/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/l2_cache_bypass/ + cd ${git_clone_path}/samples/operator/ascendc/4_best_practices/12_l2_cache_bypass/ ``` - 调用脚本,生成自定义算子工程,复制host和kernel实现并编译算子 -- Gitee