diff --git a/atvc/examples/addcmul/README.md b/atvc/examples/addcmul/README.md deleted file mode 100644 index 43e9b2b656d028e33247164bc289ed004b8dbd4e..0000000000000000000000000000000000000000 --- a/atvc/examples/addcmul/README.md +++ /dev/null @@ -1,48 +0,0 @@ - - -## 概述 - -本样例介绍了利用ATVC实现Addcmul单算子并完成功能验证 - - -## 样例支持产品型号: -- Atlas A2训练系列产品 - -## 目录结构 - -| 文件名 | 描述 | -| ------------------------------------------------------------ | ------------------------------------------------------------ | -| [addcmul.cpp](./addcmul.cpp) | Addcmul算子代码实现以及调用样例 | - -## 算子描述 - -Add算子数学计算公式:$output_i = input_i + value * tensor1_i * tensor2_i$ - -Add算子规格: - - - - - - - - - - - - - - - -
算子类型(OpType)Add
算子输入
nameshapedata typeformat
input8 * 2048floatND
tensor11 * 2048floatND
tensor21 * 2048floatND
算子输出
output8 * 2048floatND
核函数名AddcmulCustom
- -## 算子运行 -在ascendc-api-adv代码仓目录下执行: -```bash -$ cd ./atvc/tests/ -$ bash run_test.sh addcmul -... -Generate golden data successfully. -... -Accuracy verification passed. -``` \ No newline at end of file diff --git a/atvc/examples/addcmul/addcmul.cpp b/atvc/examples/addcmul/addcmul.cpp deleted file mode 100644 index 2ccd77a0d121ea7511b9d2417bde985d59a949da..0000000000000000000000000000000000000000 --- a/atvc/examples/addcmul/addcmul.cpp +++ /dev/null @@ -1,166 +0,0 @@ -/** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. - * - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ - -#include -#include -#include -#include -#include -#include -#include "acl/acl.h" -#include "broadcast/broadcast_host.h" -#include "addcmul.h" - -#define CHECK_ACL(x) \ - do { \ - aclError __ret = x; \ - if (__ret != ACL_ERROR_NONE) { \ - std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ - } \ - } while (0) - -namespace { -static constexpr float REL_TOL = 1e-3f; -static constexpr float ABS_TOL = 1e-5f; - -// 判断两个浮点数是否足够接近 -bool IsClose(float a, float b) { - const float eps = 1e-40f; // 防止分母为零 - float diff = std::abs(a - b); - return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); -} - -// AddWithBroadcast算子的描述:两个输入,一个输出,类型均为float -using BroadcastOpTraits = ATVC::OpTraits, ATVC::OpOutputs, ATVC::OpTemps>; - -// 负责Broadcast类算子的调度,选择对应的Policy最佳策略并执行Kernel函数 -template -void BroadcastOpAdapter(uint8_t* tensor1, uint8_t* tensor2, float value, uint8_t* input, uint8_t* output, ATVC::BroadcastParam ¶m, ATVC::BroadcastPolicy &policy, aclrtStream& stream) -{ - // 申请临时空间workspace,并将其与BroadcastTilingData一同传到Device侧 - uint8_t *workspaceDevice; - CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, param.workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); - param.workspaceAddr = reinterpret_cast(workspaceDevice); - // 将tiling api计算出的BroadcastPolicy转化为编译态参数并实例化相应的核函数 - if (policy == ATVC::BROADCAST_POLICY0) { - AddcmulCustom<<>>(tensor1, tensor2, input, output, param, value); - }else if (policy == ATVC::BROADCAST_POLICY1) { - AddcmulCustom<<>>(tensor1, tensor2, input, output, param, value); - } else { - printf("[ERROR] Cannot find any matched policy.\n"); - } - // 流同步后释放申请的param内存 - CHECK_ACL(aclrtSynchronizeStream(stream)); - CHECK_ACL(aclrtFree(workspaceDevice)); -} - -bool VerifyResults(const std::vector &golden, const std::vector &output) -{ - for (int32_t i = 0; i < golden.size(); i++) { - if (!IsClose(golden[i], output[i])) { - printf("Accuracy verification failed! The expected value of element " - "in index [%d] is %f, but actual value is %f.\n", - i, - golden[i], - output[i]); - return false; - } - } - return true; -} -} - -int32_t main(int32_t argc, char* argv[]) -{ - int32_t eleNum = 1 * 8; - int32_t outEleNum = 8 * 8; - std::vector shapeIn{1, 8}; // 测试输入shape - std::vector shapeOut{8, 8}; // 测试输入shape - - size_t inputByteSize = static_cast(eleNum) * sizeof(float); - size_t outputByteSize = static_cast(outEleNum) * sizeof(float); - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution disX(1.0f, 9.0f); - std::uniform_real_distribution disY(1.0f, 9.0f); - - std::vector tensor1(eleNum); - std::vector tensor2(eleNum); - float value = 4; - std::vector input(outEleNum); - std::vector golden(outEleNum); - for (int i = 0; i < eleNum; ++i) { - tensor1[i] = (disX(gen)); - tensor2[i] = (disX(gen)); - } - for (int i = 0; i < outEleNum; ++i) { - input[i] = (disY(gen)); - } - for (int i = 0; i < outEleNum; ++i) { - golden[i] = input[i] + (tensor1[i % eleNum] * tensor2[i % eleNum] * value); - } - printf("Generate golden data successfully.\n"); - // 初始化Acl资源 - CHECK_ACL(aclInit(nullptr)); - aclrtContext context; - int32_t deviceId = 0; - CHECK_ACL(aclrtSetDevice(deviceId)); - CHECK_ACL(aclrtCreateContext(&context, deviceId)); - aclrtStream stream = nullptr; - CHECK_ACL(aclrtCreateStream(&stream)); - uint8_t *outputHost; - uint8_t *tensor1Device; - uint8_t *tensor2Device; - uint8_t *inputDevice; - uint8_t *outputDevice; - - CHECK_ACL(aclrtMallocHost((void **)(&outputHost), outputByteSize)); - CHECK_ACL(aclrtMalloc((void **)&tensor1Device, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMalloc((void **)&tensor2Device, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMalloc((void **)&inputDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMalloc((void **)&outputDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); - - CHECK_ACL(aclrtMemcpy(tensor1Device, inputByteSize, tensor1.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); - CHECK_ACL(aclrtMemcpy(tensor2Device, inputByteSize, tensor2.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); - CHECK_ACL(aclrtMemcpy(inputDevice, outputByteSize, input.data(), outputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); - - ATVC::BroadcastParam param; // Broadcast运行态参数,包含TilingData以及临时空间的相关信息 - ATVC::BroadcastPolicy policy = {-1, -1, -1}; // Broadcast运行态参数,负责映射最适合的Broadcast模板实现 - // Host侧调用Tiling API完成相关运行态参数的运算 - param.nBufferNum = 1; - if (!ATVC::Host::CalcBroadcastTiling(shapeIn, shapeOut, &policy, ¶m)) { - printf("Broadcast tiling error.\n"); - return -1; - }; - // 调用Adapter调度接口,完成核函数的模板调用 - BroadcastOpAdapter(tensor1Device, tensor2Device, value, inputDevice, outputDevice, param, policy, stream); - - CHECK_ACL(aclrtMemcpy(outputHost, outputByteSize, outputDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST)); - std::vector output(reinterpret_cast(outputHost), reinterpret_cast(outputHost) + outEleNum); - - // 释放Acl资源 - CHECK_ACL(aclrtFree(tensor1Device)); - CHECK_ACL(aclrtFree(tensor2Device)); - CHECK_ACL(aclrtFree(inputDevice)); - CHECK_ACL(aclrtFree(outputDevice)); - CHECK_ACL(aclrtFreeHost(outputHost)); - - CHECK_ACL(aclrtDestroyStream(stream)); - CHECK_ACL(aclrtDestroyContext(context)); - CHECK_ACL(aclrtResetDevice(deviceId)); - CHECK_ACL(aclFinalize()); - - if (!VerifyResults(golden, output)) { - return -1; - } - printf("Accuracy verification passed.\n"); - return 0; -} diff --git a/atvc/examples/addcmul/addcmul.h b/atvc/examples/addcmul/addcmul.h deleted file mode 100644 index a63479d4c21e4605304072c4a0b2c98f544d095d..0000000000000000000000000000000000000000 --- a/atvc/examples/addcmul/addcmul.h +++ /dev/null @@ -1,63 +0,0 @@ -/** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. - * - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ -#ifndef ATVC_ADDCMUL_H -#define ATVC_ADDCMUL_H -#include "pre_compute_mul_of_broadcast.h" -#include "post_compute_add_of_broadcast.h" - -/* ! - * \brief Addcmul(i) = input(i) + value * tensor1(i) * tensor2(i) - * \param [in] tensor1, input global memory of tensor1 - * \param [in] tensor2, input global memory of tensor2 - * \param [in] input, input global memory of input - * \param [out] output, output global memory - * \param [in] broadcastParam, params of broadcast - * \param [in] value, input value - */ -template::Type> -__global__ __aicore__ void AddcmulCustom(GM_ADDR tensor1, - GM_ADDR tensor2, - GM_ADDR input, - GM_ADDR output, - ATVC::BroadcastParam broadcastParam, - DataType value) -{ - KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); - - // 1. get input and output for kernel op from host Traits - using KernelOpIn = typename Traits::In::types; - using KernelOpOut = typename Traits::Out::types; - using KernelOpTemp = typename Traits::Temp::types; - - // 2. define input and output for pre compute - using PreComputeInput = ATVC::OpInputs::Type, typename ATVC::TypeListGet::Type>; - using PreComputeOutput = ATVC::OpOutputs::Type>; - using PreComputeTemp = ATVC::OpOutputs::Type>; - using PreComputeOpTraits = ATVC::OpTraits; - using PreCompute = PreComputeMulOfBroadcast; - - // 3. define input and output for broadcast - using BroadcastOpInput = ATVC::OpInputs::Type>; - using BroadcastOpOutput = ATVC::OpOutputs::Type>; - using BroadcastOpTraits = ATVC::OpTraits; - - // 4. define input and output for post compute - using PostComputeInput = ATVC::OpInputs::Type, typename ATVC::TypeListGet::Type>; - using PostComputeOutput = ATVC::OpOutputs::Type>; - using PostComputeOpTraits = ATVC::OpTraits; - using PostCompute = PostComputeAddOfBroadcast; - - // 5. call op run - auto op = ATVC::Kernel::BroadcastOpTemplate, Policy, PreCompute, PostCompute>(); - ATVC::BroadcastParam *param = &broadcastParam; - op.Run(tensor1, tensor2, input, output, param, value); -} -#endif diff --git a/atvc/examples/addcmul/post_compute_add_of_broadcast.h b/atvc/examples/addcmul/post_compute_add_of_broadcast.h deleted file mode 100644 index 9059186cb36583d1208382e5e3085fa6c8ddc2bc..0000000000000000000000000000000000000000 --- a/atvc/examples/addcmul/post_compute_add_of_broadcast.h +++ /dev/null @@ -1 +0,0 @@ -../add_with_broadcast/post_compute_add_of_broadcast.h \ No newline at end of file diff --git a/atvc/examples/addcmul/pre_compute_mul_of_broadcast.h b/atvc/examples/addcmul/pre_compute_mul_of_broadcast.h deleted file mode 100644 index 6df76098319d82cffc55eb6d601bca9d93a73e52..0000000000000000000000000000000000000000 --- a/atvc/examples/addcmul/pre_compute_mul_of_broadcast.h +++ /dev/null @@ -1,73 +0,0 @@ -/** - * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. - * - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ -#ifndef ATVC_PRE_COMPUTE_MUL_OF_BROADCAST_H -#define ATVC_PRE_COMPUTE_MUL_OF_BROADCAST_H - -#include "broadcast/broadcast_device.h" - -template -struct PreComputeMulOfBroadcast { - using Inputs = typename Traits::In::types; - using Outputs = typename Traits::Out::types; - using DataType = typename ATVC::TypeListGet::Type; - static constexpr size_t INPUT_SIZE = ATVC::TypeListSize::VALUE; - - template - __aicore__ inline void SetParam(DataType value) - { - value_ = value; - } - - template - __aicore__ inline void SetArgs(Args... args) - { - InitArgsInput<0>(args...); - } - - template - __aicore__ inline void operator()(AscendC::LocalTensor tensor1, AscendC::LocalTensor tensor2, AscendC::LocalTensor temp1, AscendC::LocalTensor temp2, - uint32_t copyInOffset, AscendC::DataCopyExtParams ©InParams) - { - size_t size = copyInParams.blockCount * (copyInParams.blockLen + copyInParams.srcStride * 32) / sizeof(DataType); - ATVC::SyncDataQueue(); - - CopyIn(tensor1, tensor2, copyInOffset, copyInParams); - - AscendC::PipeBarrier(); // wait broadcast finished - ATVC::SyncDataQueue(); - - AscendC::Mul(temp1, tensor1, tensor2, size); - AscendC::Muls(temp2, temp1, value_, size); - } - -private: - template - __aicore__ inline void InitArgsInput(GM_ADDR x, Args... args) - { - input_[start].SetGlobalBuffer((__gm__ DataType*)x); - if constexpr (start + 1 < INPUT_SIZE) { - InitArgsInput(args...); - } - } - - template - __aicore__ inline void CopyIn(AscendC::LocalTensor tensor1, AscendC::LocalTensor tensor2, uint32_t copyInOffset, AscendC::DataCopyExtParams ©Inarams) - { - AscendC::DataCopyPadExtParams padParams{false, 0, 0, 0}; - AscendC::DataCopyPad(tensor1, input_[0][copyInOffset], copyInarams, padParams); - AscendC::DataCopyPad(tensor2, input_[1][copyInOffset], copyInarams, padParams); - } - - AscendC::GlobalTensor input_[INPUT_SIZE]; - DataType value_; -}; - -#endif