From 457fae79ccf32399bed697ec55cccff3fb6f2c6e Mon Sep 17 00:00:00 2001
From: m00848753
Date: Fri, 8 Aug 2025 11:51:45 +0800
Subject: [PATCH] testL2Cache

---
 .../add_custom.cpp                        | 121 ++++++++++++++----
 .../AddKernelInvocationTilingNeo/main.cpp |   6 +-
 2 files changed, 101 insertions(+), 26 deletions(-)

diff --git a/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationTilingNeo/add_custom.cpp b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationTilingNeo/add_custom.cpp
index 1b992dd86..8655c2aff 100644
--- a/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationTilingNeo/add_custom.cpp
+++ b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationTilingNeo/add_custom.cpp
@@ -10,27 +10,83 @@
 #include "add_custom_tiling.h"
 #include "kernel_operator.h"
 
-constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue
+constexpr int32_t TOTAL_LENGTH = 8 * 2048;
+constexpr int32_t USE_CORE_NUM = 16;
+constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM;
+constexpr int32_t TILE_NUM = 16;
+constexpr int32_t BUFFER_NUM = 2;
+constexpr int32_t TILE_LENGTH = 32 * 1024;          // bytes per transfer; split into 2 parts for double buffering
+constexpr int32_t MTE_LEN = TILE_LENGTH / 32;       // MTE burst length, in 32-byte units
+constexpr float DATATimes = 1.5f;                   // data volume as a multiple of the L2 size (float, so 1.5 is not truncated to 1)
+constexpr int32_t Core_Num = 48;                    // core count
+constexpr int32_t L2Size = 192 * 1024 * 1024;       // 192 MiB L2 cache
+constexpr int32_t ReadDataSize = 128 * 1024 * 1024; // 128 MiB measured region
+constexpr int32_t CORE_LENGTH = (int32_t)(L2Size * DATATimes) / Core_Num;
+
+// Experiments 3/4: adjusting DATATimes is sufficient.
+// Experiments 5/6: add a similar for loop in the warm-up phase.
 
 class KernelAdd {
 public:
     __aicore__ inline KernelAdd() {}
-    __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength, uint32_t tileNum)
+    __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z)
     {
-        this->blockLength = totalLength / AscendC::GetBlockNum();
-        this->tileNum = tileNum;
-        this->tileLength = this->blockLength / tileNum / BUFFER_NUM;
-        xGm.SetGlobalBuffer((__gm__ half *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
-        yGm.SetGlobalBuffer((__gm__ half *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
-        zGm.SetGlobalBuffer((__gm__ half *)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
-        pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(half));
-        pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(half));
-        pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(half));
+        __ubuf__ float *xLocal = (__ubuf__ float *)get_imm(0); // raw UB pointer at offset 0
+
+        /* Warm-up (cache priming) begins. */
+        // Each core moves TILE_LENGTH bytes per transfer, and the total volume
+        // to move is 192 * 1024 * 1024 * DATATimes, so the transfer count is:
+        int64_t loop_num = CORE_LENGTH / TILE_LENGTH;
+        for (int32_t i = 0; i < loop_num; i++) {
+            copy_gm_to_ubuf((__ubuf__ int8_t *)xLocal,
+                            (__gm__ int8_t *)x + (int64_t)(AscendC::GetBlockIdx() * CORE_LENGTH) + (int64_t)(i * TILE_LENGTH),
+                            0, 1, (uint16_t)MTE_LEN, 0, 0);
+        }
+        /* Experiments 5 and 6 test the cache replacement policy. */
+        // Read the first 128 MB four times:
+        // int64_t loop_num1 = 128 * 1024 * 1024 / TILE_LENGTH / Core_Num;
+        // int64_t ReadTotalNum = 4;
+        // for (int64_t ReadTimes = 0; ReadTimes < ReadTotalNum; ReadTimes++) {
+        //     for (int32_t i = 0; i < loop_num; i++) {
+        //         copy_gm_to_ubuf((__ubuf__ int8_t *)xLocal,
+        //                         (__gm__ int8_t *)x + (int64_t)(AscendC::GetBlockIdx() * CORE_LENGTH) + (int64_t)(i * TILE_LENGTH),
+        //                         0, 1, (uint16_t)MTE_LEN, 0, 0);
+        //     }
+        // }
+        // Read the 128 MB ~ 128 MB * 4 range once:
+        // for (int32_t i = 0; i < loop_num; i++) {
+        //     copy_gm_to_ubuf((__ubuf__ int8_t *)xLocal,
+        //                     (__gm__ int8_t *)x + (int64_t)(AscendC::GetBlockIdx() * CORE_LENGTH) + (int64_t)(i * TILE_LENGTH),
+        //                     0, 1, (uint16_t)MTE_LEN, 0, 0);
+        // }
+
+        // Prepare the measurement parameters.
+        int64_t loop_num_real = ReadDataSize / TILE_LENGTH / Core_Num;
+        pipe_barrier(PIPE_ALL);
+        int64_t systemCycleBefore = AscendC::GetSystemCycle();
+
+        // Start the measurement.
+        // Loop 1: the leading part of the data.
+        for (int32_t i = 0; i < loop_num_real; i++) {
+            copy_gm_to_ubuf((__ubuf__ int8_t *)xLocal,
+                            (__gm__ int8_t *)x + (int64_t)(AscendC::GetBlockIdx() * CORE_LENGTH) + (int64_t)(i * TILE_LENGTH),
+                            0, 1, (uint16_t)MTE_LEN, 0, 0);
+        }
+        // Loop 2: the trailing part of the data.
+        // for (int32_t i = loop_num - loop_num_real; i < loop_num; i++) {
+        //     copy_gm_to_ubuf((__ubuf__ int8_t *)xLocal,
+        //                     (__gm__ int8_t *)x + (int64_t)(AscendC::GetBlockIdx() * CORE_LENGTH) + (int64_t)(i * TILE_LENGTH),
+        //                     0, 1, (uint16_t)MTE_LEN, 0, 0);
+        // }
+        pipe_barrier(PIPE_ALL);
+
+        // Collect the statistics. GetSystemCycle() ticks at 50 MHz, so one
+        // cycle is 20 ns and the elapsed time below is in nanoseconds.
+        int64_t systemCycleAfter = AscendC::GetSystemCycle();
+        int64_t GetBlockNumCycle = systemCycleAfter - systemCycleBefore;
+        int64_t GetBlockNumTime = GetBlockNumCycle * 20;
+        if (AscendC::GetBlockIdx() < 1) {
+            AscendC::PRINTF("block idx %ld\n", AscendC::GetBlockIdx());
+            AscendC::PRINTF("tile length %d\n", TILE_LENGTH);
+            AscendC::PRINTF("kernel time(ns) %ld\n", GetBlockNumTime);
+            AscendC::PRINTF("bandwidth(GB/s) %f\n", float(TILE_LENGTH * loop_num_real * Core_Num) / GetBlockNumTime);
+        }
     }
     __aicore__ inline void Process()
     {
-        int32_t loopCount = this->tileNum * BUFFER_NUM;
-        for (int32_t i = 0; i < loopCount; i++) {
+        int32_t loopCount = TILE_NUM * BUFFER_NUM; // TILE_NUM is a file-scope constant, not a member
+        for (int32_t i = 0; i < 1; i++) {          // a single iteration is enough for this test
             CopyIn(i);
             Compute(i);
             CopyOut(i);
@@ -42,8 +98,8 @@ private:
     {
         AscendC::LocalTensor<half> xLocal = inQueueX.AllocTensor<half>();
         AscendC::LocalTensor<half> yLocal = inQueueY.AllocTensor<half>();
-        AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength);
-        AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], this->tileLength);
+        AscendC::DataCopy(xLocal, xGm[progress * TILE_LENGTH], TILE_LENGTH);
+        AscendC::DataCopy(yLocal, yGm[progress * TILE_LENGTH], TILE_LENGTH);
         inQueueX.EnQue(xLocal);
         inQueueY.EnQue(yLocal);
     }
@@ -52,7 +108,21 @@ private:
         AscendC::LocalTensor<half> xLocal = inQueueX.DeQue<half>();
         AscendC::LocalTensor<half> yLocal = inQueueY.DeQue<half>();
         AscendC::LocalTensor<half> zLocal = outQueueZ.AllocTensor<half>();
-        AscendC::Add(zLocal, xLocal, yLocal, this->tileLength);
+        pipe_barrier(PIPE_ALL);
+        int64_t systemCycleBefore = AscendC::GetSystemCycle();
+        for (int32_t i = 0; i < 256; i++) { // empty timing loop; uncomment one of the bodies below to measure it
+            // AscendC::Add(zLocal, xLocal, yLocal, TILE_LENGTH);
+            // for (int32_t i = 0; i < loop_num_real; i++) {
+            //     copy_gm_to_ubuf((__ubuf__ int8_t *)xLocal,
+            //                     (__gm__ int8_t *)x + (int64_t)(AscendC::GetBlockIdx() * CORE_LENGTH) + (int64_t)(i * TILE_LENGTH),
+            //                     0, 1, (uint16_t)MTE_LEN, 0, 0);
+            // }
+        }
+        pipe_barrier(PIPE_ALL);
+        int64_t systemCycleAfter = AscendC::GetSystemCycle();
+        int64_t GetBlockNumCycle = systemCycleAfter - systemCycleBefore;
+        int64_t GetBlockNumTime = GetBlockNumCycle * 20; // 50 MHz counter -> 20 ns per cycle
+        AscendC::PRINTF("tile length %d\n", TILE_LENGTH);
+        AscendC::PRINTF("kernel time(ns) %ld\n", GetBlockNumTime);
         outQueueZ.EnQue<half>(zLocal);
         inQueueX.FreeTensor(xLocal);
         inQueueY.FreeTensor(yLocal);
     }
@@ -60,7 +130,7 @@ private:
     __aicore__ inline void CopyOut(int32_t progress)
     {
         AscendC::LocalTensor<half> zLocal = outQueueZ.DeQue<half>();
-        AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength);
+        AscendC::DataCopy(zGm[progress * TILE_LENGTH], zLocal, TILE_LENGTH);
         outQueueZ.FreeTensor(zLocal);
     }
 
@@ -71,14 +141,19 @@ private:
     AscendC::GlobalTensor<half> xGm;
     AscendC::GlobalTensor<half> yGm;
     AscendC::GlobalTensor<half> zGm;
-    uint32_t blockLength;
-    uint32_t tileNum;
-    uint32_t tileLength;
 };
 
 extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData tiling)
 {
     KernelAdd op;
-    op.Init(x, y, z, tiling.totalLength, tiling.tileNum);
-    op.Process();
+#ifdef __DEV_C220_VEC__
+    op.Init(x, y, z);
+#endif
+    // op.Process();
+}
+
+#ifndef ASCENDC_CPU_DEBUG
+void add_custom_do(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z)
+{
+    AddCustomTilingData tiling{}; // tiling is unused by Init but still required by the kernel signature
+    add_custom<<<blockDim, nullptr, stream>>>(x, y, z, tiling);
 }
+#endif
diff --git a/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationTilingNeo/main.cpp b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationTilingNeo/main.cpp
index 527b07f74..b29ed3a95 100644
--- a/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationTilingNeo/main.cpp
+++ b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationTilingNeo/main.cpp
@@ -19,10 +19,10 @@ extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z
 
 int32_t main(int32_t argc, char *argv[])
 {
-    uint32_t blockDim = 8;
+    uint32_t blockDim = 48;
     size_t tilingSize = 2 * sizeof(uint32_t);
-    size_t inputByteSize = 8 * 2048 * sizeof(uint16_t);
-    size_t outputByteSize = 8 * 2048 * sizeof(uint16_t);
+    size_t inputByteSize = (size_t)300 * 1024 * 1024 * 5;  // 1500 MiB per buffer, well beyond the 192 MiB L2
+    size_t outputByteSize = (size_t)300 * 1024 * 1024 * 5;
 
 #ifdef ASCENDC_CPU_DEBUG
     uint8_t *tiling = (uint8_t *)AscendC::GmAlloc(tilingSize);
--
Gitee
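
Note on the timing arithmetic: the kernel multiplies the cycle delta from
AscendC::GetSystemCycle() by 20, i.e. it assumes a 50 MHz system cycle counter
(20 ns per tick), and dividing bytes moved by elapsed nanoseconds yields GB/s
directly. The host-side sketch below reproduces that conversion in plain C++
so the printed numbers can be sanity-checked; the constants mirror those in
add_custom.cpp, while elapsedCycles is a made-up placeholder for the value the
kernel prints, not real device output.

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        // Constants mirrored from add_custom.cpp.
        const int64_t tileLength  = 32 * 1024;                       // bytes per transfer
        const int64_t coreNum     = 48;                              // cores doing transfers
        const int64_t readSize    = 128LL * 1024 * 1024;             // measured region in bytes
        const int64_t loopNumReal = readSize / tileLength / coreNum; // transfers per core

        // Placeholder cycle count; substitute the value the kernel prints.
        const int64_t elapsedCycles = 100000;
        const int64_t nsPerCycle    = 20; // 50 MHz counter -> 20 ns per cycle
        const int64_t elapsedNs     = elapsedCycles * nsPerCycle;

        // bytes / ns == GB/s (decimal gigabytes per second).
        const double gbPerSec =
            static_cast<double>(tileLength * loopNumReal * coreNum) / static_cast<double>(elapsedNs);
        std::printf("bandwidth(GB/s) %f\n", gbPerSec);
        return 0;
    }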