From 457fae79ccf32399bed697ec55cccff3fb6f2c6e Mon Sep 17 00:00:00 2001
From: m00848753
Date: Fri, 8 Aug 2025 11:51:45 +0800
Subject: [PATCH] testL2Cache

---
 .../add_custom.cpp                        | 121 ++++++++++++++----
 .../AddKernelInvocationTilingNeo/main.cpp |   6 +-
 2 files changed, 101 insertions(+), 26 deletions(-)

diff --git a/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationTilingNeo/add_custom.cpp b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationTilingNeo/add_custom.cpp
index 1b992dd86..8655c2aff 100644
--- a/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationTilingNeo/add_custom.cpp
+++ b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationTilingNeo/add_custom.cpp
@@ -10,27 +10,83 @@
 #include "add_custom_tiling.h"
 #include "kernel_operator.h"
 
-constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue
+constexpr int32_t TOTAL_LENGTH = 8 * 2048;
+constexpr int32_t USE_CORE_NUM = 16;
+constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM;
+constexpr int32_t TILE_NUM = 16;
+constexpr int32_t BUFFER_NUM = 2;
+constexpr int32_t TILE_LENGTH = 32 * 1024;          // bytes per transfer; split into 2 parts for double buffering
+constexpr int32_t MTE_LEN = TILE_LENGTH / 32;       // MTE burst length, in 32-byte units
+constexpr float DATATimes = 1.5f;                   // data volume as a multiple of the L2 size (float, so 1.5 is not truncated to 1)
+constexpr int32_t Core_Num = 48;                    // core count
+constexpr int32_t L2Size = 192 * 1024 * 1024;       // 192 MiB L2 cache
+constexpr int32_t ReadDataSize = 128 * 1024 * 1024; // 128 MiB measured region
+constexpr int32_t CORE_LENGTH = (int32_t)(L2Size * DATATimes) / Core_Num;
+
+// Experiments 3/4: adjusting DATATimes is sufficient.
+// Experiments 5/6: add a similar for loop in the warm-up phase.
 
 class KernelAdd {
 public:
     __aicore__ inline KernelAdd() {}
-    __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength, uint32_t tileNum)
+    __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z)
     {
-        this->blockLength = totalLength / AscendC::GetBlockNum();
-        this->tileNum = tileNum;
-        this->tileLength = this->blockLength / tileNum / BUFFER_NUM;
-        xGm.SetGlobalBuffer((__gm__ half *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
-        yGm.SetGlobalBuffer((__gm__ half *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
-        zGm.SetGlobalBuffer((__gm__ half *)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
-        pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(half));
-        pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(half));
-        pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(half));
+        __ubuf__ float *xLocal = (__ubuf__ float *)get_imm(0); // raw UB pointer at offset 0
+
+        /* Warm-up (cache priming) begins. */
+        // Each core moves TILE_LENGTH bytes per transfer, and the total volume
+        // to move is 192 * 1024 * 1024 * DATATimes, so the transfer count is:
+        int64_t loop_num = CORE_LENGTH / TILE_LENGTH;
+        for (int32_t i = 0; i < loop_num; i++) {
+            copy_gm_to_ubuf((__ubuf__ int8_t *)xLocal,
+                            (__gm__ int8_t *)x + (int64_t)(AscendC::GetBlockIdx() * CORE_LENGTH) + (int64_t)(i * TILE_LENGTH),
+                            0, 1, (uint16_t)MTE_LEN, 0, 0);
+        }
+        /* Experiments 5 and 6 test the cache replacement policy. */
+        // Read the first 128 MB four times:
+        // int64_t loop_num1 = 128 * 1024 * 1024 / TILE_LENGTH / Core_Num;
+        // int64_t ReadTotalNum = 4;
+        // for (int64_t ReadTimes = 0; ReadTimes < ReadTotalNum; ReadTimes++) {
+        //     for (int32_t i = 0; i < loop_num; i++) {
+        //         copy_gm_to_ubuf((__ubuf__ int8_t *)xLocal,
+        //                         (__gm__ int8_t *)x + (int64_t)(AscendC::GetBlockIdx() * CORE_LENGTH) + (int64_t)(i * TILE_LENGTH),
+        //                         0, 1, (uint16_t)MTE_LEN, 0, 0);
+        //     }
+        // }
+        // Read the 128 MB ~ 128 MB * 4 range once:
+        // for (int32_t i = 0; i < loop_num; i++) {
+        //     copy_gm_to_ubuf((__ubuf__ int8_t *)xLocal,
+        //                     (__gm__ int8_t *)x + (int64_t)(AscendC::GetBlockIdx() * CORE_LENGTH) + (int64_t)(i * TILE_LENGTH),
+        //                     0, 1, (uint16_t)MTE_LEN, 0, 0);
+        // }
+
+        // Prepare the measurement parameters.
+        int64_t loop_num_real = ReadDataSize / TILE_LENGTH / Core_Num;
+        pipe_barrier(PIPE_ALL);
+        int64_t systemCycleBefore = AscendC::GetSystemCycle();
+
+        // Start the measurement.
+        // Loop 1: the leading part of the data.
+        for (int32_t i = 0; i < loop_num_real; i++) {
+            copy_gm_to_ubuf((__ubuf__ int8_t *)xLocal,
+                            (__gm__ int8_t *)x + (int64_t)(AscendC::GetBlockIdx() * CORE_LENGTH) + (int64_t)(i * TILE_LENGTH),
+                            0, 1, (uint16_t)MTE_LEN, 0, 0);
+        }
+        // Loop 2: the trailing part of the data.
+        // for (int32_t i = loop_num - loop_num_real; i < loop_num; i++) {
+        //     copy_gm_to_ubuf((__ubuf__ int8_t *)xLocal,
+        //                     (__gm__ int8_t *)x + (int64_t)(AscendC::GetBlockIdx() * CORE_LENGTH) + (int64_t)(i * TILE_LENGTH),
+        //                     0, 1, (uint16_t)MTE_LEN, 0, 0);
+        // }
+        pipe_barrier(PIPE_ALL);
+
+        // Collect the statistics. GetSystemCycle() ticks at 50 MHz, so one
+        // cycle is 20 ns and the elapsed time below is in nanoseconds.
+        int64_t systemCycleAfter = AscendC::GetSystemCycle();
+        int64_t GetBlockNumCycle = systemCycleAfter - systemCycleBefore;
+        int64_t GetBlockNumTime = GetBlockNumCycle * 20;
+        if (AscendC::GetBlockIdx() < 1) {
+            AscendC::PRINTF("block idx %ld\n", AscendC::GetBlockIdx());
+            AscendC::PRINTF("tile length %d\n", TILE_LENGTH);
+            AscendC::PRINTF("kernel time(ns) %ld\n", GetBlockNumTime);
+            AscendC::PRINTF("bandwidth(GB/s) %f\n", float(TILE_LENGTH * loop_num_real * Core_Num) / GetBlockNumTime);
+        }
     }
     __aicore__ inline void Process()
     {
-        int32_t loopCount = this->tileNum * BUFFER_NUM;
-        for (int32_t i = 0; i < loopCount; i++) {
+        int32_t loopCount = TILE_NUM * BUFFER_NUM; // TILE_NUM is a file-scope constant, not a member
+        for (int32_t i = 0; i < 1; i++) {          // a single iteration is enough for this test
             CopyIn(i);
             Compute(i);
             CopyOut(i);
@@ -42,8 +98,8 @@ private:
     {
         AscendC::LocalTensor<half> xLocal = inQueueX.AllocTensor<half>();
         AscendC::LocalTensor<half> yLocal = inQueueY.AllocTensor<half>();
-        AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength);
-        AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], this->tileLength);
+        AscendC::DataCopy(xLocal, xGm[progress * TILE_LENGTH], TILE_LENGTH);
+        AscendC::DataCopy(yLocal, yGm[progress * TILE_LENGTH], TILE_LENGTH);
         inQueueX.EnQue(xLocal);
         inQueueY.EnQue(yLocal);
     }
@@ -52,7 +108,21 @@ private:
         AscendC::LocalTensor<half> xLocal = inQueueX.DeQue<half>();
         AscendC::LocalTensor<half> yLocal = inQueueY.DeQue<half>();
         AscendC::LocalTensor<half> zLocal = outQueueZ.AllocTensor<half>();
-        AscendC::Add(zLocal, xLocal, yLocal, this->tileLength);
+        pipe_barrier(PIPE_ALL);
+        int64_t systemCycleBefore = AscendC::GetSystemCycle();
+        for (int32_t i = 0; i < 256; i++) { // empty timing loop; uncomment one of the bodies below to measure it
+            // AscendC::Add(zLocal, xLocal, yLocal, TILE_LENGTH);
+            // for (int32_t i = 0; i < loop_num_real; i++) {
+            //     copy_gm_to_ubuf((__ubuf__ int8_t *)xLocal,
+            //                     (__gm__ int8_t *)x + (int64_t)(AscendC::GetBlockIdx() * CORE_LENGTH) + (int64_t)(i * TILE_LENGTH),
+            //                     0, 1, (uint16_t)MTE_LEN, 0, 0);
+            // }
+        }
+        pipe_barrier(PIPE_ALL);
+        int64_t systemCycleAfter = AscendC::GetSystemCycle();
+        int64_t GetBlockNumCycle = systemCycleAfter - systemCycleBefore;
+        int64_t GetBlockNumTime = GetBlockNumCycle * 20; // 50 MHz counter -> 20 ns per cycle
+        AscendC::PRINTF("tile length %d\n", TILE_LENGTH);
+        AscendC::PRINTF("kernel time(ns) %ld\n", GetBlockNumTime);
         outQueueZ.EnQue<half>(zLocal);
         inQueueX.FreeTensor(xLocal);
         inQueueY.FreeTensor(yLocal);
     }
@@ -60,7 +130,7 @@ private:
     __aicore__ inline void CopyOut(int32_t progress)
     {
         AscendC::LocalTensor<half> zLocal = outQueueZ.DeQue<half>();
-        AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength);
+        AscendC::DataCopy(zGm[progress * TILE_LENGTH], zLocal, TILE_LENGTH);
         outQueueZ.FreeTensor(zLocal);
     }
 
@@ -71,14 +141,19 @@ private:
     AscendC::GlobalTensor<half> xGm;
     AscendC::GlobalTensor<half> yGm;
     AscendC::GlobalTensor<half> zGm;
-    uint32_t blockLength;
-    uint32_t tileNum;
-    uint32_t tileLength;
 };
 
 extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData tiling)
 {
     KernelAdd op;
-    op.Init(x, y, z, tiling.totalLength, tiling.tileNum);
-    op.Process();
+#ifdef __DEV_C220_VEC__
+    op.Init(x, y, z);
+#endif
+    // op.Process();
+}
+
+#ifndef ASCENDC_CPU_DEBUG
+void add_custom_do(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z)
+{
+    AddCustomTilingData tiling{}; // tiling is unused by Init but still required by the kernel signature
+    add_custom<<<blockDim, nullptr, stream>>>(x, y, z, tiling);
 }
+#endif
diff --git a/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationTilingNeo/main.cpp b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationTilingNeo/main.cpp
index 527b07f74..b29ed3a95 100644
--- a/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationTilingNeo/main.cpp
+++ b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationTilingNeo/main.cpp
@@ -19,10 +19,10 @@ extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z
 
 int32_t main(int32_t argc, char *argv[])
 {
-    uint32_t blockDim = 8;
+    uint32_t blockDim = 48;
     size_t tilingSize = 2 * sizeof(uint32_t);
-    size_t inputByteSize = 8 * 2048 * sizeof(uint16_t);
-    size_t outputByteSize = 8 * 2048 * sizeof(uint16_t);
+    size_t inputByteSize = (size_t)300 * 1024 * 1024 * 5;  // 1500 MiB per buffer, well beyond the 192 MiB L2
+    size_t outputByteSize = (size_t)300 * 1024 * 1024 * 5;
 
 #ifdef ASCENDC_CPU_DEBUG
     uint8_t *tiling = (uint8_t *)AscendC::GmAlloc(tilingSize);
--
Gitee
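
Note on the timing arithmetic: the kernel multiplies the cycle delta from
AscendC::GetSystemCycle() by 20, i.e. it assumes a 50 MHz system cycle counter
(20 ns per tick), and dividing bytes moved by elapsed nanoseconds yields GB/s
directly. The host-side sketch below reproduces that conversion in plain C++
so the printed numbers can be sanity-checked; the constants mirror those in
add_custom.cpp, while elapsedCycles is a made-up placeholder for the value the
kernel prints, not real device output.

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        // Constants mirrored from add_custom.cpp.
        const int64_t tileLength  = 32 * 1024;                       // bytes per transfer
        const int64_t coreNum     = 48;                              // cores doing transfers
        const int64_t readSize    = 128LL * 1024 * 1024;             // measured region in bytes
        const int64_t loopNumReal = readSize / tileLength / coreNum; // transfers per core

        // Placeholder cycle count; substitute the value the kernel prints.
        const int64_t elapsedCycles = 100000;
        const int64_t nsPerCycle    = 20; // 50 MHz counter -> 20 ns per cycle
        const int64_t elapsedNs     = elapsedCycles * nsPerCycle;

        // bytes / ns == GB/s (decimal gigabytes per second).
        const double gbPerSec =
            static_cast<double>(tileLength * loopNumReal * coreNum) / static_cast<double>(elapsedNs);
        std::printf("bandwidth(GB/s) %f\n", gbPerSec);
        return 0;
    }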