diff --git a/sample/build/build.sh b/sample/build/build.sh
index 80e17f8d518cf774745be22318a394d34a247559..4149b41936a4659644a28071f147fffe89090dd2 100644
--- a/sample/build/build.sh
+++ b/sample/build/build.sh
@@ -13,7 +13,12 @@
 cd ${TOP_DIR}/third_party/lib
 cp -f libruntime.so.$(arch) libruntime.so
 cp -f libruntime_camodel.so.$(arch) libruntime_camodel.so
 
-#
+# add
 cd ${TOP_DIR}/normal_sample/vec_only
 make
+mv *.fatbin ${TOP_DIR}/build
+
+# matmul
+cd ${TOP_DIR}/normal_sample/cube_only
+make
+mv *.fatbin ${TOP_DIR}/build
\ No newline at end of file
diff --git a/sample/normal_sample/cube_only/Makefile b/sample/normal_sample/cube_only/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..5b70e161cb44876d4a4b22e01cc6edd21d0f6df6
--- /dev/null
+++ b/sample/normal_sample/cube_only/Makefile
@@ -0,0 +1,32 @@
+# Location of the CANN toolkit
+ASCEND_HOME_PATH ?= /usr/local/Ascend/ascend-toolkit/latest
+TOP_DIR ?= /
+
+COMPILER := $(ASCEND_HOME_PATH)/compiler/ccec_compiler/bin/ccec
+COMPILER_FLAG := -xcce -O2 -std=c++17
+LINK_FLAG := --cce-fatobj-link
+DAV_FLAG := --cce-aicore-arch=dav-c220
+ASCENDC_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include
+HOST_INC_FLAG := -I${TOP_DIR}/third_party/inc
+LINK_LIBS := -L${TOP_DIR}/third_party/lib -lruntime -lascendcl -lstdc++
+
+all: build
+
+# The sources added by this change are matmul_kernel.cpp and main.cpp
+# (the tiling struct is filled on the host in main.cpp)
+build: matmul_kernel.o main.o matmul.fatbin
+
+matmul_kernel.o: matmul_kernel.cpp
+	$(COMPILER) $(COMPILER_FLAG) $(DAV_FLAG) -DTILING_KEY_VAR=0 $(ASCENDC_INC_FLAG) -o $@ -c $^
+
+main.o: main.cpp
+	$(COMPILER) $(COMPILER_FLAG) $(DAV_FLAG) $(HOST_INC_FLAG) -o $@ -c $^
+
+matmul.fatbin: matmul_kernel.o main.o
+	$(COMPILER) $(LINK_FLAG) $(DAV_FLAG) -o $@ $^ $(LINK_LIBS)
+
+.PHONY: clean
+clean:
+	rm -f *.o *.fatbin
\ No newline at end of file
diff --git a/sample/normal_sample/cube_only/main.cpp b/sample/normal_sample/cube_only/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0d26ea26a4efe943108cfc694fc826fc3571d602
--- /dev/null
+++ b/sample/normal_sample/cube_only/main.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved.
+ * This file contains the host-side code for both CPU debug and NPU runs: it
+ * prepares the input data, launches the kernel, and prints the result.
+ */
+
+#include <cassert>
+#include <cstdio>
+#include "kernel_tiling/kernel_tiling.h"
+#include "acl/acl.h"
+
+extern void matmul_custom_do(
+    uint32_t coreDim, void *l2ctrl, void *stream, uint8_t *param1, uint8_t *param2, uint8_t *param3, uint8_t *param4);
+
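+// CHECK_ACL wraps each ACL runtime call below: on failure it prints the file,
+// line, and aclError code and then continues, so the sample reports errors
+// without aborting.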
+ */ + +#include +#include +#include "kernel_tiling/kernel_tiling.h" +#include "acl/acl.h" + +extern void matmul_custom_do( + uint32_t coreDim, void *l2ctrl, void *stream, uint8_t *param1, uint8_t *param2, uint8_t *param3, uint8_t *param4); + +#define ACL_ERROR_NONE 0 + +#define CHECK_ACL(x) \ + do { \ + aclError __ret = x; \ + if (__ret != ACL_ERROR_NONE) { \ + printf("%s: %d aclError %d\n", __FILE__, __LINE__, __ret); \ + } \ + } while (0) + +void printTensor(float *ptr, size_t size) +{ + size_t colNum = 8; + for (size_t i = 0; i < size / colNum / sizeof(float); i++) { + for (size_t j = 0; j < colNum; j++) { + printf("%5.2f ", ptr[colNum * i + j]); + } + printf("\n"); + } +} + +void fillValue(aclFloat16 *addr, size_t size) +{ + aclFloat16 val = aclFloatToFloat16(4.0f); + for (size_t i = 0; i < size / sizeof(aclFloat16); i++) { + addr[i] = val; + } +} + +void printAclFloat16(aclFloat16 *addr) +{ + for (int i = 0; i < 16; i++) { + printf("%f ", aclFloat16ToFloat(addr[i])); + } +} + + +void MakeTiling(uint32_t *addr, size_t size) +{ + assert(sizeof(TCubeTiling) <= size); + // TCubeTiling该结构体参考kernel_tiling/kernel_tiling.h中的结构体定义 + // tiling_api.h中本身定义的结构与kernel_tiling.h + TCubeTiling *tiling = (TCubeTiling *)addr; + // 此处计算使用的核数 + tiling->usedCoreNum = 16; // (M/singleCoreM)*(N/singleCoreN)*(K/singleCoreK)=4*4*1=16 + // 对于 xa 是[M, Ka]矩阵, xb 是[Kb, N]矩阵,此处数据需要与外部格式保持一致 + // 参考 AscendC算子开发文档 + // https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha001/devguide/opdevg/ascendcopdevg/atlas_ascendc_10_0060.html + // 中对 数据分块(Tiling) 部分的介绍 + tiling->M = 512; // + tiling->N = 1024; // + tiling->Ka = 512; // Ka和Kb一般一样,只有pad的时候存在不一致,比如[1, 62]*[64, 2],这里64就是有pad的 + tiling->Kb = 512; // + tiling->isBias = 0; // 是否有bias + // 多核切分的tiling参数,用于度量单个核上处理的数据大小 + // xa在M轴上切分,分成多个singleCoreM;单核处理singleCoreM * singleCoreK大小数据 + // xb在N轴上切分,分成多个singleCoreN;单核处理singleCoreK * singleCoreN + // 由于输入在M和N轴上切分了,输出 + tiling->singleCoreM = 128; + tiling->singleCoreN = 256; + tiling->singleCoreK = 512; // 不建议对k进行切分,会导致累加,引起不确定计算 + // 核内切分的tiling参数,用于单个核内的最小计算单位 + tiling->baseM = 128; + tiling->baseN = 256; + tiling->baseK = 64; + tiling->stepM = 1; + tiling->stepN = 1; + tiling->stepKa = 8; + tiling->stepKb = 8; + tiling->depthA1 = 8; // 矩阵[baseM, baseK]的缓存数量 + tiling->depthB1 = 8; // 矩阵[basek, baseN]的缓存数量 + // 其他参数 + tiling->iterateOrder = 0; // 控制迭代的方向:0代表先M轴再N轴,1代表先N轴再M轴 + tiling->shareL1Size = 384 * 1024; // 如存在多个matmul时,可以单独控制每个使用空间 + tiling->shareL0CSize = 128 * 1024; // 如存在多个matmul时,可以单独控制每个使用空间 + tiling->shareUbSize = 0; // 310P非分核时涉及 + tiling->transLength = 131072; // 310P使用涉及格式转换时的额外空间长度 + // 下列是bmm中使用的batch参数,如果需要实现bmm,该结构体中还有其他tiling参数 + tiling->batchM = 1; // 对于普通matmul,默认1 + tiling->batchN = 1; // 对于普通matmul,默认1 + tiling->singleBatchM = 1; + tiling->singleBatchN = 1; + // 下面的db参数用于控制ping-pong + tiling->dbL0A = 2; + tiling->dbL0B = 2; + tiling->dbL0C = 1; + tiling->reserved = 0; +} + +// y = matmul(xa, xb) +int32_t main(int32_t argc, char *argv[]) +{ + size_t xaSize = 512 * 1024 * sizeof(aclFloat16); + size_t xbSize = 512 * 1024 * sizeof(aclFloat16); + size_t ySize = 512 * 1024 * sizeof(float); + size_t tilingSize = 48 * sizeof(uint32_t); + uint32_t blockDim = 8; + + CHECK_ACL(aclInit(nullptr)); + aclrtContext context; + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + aclFloat16 *xaHost; + CHECK_ACL(aclrtMallocHost((void **)(&xaHost), xaSize)); + 
+    // Other parameters
+    tiling->iterateOrder = 0; // iteration order: 0 = along M first, 1 = along N first
+    tiling->shareL1Size = 384 * 1024;  // with several matmuls, each one's share can be set individually
+    tiling->shareL0CSize = 128 * 1024; // with several matmuls, each one's share can be set individually
+    tiling->shareUbSize = 0;      // only relevant on 310P without core splitting
+    tiling->transLength = 131072; // extra space for format conversion on 310P
+    // Batch parameters used by bmm; batched matmul needs further tiling fields of this struct
+    tiling->batchM = 1; // 1 for a plain matmul
+    tiling->batchN = 1; // 1 for a plain matmul
+    tiling->singleBatchM = 1;
+    tiling->singleBatchN = 1;
+    // Double-buffering (ping-pong) switches
+    tiling->dbL0A = 2;
+    tiling->dbL0B = 2;
+    tiling->dbL0C = 1;
+    tiling->reserved = 0;
+}
+
+// y = matmul(xa, xb)
+int32_t main(int32_t argc, char *argv[])
+{
+    size_t xaSize = 512 * 1024 * sizeof(aclFloat16);
+    size_t xbSize = 512 * 1024 * sizeof(aclFloat16);
+    size_t ySize = 512 * 1024 * sizeof(float);
+    size_t tilingSize = 48 * sizeof(uint32_t);
+    uint32_t blockDim = 8;
+
+    CHECK_ACL(aclInit(nullptr));
+    aclrtContext context;
+    int32_t deviceId = 0;
+    CHECK_ACL(aclrtSetDevice(deviceId));
+    CHECK_ACL(aclrtCreateContext(&context, deviceId));
+    aclrtStream stream = nullptr;
+    CHECK_ACL(aclrtCreateStream(&stream));
+
+    aclFloat16 *xaHost;
+    CHECK_ACL(aclrtMallocHost((void **)(&xaHost), xaSize));
+    fillValue(xaHost, xaSize);
+
+    aclFloat16 *xbHost;
+    CHECK_ACL(aclrtMallocHost((void **)(&xbHost), xbSize));
+    fillValue(xbHost, xbSize);
+
+    uint32_t *tilingHost;
+    CHECK_ACL(aclrtMallocHost((void **)(&tilingHost), tilingSize));
+    MakeTiling(tilingHost, tilingSize);
+
+    // Copy the host inputs to the device
+    uint8_t *xaDevice;
+    uint8_t *xbDevice;
+    uint8_t *tilingDevice;
+    CHECK_ACL(aclrtMalloc((void **)&xaDevice, xaSize, ACL_MEM_MALLOC_HUGE_FIRST));
+    CHECK_ACL(aclrtMemcpy(xaDevice, xaSize, xaHost, xaSize, ACL_MEMCPY_HOST_TO_DEVICE)); // prepare xa
+    CHECK_ACL(aclrtMalloc((void **)&xbDevice, xbSize, ACL_MEM_MALLOC_HUGE_FIRST));
+    CHECK_ACL(aclrtMemcpy(xbDevice, xbSize, xbHost, xbSize, ACL_MEMCPY_HOST_TO_DEVICE)); // prepare xb
+    CHECK_ACL(aclrtMalloc((void **)&tilingDevice, tilingSize, ACL_MEM_MALLOC_HUGE_FIRST));
+    CHECK_ACL(aclrtMemcpy(tilingDevice, tilingSize, tilingHost, tilingSize, ACL_MEMCPY_HOST_TO_DEVICE)); // prepare tiling
+
+    uint8_t *yDevice;
+    CHECK_ACL(aclrtMalloc((void **)&yDevice, ySize, ACL_MEM_MALLOC_HUGE_FIRST)); // prepare the output
+
+    matmul_custom_do(blockDim, nullptr, stream, xaDevice, xbDevice, yDevice, tilingDevice);
+    CHECK_ACL(aclrtSynchronizeStream(stream));
+
+    // Copy the device output back to the host
+    float *yHost;
+    CHECK_ACL(aclrtMallocHost((void **)(&yHost), ySize));
+    CHECK_ACL(aclrtMemcpy(yHost, ySize, yDevice, ySize, ACL_MEMCPY_DEVICE_TO_HOST));
+    printTensor(yHost, 4 * 8 * 4); // print the first 4 rows of 8 columns
+
+    // Release resources
+    CHECK_ACL(aclrtFree(xaDevice));
+    CHECK_ACL(aclrtFree(xbDevice));
+    CHECK_ACL(aclrtFree(tilingDevice));
+    CHECK_ACL(aclrtFree(yDevice));
+
+    CHECK_ACL(aclrtFreeHost(xaHost));
+    CHECK_ACL(aclrtFreeHost(xbHost));
+    CHECK_ACL(aclrtFreeHost(tilingHost));
+    CHECK_ACL(aclrtFreeHost(yHost));
+
+    CHECK_ACL(aclrtDestroyStream(stream));
+    CHECK_ACL(aclrtDestroyContext(context));
+    CHECK_ACL(aclrtResetDevice(deviceId));
+    CHECK_ACL(aclFinalize());
+    return 0;
+}
\ No newline at end of file
diff --git a/sample/normal_sample/cube_only/matmul_kernel.cpp b/sample/normal_sample/cube_only/matmul_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9ed629c5145199223e835f83ae90ec6e5d7a01aa
--- /dev/null
+++ b/sample/normal_sample/cube_only/matmul_kernel.cpp
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved.
+ */
+
+#include "kernel_operator.h"
+#include "lib/matrix/matmul/matmul.h"
+using namespace AscendC;
+using namespace matmul;
+
+// Map a core index to the global-memory offsets of its [singleCoreM, singleCoreN] output tile
+__aicore__ inline void CalcGMOffset(
+    int blockIdx, int usedCoreNum, TCubeTiling &param, int &offsetA, int &offsetB, int &offsetC)
+{
+    ASSERT(blockIdx < usedCoreNum);
+    uint32_t mIterSize = Ceil(param.M, param.singleCoreM);
+    ASSERT(mIterSize != 0);
+    uint32_t mCoreIndx = blockIdx % mIterSize;
+    uint32_t nCoreIndx = blockIdx / mIterSize;
+
+    offsetA = mCoreIndx * param.Ka * param.singleCoreM;
+    offsetB = nCoreIndx * param.singleCoreN;
+    offsetC = mCoreIndx * param.N * param.singleCoreM + nCoreIndx * param.singleCoreN;
+
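+    // With the tiling used here (M=512, singleCoreM=128, N=1024, singleCoreN=256),
+    // mIterSize = 4, so e.g. blockIdx 5 gives mCoreIndx = 1, nCoreIndx = 1, and
+    // offsetA = 1*512*128 = 65536, offsetB = 256, offsetC = 1*1024*128 + 256.
+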
+    // tail M: the last core along M may get a smaller block
+    int gmUseM = param.M - mCoreIndx * param.singleCoreM;
+    param.singleCoreM = gmUseM < param.singleCoreM ? gmUseM : param.singleCoreM;
+
+    // tail N: the last core along N may get a smaller block
+    int gmUseN = param.N - nCoreIndx * param.singleCoreN;
+    param.singleCoreN = gmUseN < param.singleCoreN ? gmUseN : param.singleCoreN;
+
+    // tail K (K is not split across cores)
+    int gmUseK = param.Ka;
+    param.singleCoreK = gmUseK < param.singleCoreK ? gmUseK : param.singleCoreK;
+}
+
+// Copy the tiling struct word by word from global memory into the core-local copy
+__aicore__ inline void CopyTiling(TCubeTiling *tiling, GM_ADDR tilingGM)
+{
+    uint32_t *ptr = reinterpret_cast<uint32_t *>(tiling);
+    auto tiling32 = reinterpret_cast<__gm__ uint32_t *>(tilingGM);
+
+    for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, ptr++) {
+        *ptr = *(tiling32 + i);
+    }
+    return;
+}
+
+extern "C" __global__ __aicore__ void matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR c, GM_ADDR tilingGm)
+{
+    // cube core cases, ignore vector core
+    if (g_coreType == AIV) {
+        return;
+    }
+    using A_T = half;
+    using B_T = half;
+    using C_T = float;
+    using BiasT = float;
+
+    TPipe que;
+    TCubeTiling tiling;
+    CopyTiling(&tiling, tilingGm);
+
+    if (GetBlockIdx() >= tiling.usedCoreNum) {
+        return;
+    }
+
+    GlobalTensor<A_T> aGlobal;
+    GlobalTensor<B_T> bGlobal;
+    GlobalTensor<C_T> cGlobal;
+
+    aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ A_T *>(a), tiling.M * tiling.Ka);
+    bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ B_T *>(b), tiling.Kb * tiling.N);
+    cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ C_T *>(c), tiling.M * tiling.N);
+
+    int offsetA = 0;
+    int offsetB = 0;
+    int offsetC = 0;
+    CalcGMOffset(GetBlockIdx(), tiling.usedCoreNum, tiling, offsetA, offsetB, offsetC);
+    auto gmA = aGlobal[offsetA];
+    auto gmB = bGlobal[offsetB];
+    auto gmC = cGlobal[offsetC];
+
+    typedef MatmulType<AscendC::TPosition::GM, CubeFormat::ND, A_T> aType;
+    typedef MatmulType<AscendC::TPosition::GM, CubeFormat::ND, B_T> bType;
+    typedef MatmulType<AscendC::TPosition::GM, CubeFormat::ND, C_T> cType;
+    typedef MatmulType<AscendC::TPosition::GM, CubeFormat::ND, BiasT> biasType;
+    MatmulImpl<aType, bType, cType, biasType> mm;
+    mm.SetSubBlockIdx(0);
+    mm.Init(&tiling, &que);
+
+    mm.SetTensorA(gmA);
+    mm.SetTensorB(gmB);
+    mm.IterateAll(gmC);
+}
+
+// call of kernel function
+void matmul_custom_do(
+    uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *a, uint8_t *b, uint8_t *c, uint8_t *tilingGm)
+{
+    matmul_custom<<<blockDim, l2ctrl, stream>>>(a, b, c, tilingGm);
+}
\ No newline at end of file
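Note: with this change applied, running sample/build/build.sh should build both samples and move the resulting .fatbin files into ${TOP_DIR}/build. Since both inputs are filled with 4.0 and K = 512, every printed output element should be 512 * 4.0 * 4.0 = 8192.00.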