From f0818d20fe6d160c66782977685b4bd02851aad7 Mon Sep 17 00:00:00 2001
From: liuhanwen8
Date: Wed, 6 Sep 2023 20:36:07 +0800
Subject: [PATCH 1/6] add reduce_max pytorch adapter

---
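Note: TilingFunc below views the input as rowNum rows of colNum elements, where colNum is the
extent of reduceDim (the adapter moves that axis to the end before launching the kernel). Each
row is padded up to a multiple of 16 fp16 elements (one 32-byte block), rows are grouped into
batches of 32, and batches are dealt out across the AI Vector cores; the first theSplitCore
cores take one extra batch. A minimal Python sketch of the split arithmetic, using the shape
from test_ops_custom.py (the 40-core count is an assumed example; the real value comes from
GetCoreNumAiv()):

    # Tiling arithmetic for x_shape = [8, 255, 64, 13] reduced over dim 2.
    ELE_NUM_PER_BANK = 16        # fp16 elements per 32-byte block
    BATCH_PER_CORE = 32          # rows reduced per task
    act_core_num = 40            # assumed AIV core count, for illustration only

    col_num = 64                 # extent of the reduced axis, moved last by the adapter
    row_num = 8 * 255 * 13       # the remaining elements form the rows
    all_tasks = (row_num + BATCH_PER_CORE - 1) // BATCH_PER_CORE
    used_core_num = min(all_tasks, act_core_num)
    cal_task_per_core = all_tasks // used_core_num  # every core runs at least this many tasks
    the_split_core = all_tasks % used_core_num      # leading cores that run one extra task
    last_task_tail = row_num % BATCH_PER_CORE or BATCH_PER_CORE
    col_calcount = (col_num + ELE_NUM_PER_BANK - 1) // ELE_NUM_PER_BANK * ELE_NUM_PER_BANK
    print(used_core_num, cal_task_per_core, the_split_core, last_task_tail, col_calcount)
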
 .../op_dev/op_host/reduce_max_custom.cpp      | 121 +++++++++++++++++
 .../op_dev/op_host/reduce_max_custom_tiling.h |  21 +++
 .../op_dev/op_kernel/reduce_max_custom.cpp    | 125 ++++++++++++++++++
 .../ReduceMaxCustomKernelNpu.cpp              |  32 +++++
 .../pytorch_patch/npu_native_functions.yaml   |   3 +-
 .../pytorch_invocation/readme.md              |   7 +-
 .../pytorch_invocation/run.sh                 |  46 +++----
 .../pytorch_invocation/test_ops_custom.py     |  20 +++
 8 files changed, 338 insertions(+), 37 deletions(-)
 create mode 100644 cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_host/reduce_max_custom.cpp
 create mode 100644 cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_host/reduce_max_custom_tiling.h
 create mode 100644 cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_kernel/reduce_max_custom.cpp
 create mode 100644 cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/pytorch_patch/ReduceMaxCustomKernelNpu.cpp

diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_host/reduce_max_custom.cpp b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_host/reduce_max_custom.cpp
new file mode 100644
index 000000000..84f3548fb
--- /dev/null
+++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_host/reduce_max_custom.cpp
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved.
+ */
+#include "reduce_max_custom_tiling.h"
+#include "register/op_def_registry.h"
+#include "tiling/platform/platform_ascendc.h"
+
+constexpr int32_t ELE_NUM_PER_BANK = 16; // half
+constexpr int32_t BATCH_PER_CORE = 32;
+
+namespace optiling {
+static ge::graphStatus TilingFunc(gert::TilingContext* context)
+{
+    ReduceMaxTilingData tiling;
+    const gert::StorageShape* xShapePtr = context->GetInputShape(0);
+    const gert::Shape& xShape = xShapePtr->GetStorageShape();
+    const gert::RuntimeAttrs* attrs = context->GetAttrs();
+    const uint32_t* reduceDim = attrs->GetAttrPointer<uint32_t>(0);
+    const uint32_t xEleNum = context->GetInputTensor(0)->GetShapeSize();
+    const uint32_t colNum = xShape.GetDim(*reduceDim);
+    const uint32_t rowNum = xEleNum / colNum;
+
+    auto ascendcPlatform = platform_ascendc::PlatformAscendc(context->GetPlatformInfo());
+    const uint32_t actCoreNum = ascendcPlatform.GetCoreNumAiv();
+    uint32_t oneRepeatCalcount = BATCH_PER_CORE;
+    uint32_t allTasks = (rowNum + oneRepeatCalcount - 1) / oneRepeatCalcount;
+    uint32_t usedCoreNum = allTasks;
+    if (usedCoreNum > actCoreNum) {
+        usedCoreNum = actCoreNum;
+    }
+    uint32_t calTaskPerCore = allTasks / usedCoreNum;
+    uint32_t theSplitCore = allTasks % usedCoreNum;
+    uint32_t lastTaskTail = rowNum % oneRepeatCalcount;
+    if (lastTaskTail == 0) {
+        lastTaskTail = oneRepeatCalcount;
+    }
+    uint32_t colCalcount = (colNum + ELE_NUM_PER_BANK - 1) / ELE_NUM_PER_BANK * ELE_NUM_PER_BANK;
+
+    tiling.set_coreNum(usedCoreNum);
+    tiling.set_realColVal(colNum);
+    tiling.set_oneCalNum(oneRepeatCalcount);
+    tiling.set_calTaskPerCore(calTaskPerCore);
+    tiling.set_theSplitCore(theSplitCore);
+    tiling.set_roundColVal(colCalcount);
+    tiling.set_lastTaskTail(lastTaskTail);
+
+    tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity());
+    context->GetRawTilingData()->SetDataSize(tiling.GetDataSize());
+    context->SetTilingKey(1);
+    size_t *currentWorkspace = context->GetWorkspaceSizes(1);
+    currentWorkspace[0] = 0;
+    return ge::GRAPH_SUCCESS;
+}
+} // namespace optiling
+
+namespace ge {
+static graphStatus InferShape(gert::InferShapeContext* context)
+{
+    const gert::Shape* xShape = context->GetInputShape(0);
+    const gert::RuntimeAttrs* attrs = context->GetAttrs();
+    const uint32_t* reduceDim = attrs->GetAttrPointer<uint32_t>(0);
+    const uint32_t* isKeepDim = attrs->GetAttrPointer<uint32_t>(1);
+    gert::Shape* yShape = context->GetOutputShape(0);
+    gert::Shape* idxShape = context->GetOutputShape(1);
+
+    for (size_t i = 0; i < xShape->GetDimNum(); i++) {
+        if (i == *reduceDim) {
+            if (*idKeepDim) {
+                yShape->AppendDim(1);
+            } else {
+                continue;
+            }
+        } else {
+            yShape->AppendDim(xShape->GetDim(i));
+        }
+    }
+    *idxShape = *yShape;
+
+    return GRAPH_SUCCESS;
+}
+} // namespace ge
+
+namespace ops {
+class ReduceMaxCustom : public OpDef {
+public:
+    explicit ReduceMaxCustom(const char* name) : OpDef(name)
+    {
+        this->Input("x")
+            .ParamType(REQUIRED)
+            .DataType({ ge::DT_FLOAT16 })
+            .Format({ ge::FORMAT_ND })
+            .UnknownShapeFormat({ ge::FORMAT_ND });
+        this->Output("y")
+            .ParamType(REQUIRED)
+            .DataType({ ge::DT_FLOAT16 })
+            .Format({ ge::FORMAT_ND })
+            .UnknownShapeFormat({ ge::FORMAT_ND });
+        this->Output("idx")
+            .ParamType(REQUIRED)
+            .DataType({ ge::DT_INT32 })
+            .Format({ ge::FORMAT_ND })
+            .UnknownShapeFormat({ ge::FORMAT_ND });
+
+        this->Attr("reduceDim")
+            .AttrType(REQUIRED)
+            .Int();
+        this->Attr("isKeepDim")
+            .AttrType(OPTIONAL)
+            .Int(1);
+
+        this->SetInferShape(ge::InferShape);
+
+        this->AICore()
+            .SetTiling(optiling::TilingFunc);
+
+        this->AICore().AddConfig("ascend910");
+    }
+};
+
+OP_ADD(ReduceMaxCustom);
+} // namespace ops
diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_host/reduce_max_custom_tiling.h b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_host/reduce_max_custom_tiling.h
new file mode 100644
index 000000000..05f0f8870
--- /dev/null
+++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_host/reduce_max_custom_tiling.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved.
+ */
+#ifndef REDUCE_MAX_CUSTOM_TILING_H
+#define REDUCE_MAX_CUSTOM_TILING_H
+#include "register/tilingdata_base.h"
+
+namespace optiling {
+BEGIN_TILING_DATA_DEF(ReduceMaxTilingData)
+    TILING_DATA_FIELD_DEF(uint32_t, coreNum);
+    TILING_DATA_FIELD_DEF(uint32_t, realColVal);
+    TILING_DATA_FIELD_DEF(uint32_t, oneCalNum);
+    TILING_DATA_FIELD_DEF(uint32_t, calTaskPerCore);
+    TILING_DATA_FIELD_DEF(uint32_t, theSplitCore);
+    TILING_DATA_FIELD_DEF(uint32_t, roundColVal);
+    TILING_DATA_FIELD_DEF(uint32_t, lastTaskTail);
+END_TILING_DATA_DEF;
+
+REGISTER_TILING_DATA_CLASS(ReduceMaxCustom, ReduceMaxTilingData)
+}
+#endif // REDUCE_MAX_CUSTOM_TILING_H
diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_kernel/reduce_max_custom.cpp b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_kernel/reduce_max_custom.cpp
new file mode 100644
index 000000000..a59686a8b
--- /dev/null
+++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_kernel/reduce_max_custom.cpp
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved.
+ */
+#include "kernel_operator.h"
+using namespace AscendC;
+
+constexpr int32_t BUFFER_NUM = 2;
+
+template <typename T>
+class KernelReduceMax {
+public:
+    __aicore__ inline KernelReduceMax() {}
+    __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR idx, uint32_t coreNum, uint32_t realColVal,
+                                uint32_t oneCalNum, uint32_t calTaskPerCore, uint32_t theSplitCore,
+                                uint32_t roundColVal, uint32_t lastTaskTail)
+    {
+        this->isLastCore = GetBlockIdx() == GetBlockNum() - 1 ? true : false;
+        this->colCalcount = roundColVal;
+        this->oneRepeatCalcount = oneCalNum;
+        this->colNum = realColVal;
+        this->tailSize = roundColVal - realColVal;
+        this->lastTaskTail = lastTaskTail;
+
+        int32_t offsetX = (calTaskPerCore + 1) * GetBlockIdx() * oneCalNum * realColVal;
+        int32_t offsetY = (calTaskPerCore + 1) * GetBlockIdx() * oneCalNum;
+        if (theSplitCore <= GetBlockIdx()) {
+            offsetX = (calTaskPerCore + 1) * theSplitCore * oneCalNum * realColVal +
+                      (GetBlockIdx() - theSplitCore) * oneCalNum * calTaskPerCore * realColVal;
+            offsetY = (calTaskPerCore + 1) * theSplitCore * oneCalNum +
+                      (GetBlockIdx() - theSplitCore) * oneCalNum * calTaskPerCore;
+        }
+
+        xGm.SetGlobalBuffer((__gm__ T*)x + offsetX, colCalcount);
+        yGm.SetGlobalBuffer((__gm__ T*)y + offsetY, oneRepeatCalcount);
+        idxGm.SetGlobalBuffer((__gm__ uint32_t*)idx + offsetY, oneRepeatCalcount);
+
+        pipe.InitBuffer(inQueueX, BUFFER_NUM, this->colCalcount * sizeof(T));
+        pipe.InitBuffer(outQueueY, BUFFER_NUM, this->oneRepeatCalcount * sizeof(T));
+        pipe.InitBuffer(outQueueIdx, BUFFER_NUM, this->oneRepeatCalcount * sizeof(uint32_t));
+        pipe.InitBuffer(yTmp, this->colCalcount * sizeof(T));
+        pipe.InitBuffer(workTmp, this->colCalcount * sizeof(T));
+    }
+    __aicore__ inline void Process(int32_t loopCount, uint32_t innerTask)
+    {
+        for (int32_t i = 0; i < loopCount; i++) {
+            if (this->isLastCore && i == loopCount - 1) {
+                innerTask = this->lastTaskTail;
+            }
+            CopyIn(i);
+            Compute(i, innerTask);
+            CopyOut(i);
+        }
+    }
+
+private:
+    __aicore__ inline void CopyIn(int32_t progress)
+    {
+        LocalTensor<T> xLocal = inQueueX.AllocTensor<T>();
+        inQueueX.EnQue(xLocal);
+    }
+    __aicore__ inline void Compute(int32_t progress, uint32_t innerTask)
+    {
+        LocalTensor<T> xLocal = inQueueX.DeQue<T>();
+        LocalTensor<T> yLocal = outQueueY.AllocTensor<T>();
+        LocalTensor<uint32_t> idxLocal = outQueueIdx.AllocTensor<uint32_t>();
+
+        LocalTensor<T> yTmpLocal = yTmp.Get<T>();
+        LocalTensor<T> workTmpLocal = workTmp.Get<T>();
+        for (int32_t i = 0; i < innerTask; i++) {
+            DataCopy(xLocal, xGm[(progress * this->oneRepeatCalcount + i) * this->colNum], this->colCalcount);
+            ReduceMax(yTmpLocal, xLocal, workTmpLocal, this->colCalcount, true);
+            yLocal.SetValue(i, yTmpLocal.GetValue(0));
+            T indexVal = yTmpLocal.GetValue(1);
+            uint32_t index = 0;
+            index = *reinterpret_cast<uint16_t *>(&indexVal);
+            idxLocal.SetValue(1, index);
+        }
+        outQueueY.EnQue(yLocal);
+        outQueueIdx.EnQue(idxLocal);
+        inQueueX.FreeTensor(xLocal);
+    }
+    __aicore__ inline void CopyOut(int32_t progress)
+    {
+        LocalTensor<T> yLocal = outQueueY.DeQue<T>();
+        LocalTensor<uint32_t> idxLocal = outQueueIdx.DeQue<uint32_t>();
+        DataCopy(yGm[progress * this->oneRepeatCalcount], yLocal, this->oneRepeatCalcount);
+        DataCopy(idxGm[progress * this->oneRepeatCalcount], idxLocal, this->oneRepeatCalcount);
+        outQueueY.FreeTensor(yLocal);
+        outQueueIdx.FreeTensor(idxLocal);
+    }
+
+private:
+    TPipe pipe;
+    TQue<QuePosition::VECIN, BUFFER_NUM> inQueueX;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> outQueueY, outQueueIdx;
+
+    TBuf<TPosition::VECCALC> yTmp, workTmp;
+    GlobalTensor<T> xGm;
+    GlobalTensor<T> yGm;
+    GlobalTensor<uint32_t> idxGm;
+
+    int32_t colCalcount;
+    int32_t oneRepeatCalcount;
+    int32_t colNum;
+    int32_t tailSize;
+    int32_t lastTaskTail;
+    bool isLastCore;
+};
+
+extern "C" __global__ __aicore__ void reduce_max_custom(GM_ADDR x, GM_ADDR y, GM_ADDR idx, GM_ADDR workspace,
+                                                        GM_ADDR tiling)
+{
+    GET_TILING_DATA(tilingData, tiling);
+    KernelReduceMax<half> op;
+    op.Init(x, y, idx, tilingData.coreNum, tilingData.realColVal, tilingData.oneCalNum,
+            tilingData.calTaskPerCore, tilingData.theSplitCore, tilingData.roundColVal,
+            tilingData.lastTaskTail);
+    int32_t loopCount = tilingData.calTaskPerCore + 1;
+    if (tilingData.theSplitCore <= GetBlockIdx()) {
+        loopCount = tilingData.calTaskPerCore;
+    }
+    if (TILING_KEY_IS(1)) {
+        op.Process(loopCount, tilingData.oneCalNum);
+    }
+}
diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/pytorch_patch/ReduceMaxCustomKernelNpu.cpp b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/pytorch_patch/ReduceMaxCustomKernelNpu.cpp
new file mode 100644
index 000000000..f20f309c5
--- /dev/null
+++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/pytorch_patch/ReduceMaxCustomKernelNpu.cpp
@@ -0,0 +1,32 @@
+#include <torch/csrc/autograd/custom_function.h>
+
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
+#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+#include "torch_npu/csrc/aten/ops/op_api/op_api_common.h"
+
+namespace at_npu {
+namespace native {
+using torch::autograd::Function;
+using torch::autograd::AutogradContext;
+
+tuple<at::Tensor, at::Tensor> NPUNativeFunctions::npu_reduce_max_custom(const at::Tensor& x,
+                                                                        int64_t reduceDim,
+                                                                        int64_t isKeepDim) {
+    at::IntArrayRef dim = reduceDim;
+    bool keepDimBool = isKeepDim ? true : false;
+    at::Tensor y = OpPreparation::ApplyTensorWithoutFormat(x, reduce_ops_npu_output_size(x, dim, true));
+    at::Tensor idx = NPUNativeFunctions::npu_dtype_cast(y, at::kInt);
+    int64_t lastDimVal = x.sizes().size() - 1;
+    at::Tensor xTrans = x;
+    for (int64_t i = reduceDim; i < lastDimVal; i++) {
+        xTrans = xTrans.transpose(i, i + 1).contiguous();
+    }
+    EXEC_NPU_CMD(aclnnReduceMaxCustom, xTrans, lastDimVal, isKeepDim, y, idx);
+    auto outputShape = reduce_ops_npu_output_size(x, dim, keepDimBool);
+    y = y.reshape(outputShape);
+    idx = idx.reshape(outputShape);
+    return tuple<at::Tensor, at::Tensor>(y, idx);
+}
+} // namespace native
+} // namespace at_npu
diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/pytorch_patch/npu_native_functions.yaml b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/pytorch_patch/npu_native_functions.yaml
index 1ad90dbcf..ac6766375 100644
--- a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/pytorch_patch/npu_native_functions.yaml
+++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/pytorch_patch/npu_native_functions.yaml
@@ -1 +1,2 @@
-- func: npu_add_custom(Tensor x, Tensor y) -> Tensor
\ No newline at end of file
+- func: npu_add_custom(Tensor x, Tensor y) -> Tensor
+- func: npu_reduce_max_custom(Tensor x, int reduceDim, int isKeepDim=1) -> (Tensor, Tensor)
diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/readme.md b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/readme.md
index 104a863ef..846beba34 100644
--- a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/readme.md
+++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/readme.md
@@ -1,5 +1,2 @@
-# acl samples
-bash run.sh ${is_dynamic}(0/1) ${replay_mode}(/batch/iterator)
-
-# run static op (depend on chip version)
-bash run.sh 0
+# run PyTorch samples
+bash run.sh
diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh
index e2da06738..3a7466f08 100644
--- a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh
+++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh
@@ -10,9 +10,7 @@ CURRENT_DIR=$(
 ); cd $CURRENT_DIR

 # 导出环境变量
-IS_DYNAMIC=$1
-REPLAY_MODE=$2
-PYTORCH_VERSION=1.11.0
+PYTORCH_VERSION=5.0.rc2-pytorch1.11.0
 PTA_DIR=pytorch-v${PYTORCH_VERSION}

 if [ ! $ASCEND_HOME_DIR ]; then
@@ -59,18 +57,6 @@ print(get_soc_version())
 }

 function main() {
-    if [[ ${IS_DYNAMIC}"x" = "x" ]]; then
-        echo "ERROR: IS_DYNAMIC is invalid!"
-        return 1
-    fi
-
-    if [[ ${REPLAY_MODE}"x" = "x" || ${REPLAY_MODE} = "batch" || ${REPLAY_MODE} = "iterator" ]]; then
-        echo "INFO: REPLAY_MODE valid : ${REPLAY_MODE}"
-    else
-        echo "ERROR: REPLAY_MODE is invalid!"
-        return 1
-    fi
-
     # 清除遗留生成文件和日志文件
     rm -rf $HOME/ascend/log/*
     rm -rf $ASCEND_OPP_PATH/vendors/*
@@ -79,7 +65,7 @@ function main() {
     # 生成自定义算子工程样例
     JSON_NAME=add_custom
    CAMEL_JSON_NAME=`echo $JSON_NAME | sed -r 's/(^|-|_)(\w)/\U\2/g'`
-    msopgen gen -i op_dev/${JSON_NAME}.json -f tf -c ai_core-${SOC_SHORT_VERSION} -lan cpp -out ./custom_op
+    msopgen gen -i op_dev/${JSON_NAME}.json -f tf -c ai_core-${SOC_FULL_VERSION} -lan cpp -out ./custom_op
     if [ $? -ne 0 ]; then
         echo "ERROR: msopgen custom op sample failed!"
         return 1
@@ -91,13 +77,6 @@ function main() {
         echo "ERROR: copy custom op files failed!"
         return 1
     fi
-    if [[ $IS_DYNAMIC != 1 ]]; then
-        if [[ $REPLAY_MODE = "batch" ]]; then
-            sed -i "s/set(BATCH_MODE_REPLAY_LIST/set(BATCH_MODE_REPLAY_LIST ${CAMEL_JSON_NAME}/g" `grep "set(BATCH_MODE_REPLAY_LIST" -rl custom_op/op_kernel/CMakeLists.txt`
-        elif [[ $REPLAY_MODE = "iterator" ]]; then
-            sed -i "s/set(ITERATOR_MODE_REPLAY_LIST/set(ITERATOR_MODE_REPLAY_LIST ${CAMEL_JSON_NAME}/g" `grep "set(ITERATOR_MODE_REPLAY_LIST" -rl custom_op/op_kernel/CMakeLists.txt`
-        fi
-    fi
     sed -i "s#/usr/local/Ascend/latest#$ASCEND_HOME_DIR#g" `grep "/usr/local/Ascend/latest" -rl custom_op/CMakePresets.json`

     # 构建自定义算子包并安装
@@ -109,17 +88,22 @@ function main() {
     echo "INFO: build and install custom op run package success!"

     # PTA源码仓,可以自行放置zip包
-    if [ ! -f "v${PYTORCH_VERSION}.zip" ]; then
-        wget https://gitee.com/ascend/pytorch/repository/archive/v${PYTORCH_VERSION}.zip --no-check-certificate
+    if [ ! -d "${PTA_DIR}" ]; then
+        if [ ! -f "v${PYTORCH_VERSION}.zip" ]; then
+            wget https://gitee.com/ascend/pytorch/repository/archive/v${PYTORCH_VERSION}.zip --no-check-certificate
+        fi
+        unzip -o -q v${PYTORCH_VERSION}.zip
     fi
-    rm -rf ${PTA_DIR}; unzip -o -q v${PYTORCH_VERSION}.zip

     # PTA自定义算子注册
-    FUNCTION_REGISTE_FIELD=`cat pytorch_patch/npu_native_functions.yaml`
+    FUNCTION_REGISTE_FIELD=`pytorch_patch/npu_native_functions.yaml`
     FUNCTION_REGISTE_FILE="${PTA_DIR}/torch_npu/csrc/aten/npu_native_functions.yaml"
-    if ! grep -q "\ $FUNCTION_REGISTE_FIELD" $FUNCTION_REGISTE_FILE; then
-        sed -i "/custom:/a \ $FUNCTION_REGISTE_FIELD" $FUNCTION_REGISTE_FILE
-    fi
+    cat $FUNCTION_REGISTE_FIELD | while read line
+    do
+        if ! grep -q "\ $line" $FUNCTION_REGISTE_FILE; then
+            sed -i "/custom:/a \ $line" $FUNCTION_REGISTE_FILE
+        fi
+    done

     # PTA自定义算子适配文件
     cp -rf pytorch_patch/*.cpp ${PTA_DIR}/torch_npu/csrc/aten/ops/op_api
@@ -190,7 +174,7 @@ function main() {
         echo "ERROR: timeline files not exist"
         return 1
     fi
-    echo "INFO: Ascend C Add Custom SUCCESS"
+    echo "INFO: Test Ascend C Custom Op SUCCESS"
 }

 check_soc_version
diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/test_ops_custom.py b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/test_ops_custom.py
index 5f7a07ca4..635c37303 100644
--- a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/test_ops_custom.py
+++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/test_ops_custom.py
@@ -24,5 +24,25 @@ class TestCustomAdd(TestCase):
         self.assertRtolEqual(output, x + y)


+class TestReduceMaxCustom(TestCase):
+    def test_reduce_max_custom(self):
+        x_shape = [8, 255, 64, 13]
+        reduce_dim = 2
+        assert x_shape[reduce_dim] % 16 == 0
+        is_keep_dim = 1
+        keep_dim = True if is_keep_dim else False
+        x = torch.rand(x_shape, dtype=torch.float16)
+
+        prof_path = "./prof_total"
+        with torch.npu.profile(prof_path) as prof:
+            torch.npu.synchronize()
+            output = torch_npu.npu_reduce_max_custom(x.npu(), reduce_dim, is_keep_dim)
+            torch.npu.synchronize()
+        cpu_out = torch.max(x, reduce_dim, keep_dim)
+
+        assert torch.allclose(npu_out[0].cpu(), cpu_out[0], rtol=1e-3, atol=1e-3)
+        assert torch.allclose(npu_out[1].cpu().long(), cpu_out[1], rtol=1e-3, atol=1e-3)
+
+
 if __name__ == "__main__":
     run_tests()
-- 
Gitee

From 1cdb5aaeff8f484f9aaeee38e12aa57c20100002 Mon Sep 17 00:00:00 2001
From: liuhanwen8
Date: Wed, 6 Sep 2023 21:06:59 +0800
Subject: [PATCH 2/6] fix host compile errors and set block dim

---
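Note: besides the spelling fixes (PlatformAscendC, isKeepDim, npu_out), the substantive change
here is context->SetBlockDim(usedCoreNum), which tells the runtime how many vector cores to
launch for the kernel; without it patch 1 never dispatched any block. The keep-dim branch that
InferShape implements behaves like this Python sketch:

    def infer_reduce_shape(x_shape, reduce_dim, is_keep_dim):
        """Mirror of the InferShape loop: keep the reduced axis as size 1 or drop it."""
        y_shape = []
        for i, d in enumerate(x_shape):
            if i == reduce_dim:
                if is_keep_dim:
                    y_shape.append(1)
                # otherwise the axis is dropped entirely
            else:
                y_shape.append(d)
        return y_shape  # the idx output shares this shape

    assert infer_reduce_shape([8, 255, 64, 13], 2, True) == [8, 255, 1, 13]
    assert infer_reduce_shape([8, 255, 64, 13], 2, False) == [8, 255, 13]
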
+ */ +#include "kernel_operator.h" +using namespace AscendC; + +constexpr int32_t BUFFER_NUM = 2; + +template +class KernelReduceMax { +public: + __aicore__ inline KernelReduceMax() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR idx, uint32_t coreNum, uint32_t realColVal, + uint32_t oneCalNum, uint32_t calTaskPerCore, uint32_t theSplitCore, + uint32_t roundColVal, uint32_t lastTaskTail) + { + this->isLastCore = GetBlockIdx() == GetBlockNum() - 1 ? true : false; + this->colCalcount = roundColVal; + this->oneRepeatCalcount = oneCalNum; + this->colNum = realColVal; + this->tailSize = roundColVal - realColVal; + this->lastTaskTail = lastTaskTail; + + int32_t offsetX = (calTaskPerCore + 1) * GetBlockIdx() * oneCalNum * realColVal; + int32_t offsetY = (calTaskPerCore + 1) * GetBlockIdx() * oneCalNum; + if (theSplitCore <= GetBlockIdx()) { + offsetX = (calTaskPerCore + 1) * theSplitCore * oneCalNum * realColVal + + (GetBlockIdx() - theSplitCore) * oneCalNum * calTaskPerCore * realColVal; + offsetY = (calTaskPerCore + 1) * theSplitCore * oneCalNum + + (GetBlockIdx() - theSplitCore) * oneCalNum * calTaskPerCore; + } + + xGm.SetGlobalBuffer((__gm__ T*)x + offsetX, colCalcount); + yGm.SetGlobalBuffer((__gm__ T*)y + offsetY, oneRepeatCalcount); + idxGm.SetGlobalBuffer((__gm__ uint32_t*)idx + offsetY, oneRepeatCalcount); + + pipe.InitBuffer(inQueueX, BUFFER_NUM, this->colCalcount * sizeof(T)); + pipe.InitBuffer(outQueueY, BUFFER_NUM, this->oneRepeatCalcount * sizeof(T)); + pipe.InitBuffer(outQueueIdx, BUFFER_NUM, this->oneRepeatCalcount * sizeof(uint32_t)); + pipe.InitBuffer(yTmp, this->colCalcount * sizeof(T)); + pipe.InitBuffer(workTmp, this->colCalcount * sizeof(T)); + } + __aicore__ inline void Process(int32_t loopCount, uint32_t innerTask) + { + for (int32_t i = 0; i < loopCount; i++) { + if (this->isLastCore && i == loopCount - 1) { + innerTask = this->lastTaskTail; + } + CopyIn(i); + Compute(i, innerTask); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + LocalTensor xLocal = inQueueX.AllocTensor(); + inQueueX.EnQue(xLocal); + } + __aicore__ inline void Compute(int32_t progress, uint32_t innerTask) + { + LocalTensor xLocal = inQueueX.DeQue(); + LocalTensor yLocal = outQueueY.AllocTensor(); + LocalTensor idxLocal = outQueueIdx.AllocTensor(); + + LocalTensor yTmpLocal = yTmp.Get(); + LocalTensor workTmpLocal = workTmp.Get(); + for (int32_t i = 0; i < innerTask; i++) { + DataCopy(xLocal, xGm[(progress * this->oneRepeatCalcount + i) * this->colNum], this->colCalcount); + ReduceMax(yTmpLocal, xLocal, workTmpLocal, this->colCalcount, true); + yLocal.SetValue(i, yTmpLocal.GetValue(0)); + T indexVal = yTmpLocal.GetValue(1); + uint32_t index = 0; + index = *reinterpret_cast(&indexVal); + idxLocal.SetValue(1, index); + } + outQueueY.EnQue(yLocal); + outQueueIdx.EnQue(idxLocal); + inQueueX.FreeTensor(xLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + LocalTensor yLocal = outQueueY.DeQue(); + LocalTensor idxLocal = outQueueIdx.DeQue(); + DataCopy(yGm[progress * this->oneRepeatCalcount], yLocal, this->oneRepeatCalcount); + DataCopy(idxGm[progress * this->oneRepeatCalcount], idxLocal, this->oneRepeatCalcount); + outQueueY.FreeTensor(yLocal); + outQueueIdx.FreeTensor(idxLocal); + } + +private: + TPipe pipe; + TQue inQueueX; + TQue outQueueY, outQueueIdx; + + TBuf yTmp, workTmp; + GlobalTensor xGm; + GlobalTensor yGm; + GlobalTensor idxGm; + + int32_t colCalcount; + int32_t oneRepeatCalcount; + int32_t colNum; + int32_t tailSize; + 
 .../op_dev/op_kernel/reduce_max_custom.cpp | 66 ++++++++++---------
 1 file changed, 34 insertions(+), 32 deletions(-)

diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_kernel/reduce_max_custom.cpp b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_kernel/reduce_max_custom.cpp
index a59686a8b..09eb6ba9d 100644
--- a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_kernel/reduce_max_custom.cpp
+++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_kernel/reduce_max_custom.cpp
@@ -30,15 +30,17 @@ public:
                       (GetBlockIdx() - theSplitCore) * oneCalNum * calTaskPerCore;
         }

-        xGm.SetGlobalBuffer((__gm__ T*)x + offsetX, colCalcount);
-        yGm.SetGlobalBuffer((__gm__ T*)y + offsetY, oneRepeatCalcount);
-        idxGm.SetGlobalBuffer((__gm__ uint32_t*)idx + offsetY, oneRepeatCalcount);
+        xGm.SetGlobalBuffer((__gm__ T*)x + offsetX, this->colCalcount);
+        yGm.SetGlobalBuffer((__gm__ T*)y + offsetY, this->oneRepeatCalcount);
+        idxGm.SetGlobalBuffer((__gm__ uint32_t*)idx + offsetY, this->oneRepeatCalcount);

         pipe.InitBuffer(inQueueX, BUFFER_NUM, this->colCalcount * sizeof(T));
         pipe.InitBuffer(outQueueY, BUFFER_NUM, this->oneRepeatCalcount * sizeof(T));
         pipe.InitBuffer(outQueueIdx, BUFFER_NUM, this->oneRepeatCalcount * sizeof(uint32_t));
         pipe.InitBuffer(yTmp, this->colCalcount * sizeof(T));
         pipe.InitBuffer(workTmp, this->colCalcount * sizeof(T));
+        yTmpLocal = yTmp.Get<T>();
+        workTmpLocal = workTmp.Get<T>();
     }
     __aicore__ inline void Process(int32_t loopCount, uint32_t innerTask)
     {
@@ -46,45 +48,42 @@ public:
             if (this->isLastCore && i == loopCount - 1) {
                 innerTask = this->lastTaskTail;
             }
-            CopyIn(i);
-            Compute(i, innerTask);
+            yLocal = outQueueY.AllocTensor<T>();
+            idxLocal = outQueueIdx.AllocTensor<uint32_t>();
+            for (int32_t j = 0; j < innerTask; j++) {
+                CopyIn(i, j);
+                Compute();
+            }
+            outQueueY.EnQue(yLocal);
+            outQueueIdx.EnQue(idxLocal);
             CopyOut(i);
         }
     }

 private:
-    __aicore__ inline void CopyIn(int32_t progress)
+    __aicore__ inline void CopyIn(int32_t i, int32_t j)
     {
-        LocalTensor<T> xLocal = inQueueX.AllocTensor<T>();
+        xLocal = inQueueX.AllocTensor<T>();
+        DataCopy(xLocal, xGm[(i * this->oneRepeatCalcount + j) * this->colNum], this->colCalcount);
         inQueueX.EnQue(xLocal);
     }
-    __aicore__ inline void Compute(int32_t progress, uint32_t innerTask)
+    __aicore__ inline void Compute(int32_t j)
     {
-        LocalTensor<T> xLocal = inQueueX.DeQue<T>();
-        LocalTensor<T> yLocal = outQueueY.AllocTensor<T>();
-        LocalTensor<uint32_t> idxLocal = outQueueIdx.AllocTensor<uint32_t>();
-
-        LocalTensor<T> yTmpLocal = yTmp.Get<T>();
-        LocalTensor<T> workTmpLocal = workTmp.Get<T>();
-        for (int32_t i = 0; i < innerTask; i++) {
-            DataCopy(xLocal, xGm[(progress * this->oneRepeatCalcount + i) * this->colNum], this->colCalcount);
-            ReduceMax(yTmpLocal, xLocal, workTmpLocal, this->colCalcount, true);
-            yLocal.SetValue(i, yTmpLocal.GetValue(0));
-            T indexVal = yTmpLocal.GetValue(1);
-            uint32_t index = 0;
-            index = *reinterpret_cast<uint16_t *>(&indexVal);
-            idxLocal.SetValue(1, index);
-        }
-        outQueueY.EnQue(yLocal);
-        outQueueIdx.EnQue(idxLocal);
+        xLocal = inQueueX.DeQue<T>();
+        ReduceMax(yTmpLocal, xLocal, workTmpLocal, this->colCalcount, true);
+        yLocal.SetValue(j, yTmpLocal.GetValue(0));
+        T indexVal = yTmpLocal.GetValue(1);
+        uint32_t index = 0;
+        index = *reinterpret_cast<uint16_t *>(&indexVal);
+        idxLocal.SetValue(j, index);
         inQueueX.FreeTensor(xLocal);
     }
-    __aicore__ inline void CopyOut(int32_t progress)
+    __aicore__ inline void CopyOut(int32_t i)
     {
-        LocalTensor<T> yLocal = outQueueY.DeQue<T>();
-        LocalTensor<uint32_t> idxLocal = outQueueIdx.DeQue<uint32_t>();
-        DataCopy(yGm[progress * this->oneRepeatCalcount], yLocal, this->oneRepeatCalcount);
-        DataCopy(idxGm[progress * this->oneRepeatCalcount], idxLocal, this->oneRepeatCalcount);
+        yLocal = outQueueY.DeQue<T>();
+        idxLocal = outQueueIdx.DeQue<uint32_t>();
+        DataCopy(yGm[i * this->oneRepeatCalcount], yLocal, this->oneRepeatCalcount);
+        DataCopy(idxGm[i * this->oneRepeatCalcount], idxLocal, this->oneRepeatCalcount);
         outQueueY.FreeTensor(yLocal);
         outQueueIdx.FreeTensor(idxLocal);
     }
@@ -95,10 +94,13 @@ private:
     TQue<QuePosition::VECOUT, BUFFER_NUM> outQueueY, outQueueIdx;

     TBuf<TPosition::VECCALC> yTmp, workTmp;
-    GlobalTensor<T> xGm;
-    GlobalTensor<T> yGm;
+    GlobalTensor<T> xGm, yGm;
     GlobalTensor<uint32_t> idxGm;

+    LocalTensor<T> yTmpLocal, workTmpLocal;
+    LocalTensor<T> xLocal, yLocal;
+    LocalTensor<uint32_t> idxLocal;
+
     int32_t colCalcount;
     int32_t oneRepeatCalcount;
     int32_t colNum;
-- 
Gitee

From f9062e1d2e80facfafde67c14dbe18cd5d817a42 Mon Sep 17 00:00:00 2001
From: liuhanwen8
Date: Thu, 7 Sep 2023 09:53:06 +0800
Subject: [PATCH 4/6] fix Compute call and yaml path quoting

---
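Note: the run.sh fix matters more than it looks: with backquotes the shell tried to execute the
yaml file as a command and left FUNCTION_REGISTE_FIELD empty, so no functions were registered.
The surrounding while-read loop appends each func line under the custom: key only when it is
not already present; in Python terms it behaves roughly like this (file handling elided, the
helper name and simplified yaml lines are hypothetical):

    def merge_yaml(patch_lines, target_lines):
        """Append each '- func: ...' entry after the 'custom:' key unless already present."""
        for entry in patch_lines:
            if not any(entry.strip() == line.strip() for line in target_lines):
                at = target_lines.index("custom:") + 1
                target_lines.insert(at, "  " + entry.strip())
        return target_lines

    target = ["custom:", "  - func: npu_add_custom(Tensor x, Tensor y) -> Tensor"]
    patch = ["- func: npu_reduce_max_custom(Tensor x, int reduceDim, int isKeepDim=1) -> (Tensor, Tensor)"]
    print("\n".join(merge_yaml(patch, target)))
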
 .../acl_invocation/op_dev/op_kernel/reduce_max_custom.cpp  | 2 +-
 .../4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_kernel/reduce_max_custom.cpp b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_kernel/reduce_max_custom.cpp
index 09eb6ba9d..05d53edbe 100644
--- a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_kernel/reduce_max_custom.cpp
+++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_kernel/reduce_max_custom.cpp
@@ -52,7 +52,7 @@ public:
             idxLocal = outQueueIdx.AllocTensor<uint32_t>();
             for (int32_t j = 0; j < innerTask; j++) {
                 CopyIn(i, j);
-                Compute();
+                Compute(j);
             }
             outQueueY.EnQue(yLocal);
             outQueueIdx.EnQue(idxLocal);
diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh
index 3a7466f08..039ca4812 100644
--- a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh
+++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh
@@ -96,7 +96,7 @@ function main() {
     fi

     # PTA自定义算子注册
-    FUNCTION_REGISTE_FIELD=`pytorch_patch/npu_native_functions.yaml`
+    FUNCTION_REGISTE_FIELD="pytorch_patch/npu_native_functions.yaml"
     FUNCTION_REGISTE_FILE="${PTA_DIR}/torch_npu/csrc/aten/npu_native_functions.yaml"
     cat $FUNCTION_REGISTE_FIELD | while read line
     do
-- 
Gitee

From 2411ec73e4aeb10f8360f16092bf17258309c758 Mon Sep 17 00:00:00 2001
From: liuhanwen8
Date: Thu, 7 Sep 2023 11:31:23 +0800
Subject: [PATCH 5/6] delete summary check

---
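Note: torch.npu.profile can leave several PROF_* directories under prof_total, so the export
below iterates over them instead of passing the whole ls output to msprof as a single path,
which is what the deleted code effectively did. The loop corresponds to this sketch (assumes
msprof is on PATH):

    import os
    import subprocess

    prof_root = "./prof_total"
    for entry in os.listdir(prof_root):  # one PROF_* directory per collection
        prof_dir = os.path.join(prof_root, entry)
        ret = subprocess.run(["msprof", "--export=on", f"--output={prof_dir}"]).returncode
        if ret != 0:
            raise SystemExit(f"parse failed for {prof_dir}")
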
 .../pytorch_invocation/run.sh | 64 ++++---------------
 1 file changed, 11 insertions(+), 53 deletions(-)

diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh
index 039ca4812..8928257ff 100644
--- a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh
+++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh
@@ -120,60 +120,18 @@ function main() {

     # 解析dump文件为numpy文件
     files=$(ls ./prof_total)
-    cd $CURRENT_DIR/prof_total/$files
-    msprof --export=on --output=$CURRENT_DIR/prof_total/$files
-    if [[ $? -eq 0 ]];then
-        echo "INFO: parse success"
-    else
-        echo "ERROR: pasrse failed"
-        return 1
-    fi
-
-    # 校验summary文件夹
-    summary_list=(
-        acl_0_1.csv
-        acl_statistic_0_1.csv
-        ge_op_execute_0_1.csv
-        op_statistic_0_1.csv
-        op_summary_0_1.csv
-        prof_rule_0.json
-        runtime_api_0_1.csv
-        task_time_0_1.csv
-    )
-    if [ $(ls ./device_*/summary/ | wc -l) -eq ${#summary_list[@]} ];then
-        for summary in ${summary_list[@]}; do
-            if [ ! -f $(pwd)/device_0/summary/$summary ];then
-                echo "ERROR: summary files not exist"
-                return 1
-            fi
-        done
-        echo "INFO: All summary result exist"
-    else
-        echo "ERROR: check summary result fail"
-        return 1
-    fi
+    for line in $files;
+    do
+        cd $CURRENT_DIR/prof_total/$line
+        msprof --export=on --output=$CURRENT_DIR/prof_total/$line
+        if [[ $? -eq 0 ]];then
+            echo "INFO: parse success"
+        else
+            echo "ERROR: parse failed"
+            return 1
+        fi
+    done

-    # 校验timeline文件夹
-    timeline_list=(
-        acl_0_1.json
-        ge_op_execute_0_1.json
-        msprof_0_1.json
-        runtime_api_0_1.json
-        task_time_0_1.json
-        thread_group_0_1.json
-    )
-    if [ $(ls ./device_*/timeline/ | wc -l) -eq ${#timeline_list[@]} ];then
-        for timeline in ${timeline_list[@]}; do
-            if [ ! -f $(pwd)/device_0/timeline/$timeline ];then
-                echo "ERROR: timeline files not exist"
-                return 1
-            fi
-        done
-        echo "INFO: timeline files exist"
-    else
-        echo "ERROR: timeline files not exist"
-        return 1
-    fi
     echo "INFO: Test Ascend C Custom Op SUCCESS"
 }
-- 
Gitee

From 546b4750462b4a55e86bdf1c7e181ac2e6ea85b6 Mon Sep 17 00:00:00 2001
From: liuhanwen8
Date: Thu, 7 Sep 2023 14:22:00 +0800
Subject: [PATCH 6/6] add summary check

---
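Note: the restored checks only verify file-name stems (api_statistic_0_1, msprof_0_1, ...)
rather than the exact names the deleted check expected, since the exported names carry
extensions and can vary between msprof versions. The bash test keeps the glob on the pattern
side, [[ $files != *$filename* ]], which in Python terms is roughly (helper name hypothetical):

    expected_summary = ["api_statistic_0_1", "op_statistic_0_1", "op_summary_0_1",
                        "prof_rule_0", "task_time_0_1"]

    def check_artifacts(names_on_disk, expected):
        """Fail when an expected stem is missing from the exported file names."""
        for stem in expected:
            if not any(stem in name for name in names_on_disk):
                return False
        return len(names_on_disk) == len(expected)

    assert check_artifacts(["api_statistic_0_1.csv", "op_statistic_0_1.csv",
                            "op_summary_0_1.csv", "prof_rule_0.json", "task_time_0_1.csv"],
                           expected_summary)
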
 .../pytorch_invocation/run.sh | 48 +++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh
index 8928257ff..97d52d2a4 100644
--- a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh
+++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh
@@ -56,6 +56,51 @@ print(get_soc_version())
     SOC_SHORT_VERSION=`echo $SOC_VERSION_CONCAT | cut -d ',' -f 2`
 }

+# 校验summary文件夹
+function check_summary() {
+    summary_filename_list=(
+        api_statistic_0_1
+        op_statistic_0_1
+        op_summary_0_1
+        prof_rule_0
+        task_time_0_1
+    )
+    files=$(ls ./device_*/summary/)
+    if [ $(ls ./device_*/summary/ | wc -l) -eq ${#summary_filename_list[@]} ];then
+        for filename in ${summary_filename_list[@]}; do
+            if [[ $files != *$filename* ]];then
+                echo "ERROR: summary files not exist"
+                return 1
+            fi
+        done
+        echo "INFO: All summary result exist"
+    else
+        echo "ERROR: check summary result fail"
+        return 1
+    fi
+}
+
+# 校验timeline文件夹
+function check_timeline() {
+    timeline_filename_list=(
+        msprof_0_1
+        task_time_0_1
+    )
+    files=$(ls ./device_*/timeline/)
+    if [ $(ls ./device_*/timeline/ | wc -l) -eq ${#timeline_filename_list[@]} ];then
+        for filename in ${timeline_filename_list[@]}; do
+            if [[ $files != *$filename* ]];then
+                echo "ERROR: timeline files not exist"
+                return 1
+            fi
+        done
+        echo "INFO: All timeline result exist"
+    else
+        echo "ERROR: check timeline result fail"
+        return 1
+    fi
+}
+
 function main() {
     # 清除遗留生成文件和日志文件
     rm -rf $HOME/ascend/log/*
@@ -130,6 +175,9 @@ function main() {
             echo "ERROR: parse failed"
             return 1
         fi
+
+        check_summary
+        check_timeline
     done

     echo "INFO: Test Ascend C Custom Op SUCCESS"
-- 
Gitee
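Note: once run.sh has rebuilt and installed the patched torch_npu wheel, the yaml entry
surfaces as a two-output Python API with isKeepDim defaulting to 1. A minimal smoke test
(assumes an NPU device and the patched wheel are available):

    import torch
    import torch_npu

    x = torch.rand(8, 255, 64, 13, dtype=torch.float16).npu()
    values, indices = torch_npu.npu_reduce_max_custom(x, 2)  # isKeepDim defaults to 1
    reference = torch.max(x.cpu(), 2, True)
    assert torch.allclose(values.cpu(), reference.values, rtol=1e-3, atol=1e-3)
    # indices can legitimately differ from CPU argmax on ties, so compare values only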