From f0818d20fe6d160c66782977685b4bd02851aad7 Mon Sep 17 00:00:00 2001
From: liuhanwen8
Date: Wed, 6 Sep 2023 20:36:07 +0800
Subject: [PATCH 1/6] add reduce_max pytorch adapter

---
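Note: TilingFunc below views the input as rowNum rows of colNum elements, where colNum is the
extent of reduceDim (the adapter moves that axis to the end before launching the kernel). Each
row is padded up to a multiple of 16 fp16 elements (one 32-byte block), rows are grouped into
batches of 32, and batches are dealt out across the AI Vector cores; the first theSplitCore
cores take one extra batch. A minimal Python sketch of the split arithmetic, using the shape
from test_ops_custom.py (the 40-core count is an assumed example; the real value comes from
GetCoreNumAiv()):

    # Tiling arithmetic for x_shape = [8, 255, 64, 13] reduced over dim 2.
    ELE_NUM_PER_BANK = 16        # fp16 elements per 32-byte block
    BATCH_PER_CORE = 32          # rows reduced per task
    act_core_num = 40            # assumed AIV core count, for illustration only

    col_num = 64                 # extent of the reduced axis, moved last by the adapter
    row_num = 8 * 255 * 13       # the remaining elements form the rows
    all_tasks = (row_num + BATCH_PER_CORE - 1) // BATCH_PER_CORE
    used_core_num = min(all_tasks, act_core_num)
    cal_task_per_core = all_tasks // used_core_num  # every core runs at least this many tasks
    the_split_core = all_tasks % used_core_num      # leading cores that run one extra task
    last_task_tail = row_num % BATCH_PER_CORE or BATCH_PER_CORE
    col_calcount = (col_num + ELE_NUM_PER_BANK - 1) // ELE_NUM_PER_BANK * ELE_NUM_PER_BANK
    print(used_core_num, cal_task_per_core, the_split_core, last_task_tail, col_calcount)
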
 .../op_dev/op_host/reduce_max_custom.cpp      | 121 +++++++++++++++++
 .../op_dev/op_host/reduce_max_custom_tiling.h |  21 +++
 .../op_dev/op_kernel/reduce_max_custom.cpp    | 125 ++++++++++++++++++
 .../ReduceMaxCustomKernelNpu.cpp              |  32 +++++
 .../pytorch_patch/npu_native_functions.yaml   |   3 +-
 .../pytorch_invocation/readme.md              |   7 +-
 .../pytorch_invocation/run.sh                 |  46 +++----
 .../pytorch_invocation/test_ops_custom.py     |  20 +++
 8 files changed, 338 insertions(+), 37 deletions(-)
 create mode 100644 cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_host/reduce_max_custom.cpp
 create mode 100644 cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_host/reduce_max_custom_tiling.h
 create mode 100644 cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_kernel/reduce_max_custom.cpp
 create mode 100644 cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/pytorch_patch/ReduceMaxCustomKernelNpu.cpp

diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_host/reduce_max_custom.cpp b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_host/reduce_max_custom.cpp
new file mode 100644
index 000000000..84f3548fb
--- /dev/null
+++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_host/reduce_max_custom.cpp
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved.
+ */
+#include "reduce_max_custom_tiling.h"
+#include "register/op_def_registry.h"
+#include "tiling/platform/platform_ascendc.h"
+
+constexpr int32_t ELE_NUM_PER_BANK = 16; // half
+constexpr int32_t BATCH_PER_CORE = 32;
+
+namespace optiling {
+static ge::graphStatus TilingFunc(gert::TilingContext* context)
+{
+    ReduceMaxTilingData tiling;
+    const gert::StorageShape* xShapePtr = context->GetInputShape(0);
+    const gert::Shape& xShape = xShapePtr->GetStorageShape();
+    const gert::RuntimeAttrs* attrs = context->GetAttrs();
+    const uint32_t* reduceDim = attrs->GetAttrPointer<uint32_t>(0);
+    const uint32_t xEleNum = context->GetInputTensor(0)->GetShapeSize();
+    const uint32_t colNum = xShape.GetDim(*reduceDim);
+    const uint32_t rowNum = xEleNum / colNum;
+
+    auto ascendcPlatform = platform_ascendc::PlatformAscendc(context->GetPlatformInfo());
+    const uint32_t actCoreNum = ascendcPlatform.GetCoreNumAiv();
+    uint32_t oneRepeatCalcount = BATCH_PER_CORE;
+    uint32_t allTasks = (rowNum + oneRepeatCalcount - 1) / oneRepeatCalcount;
+    uint32_t usedCoreNum = allTasks;
+    if (usedCoreNum > actCoreNum) {
+        usedCoreNum = actCoreNum;
+    }
+    uint32_t calTaskPerCore = allTasks / usedCoreNum;
+    uint32_t theSplitCore = allTasks % usedCoreNum;
+    uint32_t lastTaskTail = rowNum % oneRepeatCalcount;
+    if (lastTaskTail == 0) {
+        lastTaskTail = oneRepeatCalcount;
+    }
+    uint32_t colCalcount = (colNum + ELE_NUM_PER_BANK - 1) / ELE_NUM_PER_BANK * ELE_NUM_PER_BANK;
+
+    tiling.set_coreNum(usedCoreNum);
+    tiling.set_realColVal(colNum);
+    tiling.set_oneCalNum(oneRepeatCalcount);
+    tiling.set_calTaskPerCore(calTaskPerCore);
+    tiling.set_theSplitCore(theSplitCore);
+    tiling.set_roundColVal(colCalcount);
+    tiling.set_lastTaskTail(lastTaskTail);
+
+    tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity());
+    context->GetRawTilingData()->SetDataSize(tiling.GetDataSize());
+    context->SetTilingKey(1);
+    size_t *currentWorkspace = context->GetWorkspaceSizes(1);
+    currentWorkspace[0] = 0;
+    return ge::GRAPH_SUCCESS;
+}
+} // namespace optiling
+
+namespace ge {
+static graphStatus InferShape(gert::InferShapeContext* context)
+{
+    const gert::Shape* xShape = context->GetInputShape(0);
+    const gert::RuntimeAttrs* attrs = context->GetAttrs();
+    const uint32_t* reduceDim = attrs->GetAttrPointer<uint32_t>(0);
+    const uint32_t* isKeepDim = attrs->GetAttrPointer<uint32_t>(1);
+    gert::Shape* yShape = context->GetOutputShape(0);
+    gert::Shape* idxShape = context->GetOutputShape(1);
+
+    for (size_t i = 0; i < xShape->GetDimNum(); i++) {
+        if (i == *reduceDim) {
+            if (*idKeepDim) {
+                yShape->AppendDim(1);
+            } else {
+                continue;
+            }
+        } else {
+            yShape->AppendDim(xShape->GetDim(i));
+        }
+    }
+    *idxShape = *yShape;
+
+    return GRAPH_SUCCESS;
+}
+} // namespace ge
+
+namespace ops {
+class ReduceMaxCustom : public OpDef {
+public:
+    explicit ReduceMaxCustom(const char* name) : OpDef(name)
+    {
+        this->Input("x")
+            .ParamType(REQUIRED)
+            .DataType({ ge::DT_FLOAT16 })
+            .Format({ ge::FORMAT_ND })
+            .UnknownShapeFormat({ ge::FORMAT_ND });
+        this->Output("y")
+            .ParamType(REQUIRED)
+            .DataType({ ge::DT_FLOAT16 })
+            .Format({ ge::FORMAT_ND })
+            .UnknownShapeFormat({ ge::FORMAT_ND });
+        this->Output("idx")
+            .ParamType(REQUIRED)
+            .DataType({ ge::DT_INT32 })
+            .Format({ ge::FORMAT_ND })
+            .UnknownShapeFormat({ ge::FORMAT_ND });
+
+        this->Attr("reduceDim")
+            .AttrType(REQUIRED)
+            .Int();
+        this->Attr("isKeepDim")
+            .AttrType(OPTIONAL)
+            .Int(1);
+
+        this->SetInferShape(ge::InferShape);
+
+        this->AICore()
+            .SetTiling(optiling::TilingFunc);
+
+        this->AICore().AddConfig("ascend910");
+    }
+};
+
+OP_ADD(ReduceMaxCustom);
+} // namespace ops
diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_host/reduce_max_custom_tiling.h b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_host/reduce_max_custom_tiling.h
new file mode 100644
index 000000000..05f0f8870
--- /dev/null
+++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_host/reduce_max_custom_tiling.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved.
+ */
+#ifndef REDUCE_MAX_CUSTOM_TILING_H
+#define REDUCE_MAX_CUSTOM_TILING_H
+#include "register/tilingdata_base.h"
+
+namespace optiling {
+BEGIN_TILING_DATA_DEF(ReduceMaxTilingData)
+    TILING_DATA_FIELD_DEF(uint32_t, coreNum);
+    TILING_DATA_FIELD_DEF(uint32_t, realColVal);
+    TILING_DATA_FIELD_DEF(uint32_t, oneCalNum);
+    TILING_DATA_FIELD_DEF(uint32_t, calTaskPerCore);
+    TILING_DATA_FIELD_DEF(uint32_t, theSplitCore);
+    TILING_DATA_FIELD_DEF(uint32_t, roundColVal);
+    TILING_DATA_FIELD_DEF(uint32_t, lastTaskTail);
+END_TILING_DATA_DEF;
+
+REGISTER_TILING_DATA_CLASS(ReduceMaxCustom, ReduceMaxTilingData)
+}
+#endif // REDUCE_MAX_CUSTOM_TILING_H
diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_kernel/reduce_max_custom.cpp b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_kernel/reduce_max_custom.cpp
new file mode 100644
index 000000000..a59686a8b
--- /dev/null
+++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_kernel/reduce_max_custom.cpp
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved.
+ */
+#include "kernel_operator.h"
+using namespace AscendC;
+
+constexpr int32_t BUFFER_NUM = 2;
+
+template <typename T>
+class KernelReduceMax {
+public:
+    __aicore__ inline KernelReduceMax() {}
+    __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR idx, uint32_t coreNum, uint32_t realColVal,
+                                uint32_t oneCalNum, uint32_t calTaskPerCore, uint32_t theSplitCore,
+                                uint32_t roundColVal, uint32_t lastTaskTail)
+    {
+        this->isLastCore = GetBlockIdx() == GetBlockNum() - 1 ? true : false;
+        this->colCalcount = roundColVal;
+        this->oneRepeatCalcount = oneCalNum;
+        this->colNum = realColVal;
+        this->tailSize = roundColVal - realColVal;
+        this->lastTaskTail = lastTaskTail;
+
+        int32_t offsetX = (calTaskPerCore + 1) * GetBlockIdx() * oneCalNum * realColVal;
+        int32_t offsetY = (calTaskPerCore + 1) * GetBlockIdx() * oneCalNum;
+        if (theSplitCore <= GetBlockIdx()) {
+            offsetX = (calTaskPerCore + 1) * theSplitCore * oneCalNum * realColVal +
+                      (GetBlockIdx() - theSplitCore) * oneCalNum * calTaskPerCore * realColVal;
+            offsetY = (calTaskPerCore + 1) * theSplitCore * oneCalNum +
+                      (GetBlockIdx() - theSplitCore) * oneCalNum * calTaskPerCore;
+        }
+
+        xGm.SetGlobalBuffer((__gm__ T*)x + offsetX, colCalcount);
+        yGm.SetGlobalBuffer((__gm__ T*)y + offsetY, oneRepeatCalcount);
+        idxGm.SetGlobalBuffer((__gm__ uint32_t*)idx + offsetY, oneRepeatCalcount);
+
+        pipe.InitBuffer(inQueueX, BUFFER_NUM, this->colCalcount * sizeof(T));
+        pipe.InitBuffer(outQueueY, BUFFER_NUM, this->oneRepeatCalcount * sizeof(T));
+        pipe.InitBuffer(outQueueIdx, BUFFER_NUM, this->oneRepeatCalcount * sizeof(uint32_t));
+        pipe.InitBuffer(yTmp, this->colCalcount * sizeof(T));
+        pipe.InitBuffer(workTmp, this->colCalcount * sizeof(T));
+    }
+    __aicore__ inline void Process(int32_t loopCount, uint32_t innerTask)
+    {
+        for (int32_t i = 0; i < loopCount; i++) {
+            if (this->isLastCore && i == loopCount - 1) {
+                innerTask = this->lastTaskTail;
+            }
+            CopyIn(i);
+            Compute(i, innerTask);
+            CopyOut(i);
+        }
+    }
+
+private:
+    __aicore__ inline void CopyIn(int32_t progress)
+    {
+        LocalTensor<T> xLocal = inQueueX.AllocTensor<T>();
+        inQueueX.EnQue(xLocal);
+    }
+    __aicore__ inline void Compute(int32_t progress, uint32_t innerTask)
+    {
+        LocalTensor<T> xLocal = inQueueX.DeQue<T>();
+        LocalTensor<T> yLocal = outQueueY.AllocTensor<T>();
+        LocalTensor<uint32_t> idxLocal = outQueueIdx.AllocTensor<uint32_t>();
+
+        LocalTensor<T> yTmpLocal = yTmp.Get<T>();
+        LocalTensor<T> workTmpLocal = workTmp.Get<T>();
+        for (int32_t i = 0; i < innerTask; i++) {
+            DataCopy(xLocal, xGm[(progress * this->oneRepeatCalcount + i) * this->colNum], this->colCalcount);
+            ReduceMax(yTmpLocal, xLocal, workTmpLocal, this->colCalcount, true);
+            yLocal.SetValue(i, yTmpLocal.GetValue(0));
+            T indexVal = yTmpLocal.GetValue(1);
+            uint32_t index = 0;
+            index = *reinterpret_cast<uint16_t *>(&indexVal);
+            idxLocal.SetValue(1, index);
+        }
+        outQueueY.EnQue(yLocal);
+        outQueueIdx.EnQue(idxLocal);
+        inQueueX.FreeTensor(xLocal);
+    }
+    __aicore__ inline void CopyOut(int32_t progress)
+    {
+        LocalTensor<T> yLocal = outQueueY.DeQue<T>();
+        LocalTensor<uint32_t> idxLocal = outQueueIdx.DeQue<uint32_t>();
+        DataCopy(yGm[progress * this->oneRepeatCalcount], yLocal, this->oneRepeatCalcount);
+        DataCopy(idxGm[progress * this->oneRepeatCalcount], idxLocal, this->oneRepeatCalcount);
+        outQueueY.FreeTensor(yLocal);
+        outQueueIdx.FreeTensor(idxLocal);
+    }
+
+private:
+    TPipe pipe;
+    TQue<QuePosition::VECIN, BUFFER_NUM> inQueueX;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> outQueueY, outQueueIdx;
+
+    TBuf<TPosition::VECCALC> yTmp, workTmp;
+    GlobalTensor<T> xGm;
+    GlobalTensor<T> yGm;
+    GlobalTensor<uint32_t> idxGm;
+
+    int32_t colCalcount;
+    int32_t oneRepeatCalcount;
+    int32_t colNum;
+    int32_t tailSize;
+    int32_t lastTaskTail;
+    bool isLastCore;
+};
+
+extern "C" __global__ __aicore__ void reduce_max_custom(GM_ADDR x, GM_ADDR y, GM_ADDR idx, GM_ADDR workspace,
+                                                        GM_ADDR tiling)
+{
+    GET_TILING_DATA(tilingData, tiling);
+    KernelReduceMax<half> op;
+    op.Init(x, y, idx, tilingData.coreNum, tilingData.realColVal, tilingData.oneCalNum,
+            tilingData.calTaskPerCore, tilingData.theSplitCore, tilingData.roundColVal,
+            tilingData.lastTaskTail);
+    int32_t loopCount = tilingData.calTaskPerCore + 1;
+    if (tilingData.theSplitCore <= GetBlockIdx()) {
+        loopCount = tilingData.calTaskPerCore;
+    }
+    if (TILING_KEY_IS(1)) {
+        op.Process(loopCount, tilingData.oneCalNum);
+    }
+}
diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/pytorch_patch/ReduceMaxCustomKernelNpu.cpp b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/pytorch_patch/ReduceMaxCustomKernelNpu.cpp
new file mode 100644
index 000000000..f20f309c5
--- /dev/null
+++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/pytorch_patch/ReduceMaxCustomKernelNpu.cpp
@@ -0,0 +1,32 @@
+#include <torch/csrc/autograd/custom_function.h>
+
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
+#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+#include "torch_npu/csrc/aten/ops/op_api/op_api_common.h"
+
+namespace at_npu {
+namespace native {
+using torch::autograd::Function;
+using torch::autograd::AutogradContext;
+
+tuple<at::Tensor, at::Tensor> NPUNativeFunctions::npu_reduce_max_custom(const at::Tensor& x,
+                                                                        int64_t reduceDim,
+                                                                        int64_t isKeepDim) {
+    at::IntArrayRef dim = reduceDim;
+    bool keepDimBool = isKeepDim ? true : false;
+    at::Tensor y = OpPreparation::ApplyTensorWithoutFormat(x, reduce_ops_npu_output_size(x, dim, true));
+    at::Tensor idx = NPUNativeFunctions::npu_dtype_cast(y, at::kInt);
+    int64_t lastDimVal = x.sizes().size() - 1;
+    at::Tensor xTrans = x;
+    for (int64_t i = reduceDim; i < lastDimVal; i++) {
+        xTrans = xTrans.transpose(i, i + 1).contiguous();
+    }
+    EXEC_NPU_CMD(aclnnReduceMaxCustom, xTrans, lastDimVal, isKeepDim, y, idx);
+    auto outputShape = reduce_ops_npu_output_size(x, dim, keepDimBool);
+    y = y.reshape(outputShape);
+    idx = idx.reshape(outputShape);
+    return tuple<at::Tensor, at::Tensor>(y, idx);
+}
+} // namespace native
+} // namespace at_npu
diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/pytorch_patch/npu_native_functions.yaml b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/pytorch_patch/npu_native_functions.yaml
index 1ad90dbcf..ac6766375 100644
--- a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/pytorch_patch/npu_native_functions.yaml
+++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/pytorch_patch/npu_native_functions.yaml
@@ -1 +1,2 @@
-- func: npu_add_custom(Tensor x, Tensor y) -> Tensor
\ No newline at end of file
+- func: npu_add_custom(Tensor x, Tensor y) -> Tensor
+- func: npu_reduce_max_custom(Tensor x, int reduceDim, int isKeepDim=1) -> (Tensor, Tensor)
diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/readme.md b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/readme.md
index 104a863ef..846beba34 100644
--- a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/readme.md
+++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/readme.md
@@ -1,5 +1,2 @@
-# acl samples
-bash run.sh ${is_dynamic}(0/1) ${replay_mode}(/batch/iterator)
-
-# run static op (depend on chip version)
-bash run.sh 0
+# run PyTorch samples
+bash run.sh
diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh
index e2da06738..3a7466f08 100644
--- a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh
+++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh
@@ -10,9 +10,7 @@ CURRENT_DIR=$(
 ); cd $CURRENT_DIR

 # 导出环境变量
-IS_DYNAMIC=$1
-REPLAY_MODE=$2
-PYTORCH_VERSION=1.11.0
+PYTORCH_VERSION=5.0.rc2-pytorch1.11.0
 PTA_DIR=pytorch-v${PYTORCH_VERSION}

 if [ ! $ASCEND_HOME_DIR ]; then
@@ -59,18 +57,6 @@ print(get_soc_version())
 }

 function main() {
-    if [[ ${IS_DYNAMIC}"x" = "x" ]]; then
-        echo "ERROR: IS_DYNAMIC is invalid!"
-        return 1
-    fi
-
-    if [[ ${REPLAY_MODE}"x" = "x" || ${REPLAY_MODE} = "batch" || ${REPLAY_MODE} = "iterator" ]]; then
-        echo "INFO: REPLAY_MODE valid : ${REPLAY_MODE}"
-    else
-        echo "ERROR: REPLAY_MODE is invalid!"
-        return 1
-    fi
-
     # 清除遗留生成文件和日志文件
     rm -rf $HOME/ascend/log/*
     rm -rf $ASCEND_OPP_PATH/vendors/*
@@ -79,7 +65,7 @@ function main() {
     # 生成自定义算子工程样例
     JSON_NAME=add_custom
    CAMEL_JSON_NAME=`echo $JSON_NAME | sed -r 's/(^|-|_)(\w)/\U\2/g'`
-    msopgen gen -i op_dev/${JSON_NAME}.json -f tf -c ai_core-${SOC_SHORT_VERSION} -lan cpp -out ./custom_op
+    msopgen gen -i op_dev/${JSON_NAME}.json -f tf -c ai_core-${SOC_FULL_VERSION} -lan cpp -out ./custom_op
     if [ $? -ne 0 ]; then
         echo "ERROR: msopgen custom op sample failed!"
         return 1
@@ -91,13 +77,6 @@ function main() {
         echo "ERROR: copy custom op files failed!"
         return 1
     fi
-    if [[ $IS_DYNAMIC != 1 ]]; then
-        if [[ $REPLAY_MODE = "batch" ]]; then
-            sed -i "s/set(BATCH_MODE_REPLAY_LIST/set(BATCH_MODE_REPLAY_LIST ${CAMEL_JSON_NAME}/g" `grep "set(BATCH_MODE_REPLAY_LIST" -rl custom_op/op_kernel/CMakeLists.txt`
-        elif [[ $REPLAY_MODE = "iterator" ]]; then
-            sed -i "s/set(ITERATOR_MODE_REPLAY_LIST/set(ITERATOR_MODE_REPLAY_LIST ${CAMEL_JSON_NAME}/g" `grep "set(ITERATOR_MODE_REPLAY_LIST" -rl custom_op/op_kernel/CMakeLists.txt`
-        fi
-    fi
     sed -i "s#/usr/local/Ascend/latest#$ASCEND_HOME_DIR#g" `grep "/usr/local/Ascend/latest" -rl custom_op/CMakePresets.json`

     # 构建自定义算子包并安装
@@ -109,17 +88,22 @@ function main() {
     echo "INFO: build and install custom op run package success!"

     # PTA源码仓,可以自行放置zip包
-    if [ ! -f "v${PYTORCH_VERSION}.zip" ]; then
-        wget https://gitee.com/ascend/pytorch/repository/archive/v${PYTORCH_VERSION}.zip --no-check-certificate
+    if [ ! -d "${PTA_DIR}" ]; then
+        if [ ! -f "v${PYTORCH_VERSION}.zip" ]; then
+            wget https://gitee.com/ascend/pytorch/repository/archive/v${PYTORCH_VERSION}.zip --no-check-certificate
+        fi
+        unzip -o -q v${PYTORCH_VERSION}.zip
     fi
-    rm -rf ${PTA_DIR}; unzip -o -q v${PYTORCH_VERSION}.zip

     # PTA自定义算子注册
-    FUNCTION_REGISTE_FIELD=`cat pytorch_patch/npu_native_functions.yaml`
+    FUNCTION_REGISTE_FIELD=`pytorch_patch/npu_native_functions.yaml`
     FUNCTION_REGISTE_FILE="${PTA_DIR}/torch_npu/csrc/aten/npu_native_functions.yaml"
-    if ! grep -q "\ $FUNCTION_REGISTE_FIELD" $FUNCTION_REGISTE_FILE; then
-        sed -i "/custom:/a \ $FUNCTION_REGISTE_FIELD" $FUNCTION_REGISTE_FILE
-    fi
+    cat $FUNCTION_REGISTE_FIELD | while read line
+    do
+        if ! grep -q "\ $line" $FUNCTION_REGISTE_FILE; then
+            sed -i "/custom:/a \ $line" $FUNCTION_REGISTE_FILE
+        fi
+    done

     # PTA自定义算子适配文件
     cp -rf pytorch_patch/*.cpp ${PTA_DIR}/torch_npu/csrc/aten/ops/op_api
@@ -190,7 +174,7 @@ function main() {
         echo "ERROR: timeline files not exist"
         return 1
     fi
-    echo "INFO: Ascend C Add Custom SUCCESS"
+    echo "INFO: Test Ascend C Custom Op SUCCESS"
 }

 check_soc_version
diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/test_ops_custom.py b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/test_ops_custom.py
index 5f7a07ca4..635c37303 100644
--- a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/test_ops_custom.py
+++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/test_ops_custom.py
@@ -24,5 +24,25 @@ class TestCustomAdd(TestCase):
         self.assertRtolEqual(output, x + y)


+class TestReduceMaxCustom(TestCase):
+    def test_reduce_max_custom(self):
+        x_shape = [8, 255, 64, 13]
+        reduce_dim = 2
+        assert x_shape[reduce_dim] % 16 == 0
+        is_keep_dim = 1
+        keep_dim = True if is_keep_dim else False
+        x = torch.rand(x_shape, dtype=torch.float16)
+
+        prof_path = "./prof_total"
+        with torch.npu.profile(prof_path) as prof:
+            torch.npu.synchronize()
+            output = torch_npu.npu_reduce_max_custom(x.npu(), reduce_dim, is_keep_dim)
+            torch.npu.synchronize()
+        cpu_out = torch.max(x, reduce_dim, keep_dim)
+
+        assert torch.allclose(npu_out[0].cpu(), cpu_out[0], rtol=1e-3, atol=1e-3)
+        assert torch.allclose(npu_out[1].cpu().long(), cpu_out[1], rtol=1e-3, atol=1e-3)
+
+
 if __name__ == "__main__":
     run_tests()
-- 
Gitee

From 1cdb5aaeff8f484f9aaeee38e12aa57c20100002 Mon Sep 17 00:00:00 2001
From: liuhanwen8
Date: Wed, 6 Sep 2023 21:06:59 +0800
Subject: [PATCH 2/6] fix host compile errors and set block dim

---
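Note: besides the spelling fixes (PlatformAscendC, isKeepDim, npu_out), the substantive change
here is context->SetBlockDim(usedCoreNum), which tells the runtime how many vector cores to
launch for the kernel; without it patch 1 never dispatched any block. The keep-dim branch that
InferShape implements behaves like this Python sketch:

    def infer_reduce_shape(x_shape, reduce_dim, is_keep_dim):
        """Mirror of the InferShape loop: keep the reduced axis as size 1 or drop it."""
        y_shape = []
        for i, d in enumerate(x_shape):
            if i == reduce_dim:
                if is_keep_dim:
                    y_shape.append(1)
                # otherwise the axis is dropped entirely
            else:
                y_shape.append(d)
        return y_shape  # the idx output shares this shape

    assert infer_reduce_shape([8, 255, 64, 13], 2, True) == [8, 255, 1, 13]
    assert infer_reduce_shape([8, 255, 64, 13], 2, False) == [8, 255, 13]
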
+ */ +#include "kernel_operator.h" +using namespace AscendC; + +constexpr int32_t BUFFER_NUM = 2; + +template +class KernelReduceMax { +public: + __aicore__ inline KernelReduceMax() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR idx, uint32_t coreNum, uint32_t realColVal, + uint32_t oneCalNum, uint32_t calTaskPerCore, uint32_t theSplitCore, + uint32_t roundColVal, uint32_t lastTaskTail) + { + this->isLastCore = GetBlockIdx() == GetBlockNum() - 1 ? true : false; + this->colCalcount = roundColVal; + this->oneRepeatCalcount = oneCalNum; + this->colNum = realColVal; + this->tailSize = roundColVal - realColVal; + this->lastTaskTail = lastTaskTail; + + int32_t offsetX = (calTaskPerCore + 1) * GetBlockIdx() * oneCalNum * realColVal; + int32_t offsetY = (calTaskPerCore + 1) * GetBlockIdx() * oneCalNum; + if (theSplitCore <= GetBlockIdx()) { + offsetX = (calTaskPerCore + 1) * theSplitCore * oneCalNum * realColVal + + (GetBlockIdx() - theSplitCore) * oneCalNum * calTaskPerCore * realColVal; + offsetY = (calTaskPerCore + 1) * theSplitCore * oneCalNum + + (GetBlockIdx() - theSplitCore) * oneCalNum * calTaskPerCore; + } + + xGm.SetGlobalBuffer((__gm__ T*)x + offsetX, colCalcount); + yGm.SetGlobalBuffer((__gm__ T*)y + offsetY, oneRepeatCalcount); + idxGm.SetGlobalBuffer((__gm__ uint32_t*)idx + offsetY, oneRepeatCalcount); + + pipe.InitBuffer(inQueueX, BUFFER_NUM, this->colCalcount * sizeof(T)); + pipe.InitBuffer(outQueueY, BUFFER_NUM, this->oneRepeatCalcount * sizeof(T)); + pipe.InitBuffer(outQueueIdx, BUFFER_NUM, this->oneRepeatCalcount * sizeof(uint32_t)); + pipe.InitBuffer(yTmp, this->colCalcount * sizeof(T)); + pipe.InitBuffer(workTmp, this->colCalcount * sizeof(T)); + } + __aicore__ inline void Process(int32_t loopCount, uint32_t innerTask) + { + for (int32_t i = 0; i < loopCount; i++) { + if (this->isLastCore && i == loopCount - 1) { + innerTask = this->lastTaskTail; + } + CopyIn(i); + Compute(i, innerTask); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + LocalTensor xLocal = inQueueX.AllocTensor(); + inQueueX.EnQue(xLocal); + } + __aicore__ inline void Compute(int32_t progress, uint32_t innerTask) + { + LocalTensor xLocal = inQueueX.DeQue(); + LocalTensor yLocal = outQueueY.AllocTensor(); + LocalTensor idxLocal = outQueueIdx.AllocTensor(); + + LocalTensor yTmpLocal = yTmp.Get(); + LocalTensor workTmpLocal = workTmp.Get(); + for (int32_t i = 0; i < innerTask; i++) { + DataCopy(xLocal, xGm[(progress * this->oneRepeatCalcount + i) * this->colNum], this->colCalcount); + ReduceMax(yTmpLocal, xLocal, workTmpLocal, this->colCalcount, true); + yLocal.SetValue(i, yTmpLocal.GetValue(0)); + T indexVal = yTmpLocal.GetValue(1); + uint32_t index = 0; + index = *reinterpret_cast(&indexVal); + idxLocal.SetValue(1, index); + } + outQueueY.EnQue(yLocal); + outQueueIdx.EnQue(idxLocal); + inQueueX.FreeTensor(xLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + LocalTensor yLocal = outQueueY.DeQue(); + LocalTensor idxLocal = outQueueIdx.DeQue(); + DataCopy(yGm[progress * this->oneRepeatCalcount], yLocal, this->oneRepeatCalcount); + DataCopy(idxGm[progress * this->oneRepeatCalcount], idxLocal, this->oneRepeatCalcount); + outQueueY.FreeTensor(yLocal); + outQueueIdx.FreeTensor(idxLocal); + } + +private: + TPipe pipe; + TQue inQueueX; + TQue outQueueY, outQueueIdx; + + TBuf yTmp, workTmp; + GlobalTensor xGm; + GlobalTensor yGm; + GlobalTensor idxGm; + + int32_t colCalcount; + int32_t oneRepeatCalcount; + int32_t colNum; + int32_t tailSize; + 
 .../op_dev/op_kernel/reduce_max_custom.cpp | 66 ++++++++++---------
 1 file changed, 34 insertions(+), 32 deletions(-)

diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_kernel/reduce_max_custom.cpp b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_kernel/reduce_max_custom.cpp
index a59686a8b..09eb6ba9d 100644
--- a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_kernel/reduce_max_custom.cpp
+++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_kernel/reduce_max_custom.cpp
@@ -30,15 +30,17 @@ public:
                       (GetBlockIdx() - theSplitCore) * oneCalNum * calTaskPerCore;
         }

-        xGm.SetGlobalBuffer((__gm__ T*)x + offsetX, colCalcount);
-        yGm.SetGlobalBuffer((__gm__ T*)y + offsetY, oneRepeatCalcount);
-        idxGm.SetGlobalBuffer((__gm__ uint32_t*)idx + offsetY, oneRepeatCalcount);
+        xGm.SetGlobalBuffer((__gm__ T*)x + offsetX, this->colCalcount);
+        yGm.SetGlobalBuffer((__gm__ T*)y + offsetY, this->oneRepeatCalcount);
+        idxGm.SetGlobalBuffer((__gm__ uint32_t*)idx + offsetY, this->oneRepeatCalcount);

         pipe.InitBuffer(inQueueX, BUFFER_NUM, this->colCalcount * sizeof(T));
         pipe.InitBuffer(outQueueY, BUFFER_NUM, this->oneRepeatCalcount * sizeof(T));
         pipe.InitBuffer(outQueueIdx, BUFFER_NUM, this->oneRepeatCalcount * sizeof(uint32_t));
         pipe.InitBuffer(yTmp, this->colCalcount * sizeof(T));
         pipe.InitBuffer(workTmp, this->colCalcount * sizeof(T));
+        yTmpLocal = yTmp.Get<T>();
+        workTmpLocal = workTmp.Get<T>();
     }
     __aicore__ inline void Process(int32_t loopCount, uint32_t innerTask)
     {
@@ -46,45 +48,42 @@ public:
             if (this->isLastCore && i == loopCount - 1) {
                 innerTask = this->lastTaskTail;
             }
-            CopyIn(i);
-            Compute(i, innerTask);
+            yLocal = outQueueY.AllocTensor<T>();
+            idxLocal = outQueueIdx.AllocTensor<uint32_t>();
+            for (int32_t j = 0; j < innerTask; j++) {
+                CopyIn(i, j);
+                Compute();
+            }
+            outQueueY.EnQue(yLocal);
+            outQueueIdx.EnQue(idxLocal);
             CopyOut(i);
         }
     }

 private:
-    __aicore__ inline void CopyIn(int32_t progress)
+    __aicore__ inline void CopyIn(int32_t i, int32_t j)
     {
-        LocalTensor<T> xLocal = inQueueX.AllocTensor<T>();
+        xLocal = inQueueX.AllocTensor<T>();
+        DataCopy(xLocal, xGm[(i * this->oneRepeatCalcount + j) * this->colNum], this->colCalcount);
         inQueueX.EnQue(xLocal);
     }
-    __aicore__ inline void Compute(int32_t progress, uint32_t innerTask)
+    __aicore__ inline void Compute(int32_t j)
     {
-        LocalTensor<T> xLocal = inQueueX.DeQue<T>();
-        LocalTensor<T> yLocal = outQueueY.AllocTensor<T>();
-        LocalTensor<uint32_t> idxLocal = outQueueIdx.AllocTensor<uint32_t>();
-
-        LocalTensor<T> yTmpLocal = yTmp.Get<T>();
-        LocalTensor<T> workTmpLocal = workTmp.Get<T>();
-        for (int32_t i = 0; i < innerTask; i++) {
-            DataCopy(xLocal, xGm[(progress * this->oneRepeatCalcount + i) * this->colNum], this->colCalcount);
-            ReduceMax(yTmpLocal, xLocal, workTmpLocal, this->colCalcount, true);
-            yLocal.SetValue(i, yTmpLocal.GetValue(0));
-            T indexVal = yTmpLocal.GetValue(1);
-            uint32_t index = 0;
-            index = *reinterpret_cast<uint16_t *>(&indexVal);
-            idxLocal.SetValue(1, index);
-        }
-        outQueueY.EnQue(yLocal);
-        outQueueIdx.EnQue(idxLocal);
+        xLocal = inQueueX.DeQue<T>();
+        ReduceMax(yTmpLocal, xLocal, workTmpLocal, this->colCalcount, true);
+        yLocal.SetValue(j, yTmpLocal.GetValue(0));
+        T indexVal = yTmpLocal.GetValue(1);
+        uint32_t index = 0;
+        index = *reinterpret_cast<uint16_t *>(&indexVal);
+        idxLocal.SetValue(j, index);
         inQueueX.FreeTensor(xLocal);
     }
-    __aicore__ inline void CopyOut(int32_t progress)
+    __aicore__ inline void CopyOut(int32_t i)
     {
-        LocalTensor<T> yLocal = outQueueY.DeQue<T>();
-        LocalTensor<uint32_t> idxLocal = outQueueIdx.DeQue<uint32_t>();
-        DataCopy(yGm[progress * this->oneRepeatCalcount], yLocal, this->oneRepeatCalcount);
-        DataCopy(idxGm[progress * this->oneRepeatCalcount], idxLocal, this->oneRepeatCalcount);
+        yLocal = outQueueY.DeQue<T>();
+        idxLocal = outQueueIdx.DeQue<uint32_t>();
+        DataCopy(yGm[i * this->oneRepeatCalcount], yLocal, this->oneRepeatCalcount);
+        DataCopy(idxGm[i * this->oneRepeatCalcount], idxLocal, this->oneRepeatCalcount);
         outQueueY.FreeTensor(yLocal);
         outQueueIdx.FreeTensor(idxLocal);
     }
@@ -95,10 +94,13 @@ private:
     TQue<QuePosition::VECOUT, BUFFER_NUM> outQueueY, outQueueIdx;

     TBuf<TPosition::VECCALC> yTmp, workTmp;
-    GlobalTensor<T> xGm;
-    GlobalTensor<T> yGm;
+    GlobalTensor<T> xGm, yGm;
     GlobalTensor<uint32_t> idxGm;

+    LocalTensor<T> yTmpLocal, workTmpLocal;
+    LocalTensor<T> xLocal, yLocal;
+    LocalTensor<uint32_t> idxLocal;
+
     int32_t colCalcount;
     int32_t oneRepeatCalcount;
     int32_t colNum;
-- 
Gitee

From f9062e1d2e80facfafde67c14dbe18cd5d817a42 Mon Sep 17 00:00:00 2001
From: liuhanwen8
Date: Thu, 7 Sep 2023 09:53:06 +0800
Subject: [PATCH 4/6] fix Compute call and yaml path quoting

---
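Note: the run.sh fix matters more than it looks: with backquotes the shell tried to execute the
yaml file as a command and left FUNCTION_REGISTE_FIELD empty, so no functions were registered.
The surrounding while-read loop appends each func line under the custom: key only when it is
not already present; in Python terms it behaves roughly like this (file handling elided, the
helper name and simplified yaml lines are hypothetical):

    def merge_yaml(patch_lines, target_lines):
        """Append each '- func: ...' entry after the 'custom:' key unless already present."""
        for entry in patch_lines:
            if not any(entry.strip() == line.strip() for line in target_lines):
                at = target_lines.index("custom:") + 1
                target_lines.insert(at, "  " + entry.strip())
        return target_lines

    target = ["custom:", "  - func: npu_add_custom(Tensor x, Tensor y) -> Tensor"]
    patch = ["- func: npu_reduce_max_custom(Tensor x, int reduceDim, int isKeepDim=1) -> (Tensor, Tensor)"]
    print("\n".join(merge_yaml(patch, target)))
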
 .../acl_invocation/op_dev/op_kernel/reduce_max_custom.cpp  | 2 +-
 .../4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_kernel/reduce_max_custom.cpp b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_kernel/reduce_max_custom.cpp
index 09eb6ba9d..05d53edbe 100644
--- a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_kernel/reduce_max_custom.cpp
+++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_kernel/reduce_max_custom.cpp
@@ -52,7 +52,7 @@ public:
             idxLocal = outQueueIdx.AllocTensor<uint32_t>();
             for (int32_t j = 0; j < innerTask; j++) {
                 CopyIn(i, j);
-                Compute();
+                Compute(j);
             }
             outQueueY.EnQue(yLocal);
             outQueueIdx.EnQue(idxLocal);
diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh
index 3a7466f08..039ca4812 100644
--- a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh
+++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh
@@ -96,7 +96,7 @@ function main() {
     fi

     # PTA自定义算子注册
-    FUNCTION_REGISTE_FIELD=`pytorch_patch/npu_native_functions.yaml`
+    FUNCTION_REGISTE_FIELD="pytorch_patch/npu_native_functions.yaml"
     FUNCTION_REGISTE_FILE="${PTA_DIR}/torch_npu/csrc/aten/npu_native_functions.yaml"
     cat $FUNCTION_REGISTE_FIELD | while read line
     do
-- 
Gitee

From 2411ec73e4aeb10f8360f16092bf17258309c758 Mon Sep 17 00:00:00 2001
From: liuhanwen8
Date: Thu, 7 Sep 2023 11:31:23 +0800
Subject: [PATCH 5/6] delete summary check

---
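Note: torch.npu.profile can leave several PROF_* directories under prof_total, so the export
below iterates over them instead of passing the whole ls output to msprof as a single path,
which is what the deleted code effectively did. The loop corresponds to this sketch (assumes
msprof is on PATH):

    import os
    import subprocess

    prof_root = "./prof_total"
    for entry in os.listdir(prof_root):  # one PROF_* directory per collection
        prof_dir = os.path.join(prof_root, entry)
        ret = subprocess.run(["msprof", "--export=on", f"--output={prof_dir}"]).returncode
        if ret != 0:
            raise SystemExit(f"parse failed for {prof_dir}")
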
 .../pytorch_invocation/run.sh | 64 ++++---------------
 1 file changed, 11 insertions(+), 53 deletions(-)

diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh
index 039ca4812..8928257ff 100644
--- a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh
+++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh
@@ -120,60 +120,18 @@ function main() {

     # 解析dump文件为numpy文件
     files=$(ls ./prof_total)
-    cd $CURRENT_DIR/prof_total/$files
-    msprof --export=on --output=$CURRENT_DIR/prof_total/$files
-    if [[ $? -eq 0 ]];then
-        echo "INFO: parse success"
-    else
-        echo "ERROR: pasrse failed"
-        return 1
-    fi
-
-    # 校验summary文件夹
-    summary_list=(
-        acl_0_1.csv
-        acl_statistic_0_1.csv
-        ge_op_execute_0_1.csv
-        op_statistic_0_1.csv
-        op_summary_0_1.csv
-        prof_rule_0.json
-        runtime_api_0_1.csv
-        task_time_0_1.csv
-    )
-    if [ $(ls ./device_*/summary/ | wc -l) -eq ${#summary_list[@]} ];then
-        for summary in ${summary_list[@]}; do
-            if [ ! -f $(pwd)/device_0/summary/$summary ];then
-                echo "ERROR: summary files not exist"
-                return 1
-            fi
-        done
-        echo "INFO: All summary result exist"
-    else
-        echo "ERROR: check summary result fail"
-        return 1
-    fi
+    for line in $files;
+    do
+        cd $CURRENT_DIR/prof_total/$line
+        msprof --export=on --output=$CURRENT_DIR/prof_total/$line
+        if [[ $? -eq 0 ]];then
+            echo "INFO: parse success"
+        else
+            echo "ERROR: parse failed"
+            return 1
+        fi
+    done

-    # 校验timeline文件夹
-    timeline_list=(
-        acl_0_1.json
-        ge_op_execute_0_1.json
-        msprof_0_1.json
-        runtime_api_0_1.json
-        task_time_0_1.json
-        thread_group_0_1.json
-    )
-    if [ $(ls ./device_*/timeline/ | wc -l) -eq ${#timeline_list[@]} ];then
-        for timeline in ${timeline_list[@]}; do
-            if [ ! -f $(pwd)/device_0/timeline/$timeline ];then
-                echo "ERROR: timeline files not exist"
-                return 1
-            fi
-        done
-        echo "INFO: timeline files exist"
-    else
-        echo "ERROR: timeline files not exist"
-        return 1
-    fi
     echo "INFO: Test Ascend C Custom Op SUCCESS"
 }
-- 
Gitee

From 546b4750462b4a55e86bdf1c7e181ac2e6ea85b6 Mon Sep 17 00:00:00 2001
From: liuhanwen8
Date: Thu, 7 Sep 2023 14:22:00 +0800
Subject: [PATCH 6/6] add summary check

---
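Note: the restored checks only verify file-name stems (api_statistic_0_1, msprof_0_1, ...)
rather than the exact names the deleted check expected, since the exported names carry
extensions and can vary between msprof versions. The bash test keeps the glob on the pattern
side, [[ $files != *$filename* ]], which in Python terms is roughly (helper name hypothetical):

    expected_summary = ["api_statistic_0_1", "op_statistic_0_1", "op_summary_0_1",
                        "prof_rule_0", "task_time_0_1"]

    def check_artifacts(names_on_disk, expected):
        """Fail when an expected stem is missing from the exported file names."""
        for stem in expected:
            if not any(stem in name for name in names_on_disk):
                return False
        return len(names_on_disk) == len(expected)

    assert check_artifacts(["api_statistic_0_1.csv", "op_statistic_0_1.csv",
                            "op_summary_0_1.csv", "prof_rule_0.json", "task_time_0_1.csv"],
                           expected_summary)
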
 .../pytorch_invocation/run.sh | 48 +++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh
index 8928257ff..97d52d2a4 100644
--- a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh
+++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh
@@ -56,6 +56,51 @@ print(get_soc_version())
     SOC_SHORT_VERSION=`echo $SOC_VERSION_CONCAT | cut -d ',' -f 2`
 }

+# 校验summary文件夹
+function check_summary() {
+    summary_filename_list=(
+        api_statistic_0_1
+        op_statistic_0_1
+        op_summary_0_1
+        prof_rule_0
+        task_time_0_1
+    )
+    files=$(ls ./device_*/summary/)
+    if [ $(ls ./device_*/summary/ | wc -l) -eq ${#summary_filename_list[@]} ];then
+        for filename in ${summary_filename_list[@]}; do
+            if [[ $files != *$filename* ]];then
+                echo "ERROR: summary files not exist"
+                return 1
+            fi
+        done
+        echo "INFO: All summary result exist"
+    else
+        echo "ERROR: check summary result fail"
+        return 1
+    fi
+}
+
+# 校验timeline文件夹
+function check_timeline() {
+    timeline_filename_list=(
+        msprof_0_1
+        task_time_0_1
+    )
+    files=$(ls ./device_*/timeline/)
+    if [ $(ls ./device_*/timeline/ | wc -l) -eq ${#timeline_filename_list[@]} ];then
+        for filename in ${timeline_filename_list[@]}; do
+            if [[ $files != *$filename* ]];then
+                echo "ERROR: timeline files not exist"
+                return 1
+            fi
+        done
+        echo "INFO: All timeline result exist"
+    else
+        echo "ERROR: check timeline result fail"
+        return 1
+    fi
+}
+
 function main() {
     # 清除遗留生成文件和日志文件
     rm -rf $HOME/ascend/log/*
@@ -130,6 +175,9 @@ function main() {
             echo "ERROR: parse failed"
             return 1
         fi
+
+        check_summary
+        check_timeline
     done

     echo "INFO: Test Ascend C Custom Op SUCCESS"
-- 
Gitee
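Note: once run.sh has rebuilt and installed the patched torch_npu wheel, the yaml entry
surfaces as a two-output Python API with isKeepDim defaulting to 1. A minimal smoke test
(assumes an NPU device and the patched wheel are available):

    import torch
    import torch_npu

    x = torch.rand(8, 255, 64, 13, dtype=torch.float16).npu()
    values, indices = torch_npu.npu_reduce_max_custom(x, 2)  # isKeepDim defaults to 1
    reference = torch.max(x.cpu(), 2, True)
    assert torch.allclose(values.cpu(), reference.values, rtol=1e-3, atol=1e-3)
    # indices can legitimately differ from CPU argmax on ties, so compare values only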