From ca69b4c80ab021272a5bb96ac42bb865187aee44 Mon Sep 17 00:00:00 2001
From: ascend <root@ubuntu.com>
Date: Sat, 3 Jun 2023 12:13:39 +0800
Subject: [PATCH] feat: use torch cache tensor

---
 core/base/ops_runner.cpp                      | 158 +++++++++++-------
 core/base/plan.cpp                            |   2 +-
 core/base/runner.cpp                          |  28 +++-
 core/include/acltransformer/base/ops_runner.h |  14 +-
 core/include/acltransformer/operation.h       |   2 +-
 .../acltransformer/ops/add_norm_operation.h   |   2 +-
 .../acltransformer/ops/add_operation.h        |   2 +-
 .../acltransformer/ops/ffn_operation.h        |   6 +-
 .../acltransformer/ops/linear_operation.h     |   6 +-
 .../ops/self_attention_operation.h            |   2 +-
 core/include/acltransformer/runner.h          |  12 +-
 .../acltransformer/utils/tensor_util.h        |   3 +-
 core/ops/add/add_operation.cpp                |   2 +-
 core/ops/add/add_ops_runner.cpp               |   2 +-
 core/ops/add/add_ops_runner.h                 |   4 +-
 core/ops/add/add_torch_runner.cpp             |  17 +-
 core/ops/add/add_torch_runner.h               |   4 +-
 core/ops/add_norm/add_norm_operation.cpp      |   2 +-
 core/ops/add_norm/add_norm_ops_runner.cpp     |  12 +-
 core/ops/add_norm/add_norm_ops_runner.h       |   8 +-
 core/ops/add_norm/add_norm_torch_runner.cpp   |  25 ++-
 core/ops/add_norm/add_norm_torch_runner.h     |   4 +-
 core/ops/ffn/ffn_operation.cpp                |   6 +-
 core/ops/ffn/ffn_ops_runner.cpp               |   8 +-
 core/ops/ffn/ffn_ops_runner.h                 |   3 +
 core/ops/ffn/ffn_torch_runner.cpp             |  17 +-
 core/ops/ffn/ffn_torch_runner.h               |   5 +-
 core/ops/linear/linear_operation.cpp          |   6 +-
 core/ops/linear/linear_ops_runner.cpp         |   8 +-
 core/ops/linear/linear_ops_runner.h           |   6 +-
 core/ops/linear/linear_torch_runner.cpp       |  11 +-
 core/ops/linear/linear_torch_runner.h         |   4 +-
 .../self_attention_operation.cpp              |   2 +-
 .../self_attention_ops_runner.cpp             |   5 +
 .../self_attention_ops_runner.h               |   3 +
 .../self_attention_torch_runner.cpp           |  18 +-
 .../self_attention_torch_runner.h             |   4 +-
 core/utils/tensor_util.cpp                    |  21 ++-
 .../{ops => }/operation/operation_creator.cpp |  29 +++-
 .../{ops => }/operation/operation_creator.h   |   0
 .../{ops => }/operation/operation_torch.cpp   |  62 ++++++-
 .../{ops => }/operation/operation_torch.h     |   9 +-
 .../test_add_norm_operation.py}               |  17 +-
 .../test_add_norm_operation_data.py}          |   0
 .../test_add_operation.py}                    |  24 +--
 .../test_linear_operation.py}                 |  13 +-
 .../test_linear_operation_data.py}            |   0
 .../torch/ops/add/add_operation_torch.cpp     |  53 ------
 examples/torch/ops/add/add_operation_torch.h  |  37 ----
 .../ops/add_norm/add_norm_operation_torch.cpp |  60 -------
 .../ops/add_norm/add_norm_operation_torch.h   |  37 ----
 .../add_norm/test_add_norm_operation_rand.py  |  55 ------
 .../torch/ops/ffn/ffn_operation_torch.cpp     |  65 -------
 examples/torch/ops/ffn/ffn_operation_torch.h  |  34 ----
 .../torch/ops/ffn/test_ffn_operation_torch.py |  83 ---------
 .../ops/linear/linear_operation_torch.cpp     |  79 ---------
 .../torch/ops/linear/linear_operation_torch.h |  35 ----
 .../test_linear_operation_torch_rand.py       |  51 ------
 .../self_attention_operation_torch.cpp        |  68 --------
 .../self_attention_operation_torch.h          |  40 -----
 .../test_self_attention_operation_torch.py    |  75 ---------
 tests/unittest/ops/add/test_add.cpp           |   4 +-
 62 files changed, 380 insertions(+), 994 deletions(-)
 rename examples/torch/{ops => }/operation/operation_creator.cpp (68%)
 rename examples/torch/{ops => }/operation/operation_creator.h (100%)
 rename examples/torch/{ops => }/operation/operation_torch.cpp (58%)
 rename examples/torch/{ops => }/operation/operation_torch.h (72%)
 rename examples/torch/{ops/operation/test_add_norm_operation_rand.py => operation/test_add_norm_operation.py} (81%)
 rename examples/torch/{ops/add_norm/test_add_norm_operation_torch.py => operation/test_add_norm_operation_data.py} (100%)
 rename examples/torch/{ops/add/test_add_operation_torch.py => operation/test_add_operation.py} (72%)
 rename examples/torch/{ops/operation/test_linear_operation_torch_rand.py => operation/test_linear_operation.py} (75%)
 rename examples/torch/{ops/linear/test_linear_operation_torch.py => operation/test_linear_operation_data.py} (100%)
 delete mode 100644 examples/torch/ops/add/add_operation_torch.cpp
 delete mode 100644 examples/torch/ops/add/add_operation_torch.h
 delete mode 100644 examples/torch/ops/add_norm/add_norm_operation_torch.cpp
 delete mode 100644 examples/torch/ops/add_norm/add_norm_operation_torch.h
 delete mode 100644 examples/torch/ops/add_norm/test_add_norm_operation_rand.py
 delete mode 100644 examples/torch/ops/ffn/ffn_operation_torch.cpp
 delete mode 100644 examples/torch/ops/ffn/ffn_operation_torch.h
 delete mode 100644 examples/torch/ops/ffn/test_ffn_operation_torch.py
 delete mode 100644 examples/torch/ops/linear/linear_operation_torch.cpp
 delete mode 100644 examples/torch/ops/linear/linear_operation_torch.h
 delete mode 100644 examples/torch/ops/linear/test_linear_operation_torch_rand.py
 delete mode 100644 examples/torch/ops/self_attention/self_attention_operation_torch.cpp
 delete mode 100644 examples/torch/ops/self_attention/self_attention_operation_torch.h
 delete mode 100644 examples/torch/ops/self_attention/test_self_attention_operation_torch.py

diff --git a/core/base/ops_runner.cpp b/core/base/ops_runner.cpp
index a1c29020..0ed2000d 100644
--- a/core/base/ops_runner.cpp
+++ b/core/base/ops_runner.cpp
@@ -71,24 +71,29 @@ OpsRunner::~OpsRunner()
     }
 }
 
-AsdOps::Status OpsRunner::Setup(VariantPack &variantPack)
+AsdOps::Status OpsRunner::SetupImpl(const VariantPack &variantPack)
 {
+    AsdOps::Status st = SetupKernelGraph(variantPack);
+    if (!st.Ok()) {
+        return st;
+    }
+
     InitTensorMaxNodeMap();
     ASD_LOG(INFO) << GetName() << " Setup start, kernel graph:" << kernelGraph_.ToString();
     Reset();
 
-    if (!PlanKernel(variantPack)) {
-        ASD_LOG(ERROR) << GetName() << " PlanKernel fail";
-        return AsdOps::Status::FailStatus(1, "PlanKernel fail");
+    if (!PlanKernelGraph(variantPack)) {
+        ASD_LOG(ERROR) << GetName() << " PlanKernelGraph fail";
+        return AsdOps::Status::FailStatus(1, "PlanKernelGraph fail");
     }
 
     FillTilingData(variantPack);
     return AsdOps::Status::OkStatus();
 }
 
-uint64_t OpsRunner::GetWorkspaceSize() { return intermediateSize_ + tilingData_.size() + workspaceSize_; }
+uint64_t OpsRunner::GetWorkspaceSizeImpl() { return intermediateSize_ + tilingData_.size() + workspaceSize_; }
 
-AsdOps::Status OpsRunner::Execute(Handle &handle, VariantPack &variantPack)
+AsdOps::Status OpsRunner::ExecuteImpl(Handle &handle, VariantPack &variantPack)
 {
     ASD_LOG(INFO) << GetName() << " execute start, intermediateSize:" << intermediateSize_
                   << ", tilingSize:" << tilingData_.size() << ", workspaceSize:" << workspaceSize_;
@@ -174,79 +179,100 @@ void OpsRunner::Reset()
     memAllocatinSolver_->Reset();
 }
 
-bool OpsRunner::PlanKernel(const VariantPack &variantPack)
+bool OpsRunner::PlanKernelGraph(const VariantPack &variantPack)
 {
     kernelGraph_.inTensors = variantPack.inTensors;
     kernelGraph_.outTensors = variantPack.outTensors;
 
     for (size_t nodeId = 0; nodeId < kernelGraph_.nodes.size(); ++nodeId) {
-        auto &node = kernelGraph_.nodes.at(nodeId);
-        const AsdOps::OpDesc &opDesc = node.opDesc;
-        AsdOps::Operation *op = AsdOps::Ops::Instance().GetOperationByName(opDesc.opName);
-        if (op == nullptr) {
-            ASD_LOG(ERROR) << GetName() << " get operation by name fail, opName:" << opDesc.opName;
+        if (!PlanOneKernel(nodeId)) {
             return false;
         }
+    }
 
-        node.kernelRunInfo.SetOpDesc(opDesc);
-        for (const auto tensorIt : node.inTensors) {
-            node.kernelRunInfo.AddInTensor(*tensorIt);
-        }
-        for (size_t i = 0; i < node.outTensors.size(); ++i) {
-            AsdOps::Tensor tensor;
-            node.kernelRunInfo.AddOutTensor(tensor);
-        }
-        ASD_LOG(INFO) << GetName() << " " << opDesc.opName << " infer shape start, runinfo:\n"
-                      << AsdOpsRunInfoToString(node.kernelRunInfo);
-        AsdOps::Status st = op->InferShape(node.kernelRunInfo);
-        if (!st.Ok()) {
-            ASD_LOG(ERROR) << opDesc.opName << " infer shape fail, error:" << st.Message();
-            return false;
-        }
-        ASD_LOG(INFO) << GetName() << " " << opDesc.opName << " infer shape success, runinfo:\n"
-                      << AsdOpsRunInfoToString(node.kernelRunInfo);
+    intermediateSize_ = memAllocatinSolver_->GetSize();
+    ASD_LOG(INFO) << GetName() << " MemAllocationSolver malloc size:" << memAllocatinSolver_->GetMallocSize()
+                  << ", real size:" << intermediateSize_;
 
-        for (size_t i = 0; i < node.outTensors.size(); ++i) {
-            AsdOps::Tensor *outTensor = node.outTensors.at(i);
-            AsdOps::Tensor &runInfoOutTensor = node.kernelRunInfo.GetOutTensor(i);
-            if (IsInternalTensor(outTensor)) {
-                outTensor->desc = runInfoOutTensor.desc;
-                outTensor->dataSize = CalcTensorDataSize(runInfoOutTensor);
-                outTensor->data = memAllocatinSolver_->Malloc(outTensor->dataSize);
-            }
-            runInfoOutTensor = *outTensor;
-        }
+    return true;
+}
 
-        ASD_LOG(INFO) << GetName() << " after mem allo solver, runinfo:\n" << AsdOpsRunInfoToString(node.kernelRunInfo);
+bool OpsRunner::PlanOneKernel(size_t nodeId)
+{
+    auto &node = kernelGraph_.nodes.at(nodeId);
+    const AsdOps::OpDesc &opDesc = node.opDesc;
 
-        AsdOps::Tactic *tactic = op->GetBestTactic(node.kernelRunInfo);
-        if (tactic == nullptr) {
-            ASD_LOG(ERROR) << GetName() << " " << opDesc.opName
-                           << " get best tactic fail, tactic count:" << op->GetTacticCount();
-            return false;
-        }
+    AsdOps::Operation *op = AsdOps::Ops::Instance().GetOperationByName(opDesc.opName);
+    if (op == nullptr) {
+        ASD_LOG(ERROR) << GetName() << " get operation by name fail, opName:" << opDesc.opName;
+        return false;
+    }
 
-        node.kernel = tactic->GetBestKernel(node.kernelRunInfo);
-        if (node.kernel == nullptr) {
-            ASD_LOG(ERROR) << GetName() << " " << tactic->GetName()
-                           << " get best kernel fail, kernel count:" << tactic->GetKernelCount();
-            return false;
-        }
-        ASD_LOG(INFO) << GetName() << " " << opDesc.opName << " best tactic:" << tactic->GetName()
-                      << ", best kernel:" << node.kernel->GetName();
+    node.kernelRunInfo.SetOpDesc(opDesc);
+    for (const auto tensorIt : node.inTensors) {
+        node.kernelRunInfo.AddInTensor(*tensorIt);
+    }
+    for (size_t i = 0; i < node.outTensors.size(); ++i) {
+        AsdOps::Tensor tensor;
+        node.kernelRunInfo.AddOutTensor(tensor);
+    }
 
-        auto it = maxNodeIdTensorMap_.find(nodeId);
-        if (it != maxNodeIdTensorMap_.end()) {
-            for (auto tensorIt : it->second) {
-                memAllocatinSolver_->Free((char *)tensorIt->data);
+    ASD_LOG(INFO) << GetName() << " " << opDesc.opName << " infer shape start, runinfo:\n"
+                  << AsdOpsRunInfoToString(node.kernelRunInfo);
+    AsdOps::Status st = op->InferShape(node.kernelRunInfo);
+    if (!st.Ok()) {
+        ASD_LOG(ERROR) << opDesc.opName << " infer shape fail, error:" << st.Message();
+        return false;
+    }
+    ASD_LOG(INFO) << GetName() << " " << opDesc.opName << " infer shape success, runinfo:\n"
+                  << AsdOpsRunInfoToString(node.kernelRunInfo);
+
+    for (size_t i = 0; i < node.outTensors.size(); ++i) {
+        AsdOps::Tensor *outTensor = node.outTensors.at(i);
+        AsdOps::Tensor &runInfoOutTensor = node.kernelRunInfo.GetOutTensor(i);
+        if (IsInternalTensor(outTensor)) {
+            if (runInfoOutTensor.desc.dims.size() != 0) {
+                outTensor->desc = runInfoOutTensor.desc;
+            } else {
+                ASD_LOG(INFO) << GetName() << " " << opDesc.opName << " outTensors[" << i
+                              << "] is internal tensor, infer shape wrong, not use infer shape desc";
             }
+            outTensor->dataSize = CalcTensorDataSize(outTensor->desc);
+            outTensor->data = memAllocatinSolver_->Malloc(outTensor->dataSize);
+            ASD_LOG(INFO) << GetName() << " " << opDesc.opName << " outTensors[" << i
+                          << "] is internal tensor, mem solve:" << outTensor->data;
+        } else {
+            ASD_LOG(INFO) << GetName() << " " << opDesc.opName << " outTensors[" << i << "] is not internal tensor";
         }
+        runInfoOutTensor = *outTensor;
     }
 
-    intermediateSize_ = memAllocatinSolver_->GetSize();
-    ASD_LOG(INFO) << GetName() << " "
-                  << " MemAllocationSolver malloc size:" << memAllocatinSolver_->GetMallocSize()
-                  << ", real size:" << intermediateSize_;
+    ASD_LOG(INFO) << GetName() << " " << opDesc.opName << " after mem allo solver, runinfo:\n"
+                  << AsdOpsRunInfoToString(node.kernelRunInfo);
+
+    AsdOps::Tactic *tactic = op->GetBestTactic(node.kernelRunInfo);
+    if (tactic == nullptr) {
+        ASD_LOG(ERROR) << GetName() << " " << opDesc.opName
+                       << " get best tactic fail, tactic count:" << op->GetTacticCount();
+        return false;
+    }
+
+    node.kernel = tactic->GetBestKernel(node.kernelRunInfo);
+    if (node.kernel == nullptr) {
+        ASD_LOG(ERROR) << GetName() << " " << tactic->GetName()
+                       << " get best kernel fail, kernel count:" << tactic->GetKernelCount();
+        return false;
+    }
+    ASD_LOG(INFO) << GetName() << " " << opDesc.opName << " get best tactic:" << tactic->GetName()
+                  << ", best kernel:" << node.kernel->GetName();
+
+    auto it = maxNodeIdTensorMap_.find(nodeId);
+    if (it != maxNodeIdTensorMap_.end()) {
+        for (auto tensorIt : it->second) {
+            memAllocatinSolver_->Free((char *)tensorIt->data);
+            ASD_LOG(INFO) << GetName() << " " << opDesc.opName << " mem free:" << tensorIt->data;
+        }
+    }
 
     return true;
 }
@@ -271,7 +297,13 @@ void OpsRunner::FillTilingData(const VariantPack &variantPack)
             for (size_t j = 0; j < workspaces.size(); ++j) {
                 kernelWorkspaceSize += workspaces[i];
             }
-            maxKernelWorkspaceSize = std::max(maxKernelWorkspaceSize, kernelWorkspaceSize);
+            ASD_LOG(INFO) << GetName() << " " << kernel->GetName() << ", kernelWorkspaceSize:" << kernelWorkspaceSize
+                          << ", maxKernelWorkspaceSize:" << maxKernelWorkspaceSize;
+            if (kernelWorkspaceSize < 100000) {
+                maxKernelWorkspaceSize = std::max(maxKernelWorkspaceSize, kernelWorkspaceSize);
+            } else {
+                ASD_LOG(ERROR) << GetName() << " " << kernel->GetName() << " kernelWorkspaceSize too large, discard";
+            }
         }
     }
     workspaceSize_ = maxKernelWorkspaceSize;
diff --git a/core/base/plan.cpp b/core/base/plan.cpp
index 5b4a46b0..8381a96b 100644
--- a/core/base/plan.cpp
+++ b/core/base/plan.cpp
@@ -55,7 +55,7 @@ AsdOps::Status Plan::Setup(Handle handle, const VariantPack &variantPack)
         for (size_t i = 0; i < node.variantPack.inTensors.size(); ++i) {
             ASD_LOG(INFO) << "Plan intensor[" << i << "] " << AsdOpsTensorToString(node.variantPack.inTensors.at(i));
         }
-        std::vector<AsdOps::TensorDesc> outTensorDescs;
+        AsdOps::SVector<AsdOps::TensorDesc> outTensorDescs;
         node.operation->InferShape(node.variantPack.inTensors, outTensorDescs);
         for (size_t i = 0; i < outTensorDescs.size(); ++i) {
             ASD_LOG(INFO) << "Plan outTensorDescs[" << i << "] " << AsdOpsTensorDescToString(outTensorDescs.at(i));
diff --git a/core/base/runner.cpp b/core/base/runner.cpp
index c1133248..fd4b5d48 100644
--- a/core/base/runner.cpp
+++ b/core/base/runner.cpp
@@ -22,7 +22,31 @@ Runner::~Runner() {}
 
 std::string Runner::GetName() const { return name_; }
 
-AsdOps::Status Runner::Setup(VariantPack &variantPack) { return AsdOps::Status::OkStatus(); }
+AsdOps::Status Runner::Setup(const VariantPack &variantPack)
+{
+    AsdOps::Status st = IsConsistent(variantPack);
+    if (!st.Ok()) {
+        return st;
+    }
+    return SetupImpl(variantPack);
+}
 
-uint64_t Runner::GetWorkspaceSize() { return 0; }
+uint64_t Runner::GetWorkspaceSize() { return GetWorkspaceSizeImpl(); }
+
+AsdOps::Status Runner::Execute(Handle &handle, VariantPack &variantPack)
+{
+    AsdOps::Status st = IsConsistent(variantPack);
+    if (!st.Ok()) {
+        return st;
+    }
+    return ExecuteImpl(handle, variantPack);
+}
+
+AsdOps::Status Runner::IsConsistent(const VariantPack &variantPack) { return AsdOps::Status::OkStatus(); }
+
+AsdOps::Status Runner::SetupImpl(const VariantPack &variantPack) { return AsdOps::Status::OkStatus(); }
+
+uint64_t Runner::GetWorkspaceSizeImpl() { return 0; }
+
+AsdOps::Status Runner::ExecuteImpl(Handle &handle, VariantPack &variantPack) { return AsdOps::Status::OkStatus(); }
 } // namespace AclTransformer
\ No newline at end of file
diff --git a/core/include/acltransformer/base/ops_runner.h b/core/include/acltransformer/base/ops_runner.h
index ae731e90..4555430a 100644
--- a/core/include/acltransformer/base/ops_runner.h
+++ b/core/include/acltransformer/base/ops_runner.h
@@ -46,13 +46,19 @@ class OpsRunner : public Runner {
 public:
     OpsRunner(const std::string &name);
     virtual ~OpsRunner();
-    AsdOps::Status Setup(VariantPack &variantPack) override;
-    uint64_t GetWorkspaceSize() override;
-    AsdOps::Status Execute(Handle &handle, VariantPack &variantPack) override;
+
+protected:
+    AsdOps::Status SetupImpl(const VariantPack &variantPack) override;
+    uint64_t GetWorkspaceSizeImpl() override;
+    AsdOps::Status ExecuteImpl(Handle &handle, VariantPack &variantPack) override;
+
+protected:
+    virtual AsdOps::Status SetupKernelGraph(const VariantPack &variantPack) = 0;
 
 private:
     void Reset();
-    bool PlanKernel(const VariantPack &variantPack);
+    bool PlanKernelGraph(const VariantPack &variantPack);
+    bool PlanOneKernel(size_t nodeId);
     void FillTilingData(const VariantPack &variantPack);
     void InitTensorMaxNodeMap();
     bool IsInternalTensor(const AsdOps::Tensor *tensor);
diff --git a/core/include/acltransformer/operation.h b/core/include/acltransformer/operation.h
index 5c01b620..bfcba213 100644
--- a/core/include/acltransformer/operation.h
+++ b/core/include/acltransformer/operation.h
@@ -32,7 +32,7 @@ public:
     virtual ~Operation();
     std::string GetName() const;
     virtual AsdOps::Status InferShape(const AsdOps::SVector<AsdOps::Tensor> &inTensors,
-                                      std::vector<AsdOps::TensorDesc> &outTensorDescs) = 0;
+                                      AsdOps::SVector<AsdOps::TensorDesc> &outTensorDescs) = 0;
     AsdOps::Status Setup(VariantPack &variantPack);
     uint64_t GetWorkspaceSize();
     AsdOps::Status Execute(Handle &handle, VariantPack &variantPack);
diff --git a/core/include/acltransformer/ops/add_norm_operation.h b/core/include/acltransformer/ops/add_norm_operation.h
index 6a371355..180a8f64 100644
--- a/core/include/acltransformer/ops/add_norm_operation.h
+++ b/core/include/acltransformer/ops/add_norm_operation.h
@@ -25,7 +25,7 @@ public:
     AddNormOperation(const AddNormParam &param);
     virtual ~AddNormOperation();
     AsdOps::Status InferShape(const AsdOps::SVector<AsdOps::Tensor> &inTensors,
-                              std::vector<AsdOps::TensorDesc> &outTensorDescs) override;
+                              AsdOps::SVector<AsdOps::TensorDesc> &outTensorDescs) override;
 
 protected:
     RunnerBuilder *FindBestRunnerBuilder(const VariantPack &variantPack) override;
diff --git a/core/include/acltransformer/ops/add_operation.h b/core/include/acltransformer/ops/add_operation.h
index dd6eba5f..14b0c122 100644
--- a/core/include/acltransformer/ops/add_operation.h
+++ b/core/include/acltransformer/ops/add_operation.h
@@ -24,7 +24,7 @@ public:
     AddOperation(const AddParam &param);
     ~AddOperation();
     AsdOps::Status InferShape(const AsdOps::SVector<AsdOps::Tensor> &inTensors,
-                              std::vector<AsdOps::TensorDesc> &outTensorDescs) override;
+                              AsdOps::SVector<AsdOps::TensorDesc> &outTensorDescs) override;
 
 protected:
     RunnerBuilder *FindBestRunnerBuilder(const VariantPack &variantPack) override;
diff --git a/core/include/acltransformer/ops/ffn_operation.h b/core/include/acltransformer/ops/ffn_operation.h
index 7faf6423..7f49c2d4 100644
--- a/core/include/acltransformer/ops/ffn_operation.h
+++ b/core/include/acltransformer/ops/ffn_operation.h
@@ -25,9 +25,9 @@ public:
     FfnOperation(const FfnParam &param);
     virtual ~FfnOperation();
     AsdOps::Status InferShape(const AsdOps::SVector<AsdOps::Tensor> &inTensors,
-                              std::vector<AsdOps::TensorDesc> &outTensorDescs) override;
-    bool IsConsistent(const std::vector<AsdOps::TensorDesc> &inTensorDescs,
-                      std::vector<AsdOps::TensorDesc> &outTensorDescs) const;
+                              AsdOps::SVector<AsdOps::TensorDesc> &outTensorDescs) override;
+    bool IsConsistent(const AsdOps::SVector<AsdOps::TensorDesc> &inTensorDescs,
+                      AsdOps::SVector<AsdOps::TensorDesc> &outTensorDescs) const;
     int64_t GetTensorBatch(const AsdOps::TensorDesc &tensorDesc) const;
     int64_t GetTensorH(const AsdOps::TensorDesc &tensorDesc) const;
     int64_t GetTensorW(const AsdOps::TensorDesc &tensorDesc) const;
diff --git a/core/include/acltransformer/ops/linear_operation.h b/core/include/acltransformer/ops/linear_operation.h
index cd15c1b9..53442472 100644
--- a/core/include/acltransformer/ops/linear_operation.h
+++ b/core/include/acltransformer/ops/linear_operation.h
@@ -25,9 +25,9 @@ public:
     LinearOperation(const LinearParam &param);
     virtual ~LinearOperation();
     AsdOps::Status InferShape(const AsdOps::SVector<AsdOps::Tensor> &inTensors,
-                              std::vector<AsdOps::TensorDesc> &outTensorDescs) override;
-    bool IsConsistent(const std::vector<AsdOps::TensorDesc> &inTensorDescs,
-                      std::vector<AsdOps::TensorDesc> &outTensorDescs) const;
+                              AsdOps::SVector<AsdOps::TensorDesc> &outTensorDescs) override;
+    bool IsConsistent(const AsdOps::SVector<AsdOps::TensorDesc> &inTensorDescs,
+                      AsdOps::SVector<AsdOps::TensorDesc> &outTensorDescs) const;
     int64_t GetTensorBatch(const AsdOps::TensorDesc &tensorDesc) const;
     int64_t GetTensorH(const AsdOps::TensorDesc &tensorDesc) const;
     int64_t GetTensorW(const AsdOps::TensorDesc &tensorDesc) const;
diff --git a/core/include/acltransformer/ops/self_attention_operation.h b/core/include/acltransformer/ops/self_attention_operation.h
index 3045a71f..a7128b70 100644
--- a/core/include/acltransformer/ops/self_attention_operation.h
+++ b/core/include/acltransformer/ops/self_attention_operation.h
@@ -25,7 +25,7 @@ public:
     SelfAttentionOperation(const SelfAttentionParam &param);
     virtual ~SelfAttentionOperation();
     AsdOps::Status InferShape(const AsdOps::SVector<AsdOps::Tensor> &inTensors,
-                              std::vector<AsdOps::TensorDesc> &outTensorDescs) override;
+                              AsdOps::SVector<AsdOps::TensorDesc> &outTensorDescs) override;
 
 protected:
     RunnerBuilder *FindBestRunnerBuilder(const VariantPack &variantPack) override;
diff --git a/core/include/acltransformer/runner.h b/core/include/acltransformer/runner.h
index c69c01a4..9ad2ebc2 100644
--- a/core/include/acltransformer/runner.h
+++ b/core/include/acltransformer/runner.h
@@ -26,9 +26,15 @@ public:
     Runner(const std::string &name);
     virtual ~Runner();
     std::string GetName() const;
-    virtual AsdOps::Status Setup(VariantPack &variantPack);
-    virtual uint64_t GetWorkspaceSize();
-    virtual AsdOps::Status Execute(Handle &handle, VariantPack &variantPack) = 0;
+    AsdOps::Status Setup(const VariantPack &variantPack);
+    uint64_t GetWorkspaceSize();
+    AsdOps::Status Execute(Handle &handle, VariantPack &variantPack);
+
+private:
+    virtual AsdOps::Status IsConsistent(const VariantPack &variantPack);
+    virtual AsdOps::Status SetupImpl(const VariantPack &variantPack);
+    virtual uint64_t GetWorkspaceSizeImpl();
+    virtual AsdOps::Status ExecuteImpl(Handle &handle, VariantPack &variantPack);
 
 private:
     std::string name_;
diff --git a/core/include/acltransformer/utils/tensor_util.h b/core/include/acltransformer/utils/tensor_util.h
index 21af0edf..d47ceda7 100644
--- a/core/include/acltransformer/utils/tensor_util.h
+++ b/core/include/acltransformer/utils/tensor_util.h
@@ -23,8 +23,9 @@
 #include "acltransformer/handle.h"
 
 namespace AclTransformer {
-void GetTensorDescs(const std::vector<AsdOps::Tensor> &tensors, std::vector<AsdOps::TensorDesc> &tensorDescs);
+void GetTensorDescs(const std::vector<AsdOps::Tensor> &tensors, AsdOps::SVector<AsdOps::TensorDesc> &tensorDescs);
 uint64_t CalcTensorDataSize(const AsdOps::Tensor &tensor);
+uint64_t CalcTensorDataSize(const AsdOps::TensorDesc &tensorDesc);
 at::Tensor AsdOpsTensor2AtTensor(Handle handle, const AsdOps::Tensor &asdTensor);
 at::Tensor AsdOpsTensor2AtTensorCache(Handle handle, const AsdOps::Tensor &asdTensor);
 at::Tensor AsdOpsTensor2AtCpuTensor(Handle handle, const AsdOps::Tensor &asdTensor);
diff --git a/core/ops/add/add_operation.cpp b/core/ops/add/add_operation.cpp
index bd530c96..2a203aac 100644
--- a/core/ops/add/add_operation.cpp
+++ b/core/ops/add/add_operation.cpp
@@ -27,7 +27,7 @@ AddOperation::AddOperation(const AddParam &param) : Operation("AddOperation"), p
 AddOperation::~AddOperation() {}
 
 AsdOps::Status AddOperation::InferShape(const AsdOps::SVector<AsdOps::Tensor> &inTensors,
-                                        std::vector<AsdOps::TensorDesc> &outTensorDescs)
+                                        AsdOps::SVector<AsdOps::TensorDesc> &outTensorDescs)
 {
     if (inTensors.size() != 2) {
         return AsdOps::Status::FailStatus(1, "inTensorDescs size is not 2");
diff --git a/core/ops/add/add_ops_runner.cpp b/core/ops/add/add_ops_runner.cpp
index 9b8a9052..67d1b870 100644
--- a/core/ops/add/add_ops_runner.cpp
+++ b/core/ops/add/add_ops_runner.cpp
@@ -27,7 +27,7 @@ AddOpsRunner::AddOpsRunner(const AddParam &param) : OpsRunner("AddOpsRunner"), p
 
 AddOpsRunner::~AddOpsRunner() {}
 
-AsdOps::Status AddOpsRunner::Setup(VariantPack &variantPack)
+AsdOps::Status AddOpsRunner::SetupKernelGraph(const VariantPack &variantPack)
 {
     if (param_.scale == 1) {
         kernelGraph_.inTensors.resize(2);
diff --git a/core/ops/add/add_ops_runner.h b/core/ops/add/add_ops_runner.h
index 8b811556..41aa6868 100644
--- a/core/ops/add/add_ops_runner.h
+++ b/core/ops/add/add_ops_runner.h
@@ -23,7 +23,9 @@ class AddOpsRunner : public OpsRunner {
 public:
     AddOpsRunner(const AddParam &param);
     virtual ~AddOpsRunner();
-    AsdOps::Status Setup(VariantPack &variantPack) override;
+
+protected:
+    AsdOps::Status SetupKernelGraph(const VariantPack &variantPack) override;
 
 private:
     AddParam param_;
diff --git a/core/ops/add/add_torch_runner.cpp b/core/ops/add/add_torch_runner.cpp
index 5ffffd5b..11697311 100644
--- a/core/ops/add/add_torch_runner.cpp
+++ b/core/ops/add/add_torch_runner.cpp
@@ -18,23 +18,20 @@
 #include <asdops/utils/log/log.h>
 #include <asdops/utils/rt/rt.h>
 #include "acltransformer/utils/tensor_util.h"
+#include "acltransformer/utils/tensor_cache.h"
 
 namespace AclTransformer {
 AddTorchRunner::AddTorchRunner(const AddParam &param) : Runner("AddTorchRunner"), param_(param) {}
 
 AddTorchRunner::~AddTorchRunner() {}
 
-AsdOps::Status AddTorchRunner::Execute(Handle &handle, VariantPack &variantPack)
+AsdOps::Status AddTorchRunner::ExecuteImpl(Handle &handle, VariantPack &variantPack)
 {
-    ASD_LOG(INFO) << GetName() << " Execute start";
-    at::Tensor atInTensorA = AsdOpsTensor2AtTensor(handle, variantPack.inTensors[0]);
-    at::Tensor atInTensorB = AsdOpsTensor2AtTensor(handle, variantPack.inTensors[1]);
-    at::Tensor addResultTensor = at::add(atInTensorA, atInTensorB).contiguous();
-    int ret = AsdRtMemCopyAsync(variantPack.outTensors[0].data, variantPack.outTensors[0].dataSize,
-                                addResultTensor.storage().data_ptr().get(), variantPack.outTensors[0].dataSize,
-                                ASDRT_MEMCOPY_DEVICE_TO_DEVICE, handle.stream);
-    ASD_LOG_IF(ret != 0, ERROR) << GetName() << " AsdRtMemCopy fail";
-    ASD_LOG(INFO) << GetName() << " Execute end";
+    at::Tensor *atInTensorA = AsdOps::GetSingleton<TensorCache>().GetTensor(variantPack.inTensors.at(0).data);
+    at::Tensor *atInTensorB = AsdOps::GetSingleton<TensorCache>().GetTensor(variantPack.inTensors.at(1).data);
+    at::Tensor *addResultTensor = AsdOps::GetSingleton<TensorCache>().GetTensor(variantPack.outTensors.at(0).data);
+    *addResultTensor = torch::add(*atInTensorA, *atInTensorB);
+
     return AsdOps::Status::OkStatus();
 }
 } // namespace AclTransformer
\ No newline at end of file
diff --git a/core/ops/add/add_torch_runner.h b/core/ops/add/add_torch_runner.h
index 39a29184..b54e3f6a 100644
--- a/core/ops/add/add_torch_runner.h
+++ b/core/ops/add/add_torch_runner.h
@@ -23,7 +23,9 @@ class AddTorchRunner : public Runner {
 public:
     AddTorchRunner(const AddParam &param);
     virtual ~AddTorchRunner();
-    AsdOps::Status Execute(Handle &handle, VariantPack &variantPack) override;
+
+private:
+    AsdOps::Status ExecuteImpl(Handle &handle, VariantPack &variantPack) override;
 
 private:
     AddParam param_;
diff --git a/core/ops/add_norm/add_norm_operation.cpp b/core/ops/add_norm/add_norm_operation.cpp
index e930d55c..d12041c2 100644
--- a/core/ops/add_norm/add_norm_operation.cpp
+++ b/core/ops/add_norm/add_norm_operation.cpp
@@ -27,7 +27,7 @@ AddNormOperation::AddNormOperation(const AddNormParam &param) : Operation("AddNo
 AddNormOperation::~AddNormOperation() {}
 
 AsdOps::Status AddNormOperation::InferShape(const AsdOps::SVector<AsdOps::Tensor> &inTensors,
-                                            std::vector<AsdOps::TensorDesc> &outTensorDescs)
+                                            AsdOps::SVector<AsdOps::TensorDesc> &outTensorDescs)
 {
     if (inTensors.size() != 4) {
         return AsdOps::Status::FailStatus(1, "inTensorDescs size is not 4");
diff --git a/core/ops/add_norm/add_norm_ops_runner.cpp b/core/ops/add_norm/add_norm_ops_runner.cpp
index 85538f67..8dbee900 100644
--- a/core/ops/add_norm/add_norm_ops_runner.cpp
+++ b/core/ops/add_norm/add_norm_ops_runner.cpp
@@ -28,7 +28,7 @@ AddNormOpsRunner::AddNormOpsRunner(const AddNormParam &param) : OpsRunner("AddNo
 
 AddNormOpsRunner::~AddNormOpsRunner() {}
 
-AsdOps::Status AddNormOpsRunner::Setup(VariantPack &variantPack)
+AsdOps::Status AddNormOpsRunner::SetupKernelGraph(const VariantPack &variantPack)
 {
     kernelGraph_.inTensors.resize(4);
     AsdOps::Tensor &xTensor = kernelGraph_.inTensors.at(0);
@@ -64,10 +64,10 @@ AsdOps::Status AddNormOpsRunner::Setup(VariantPack &variantPack)
     layerNormNode.inTensors = {&addNodeResultTensor, &weightTensor, &biasTensor};
     layerNormNode.outTensors = {&resultTensor, &layerNormMeanTensor, &layerNormVarianceTensor};
 
-    return OpsRunner::Setup(variantPack);
+    return AsdOps::Status::OkStatus();
 }
 
-bool AddNormOpsRunner::CalcLayerNormTensor(VariantPack &variantPack, int64_t &beginDim)
+bool AddNormOpsRunner::CalcLayerNormTensor(const VariantPack &variantPack, int64_t &beginDim)
 {
     AsdOps::TensorDesc inputDesc;
     inputDesc.dtype = variantPack.inTensors.at(0).desc.dtype;
@@ -77,8 +77,8 @@ bool AddNormOpsRunner::CalcLayerNormTensor(VariantPack &variantPack, int64_t &be
         inputDesc.dims = variantPack.inTensors.at(1).desc.dims;
     }
 
-    AsdOps::Tensor &weightTensor = variantPack.inTensors.at(2);
-    AsdOps::Tensor &biasTensor = variantPack.inTensors.at(3);
+    const AsdOps::Tensor &weightTensor = variantPack.inTensors.at(2);
+    const AsdOps::Tensor &biasTensor = variantPack.inTensors.at(3);
 
     ASD_LOG(INFO) << GetName() << " layer norm input desc:" << AsdOpsTensorDescToString(inputDesc)
                   << ", weightTensor:" << AsdOpsTensorToString(weightTensor)
@@ -93,6 +93,7 @@ bool AddNormOpsRunner::CalcLayerNormTensor(VariantPack &variantPack, int64_t &be
 
     ASD_LOG(INFO) << GetName() << " M:" << M;
     if (M < 0) {
+        layerNormMeanTensor.desc.format = inputDesc.format;
         layerNormMeanTensor.desc.dtype = inputDesc.dtype;
         layerNormMeanTensor.desc.dims = {M};
         layerNormVarianceTensor.desc = layerNormMeanTensor.desc;
@@ -114,6 +115,7 @@ bool AddNormOpsRunner::CalcLayerNormTensor(VariantPack &variantPack, int64_t &be
             break;
         }
     }
+    layerNormMeanTensor.desc.format = weightTensor.desc.format;
     layerNormMeanTensor.desc.dtype = weightTensor.desc.dtype;
     layerNormMeanTensor.desc.dims = reduceDims;
     layerNormVarianceTensor.desc = layerNormMeanTensor.desc;
diff --git a/core/ops/add_norm/add_norm_ops_runner.h b/core/ops/add_norm/add_norm_ops_runner.h
index 7cd73221..eb713582 100644
--- a/core/ops/add_norm/add_norm_ops_runner.h
+++ b/core/ops/add_norm/add_norm_ops_runner.h
@@ -23,12 +23,10 @@ class AddNormOpsRunner : public OpsRunner {
 public:
     AddNormOpsRunner(const AddNormParam &param);
     virtual ~AddNormOpsRunner();
-    AsdOps::Status Setup(VariantPack &variantPack) override;
-    // uint64_t GetWorkspaceSize() override;
-    // AsdOps::Status Execute(Handle &handle, VariantPack &variantPack) override;
 
-private:
-    bool CalcLayerNormTensor(VariantPack &variantPack, int64_t &beginDim);
+protected:
+    AsdOps::Status SetupKernelGraph(const VariantPack &variantPack) override;
+    bool CalcLayerNormTensor(const VariantPack &variantPack, int64_t &beginDim);
 
 private:
     AddNormParam param_;
diff --git a/core/ops/add_norm/add_norm_torch_runner.cpp b/core/ops/add_norm/add_norm_torch_runner.cpp
index 2608a504..ca6bbedb 100644
--- a/core/ops/add_norm/add_norm_torch_runner.cpp
+++ b/core/ops/add_norm/add_norm_torch_runner.cpp
@@ -20,30 +20,29 @@
 #include <asdops/utils/log/log.h>
 #include <asdops/utils/rt/rt.h>
 #include "acltransformer/utils/tensor_util.h"
+#include "acltransformer/utils/tensor_cache.h"
 
 namespace AclTransformer {
 AddNormTorchRunner::AddNormTorchRunner(const AddNormParam &param) : Runner("AddNormTorchRunner"), param_(param) {}
 
 AddNormTorchRunner::~AddNormTorchRunner() {}
 
-AsdOps::Status AddNormTorchRunner::Execute(Handle &handle, VariantPack &variantPack)
+AsdOps::Status AddNormTorchRunner::ExecuteImpl(Handle &handle, VariantPack &variantPack)
 {
     if (variantPack.inTensors.size() != 4) {
         return AsdOps::Status::FailStatus(1, "AddNormTorchRunner inTensor num error!");
     }
 
-    at::Tensor atInTensorA = AsdOpsTensor2AtTensor(handle, variantPack.inTensors[0]);
-    at::Tensor atInTensorB = AsdOpsTensor2AtTensor(handle, variantPack.inTensors[1]);
-    at::Tensor atInTensorWeight = AsdOpsTensor2AtTensor(handle, variantPack.inTensors[2]);
-    at::Tensor atInTensorBias = AsdOpsTensor2AtTensor(handle, variantPack.inTensors[3]);
-    at::Tensor addResultTensor = at::add(atInTensorA, atInTensorB);
-    const double eps = 1e-12;
-    at::Tensor outputTensor =
-        at::layer_norm(addResultTensor, atInTensorWeight.sizes(), atInTensorWeight, atInTensorBias, eps).contiguous();
-    int ret = AsdRtMemCopyAsync(variantPack.outTensors[0].data, variantPack.outTensors[0].dataSize,
-                                outputTensor.storage().data_ptr().get(), variantPack.outTensors[0].dataSize,
-                                ASDRT_MEMCOPY_DEVICE_TO_DEVICE, handle.stream);
-    ASD_LOG_IF(ret != 0, ERROR) << "AsdRtMemCopy fail";
+    at::Tensor *atInTensorA = AsdOps::GetSingleton<TensorCache>().GetTensor(variantPack.inTensors[0].data);
+    at::Tensor *atInTensorB = AsdOps::GetSingleton<TensorCache>().GetTensor(variantPack.inTensors[1].data);
+    at::Tensor *atInTensorWeight = AsdOps::GetSingleton<TensorCache>().GetTensor(variantPack.inTensors[2].data);
+    at::Tensor *atInTensorBias = AsdOps::GetSingleton<TensorCache>().GetTensor(variantPack.inTensors[3].data);
+    at::Tensor *atOutTensor = AsdOps::GetSingleton<TensorCache>().GetTensor(variantPack.outTensors[0].data);
+
+    *atOutTensor = at::layer_norm(at::add(*atInTensorA, *atInTensorB), atInTensorWeight->sizes(), *atInTensorWeight,
+                                  *atInTensorBias, param_.layerNormEps)
+                       .contiguous();
+
     return AsdOps::Status::OkStatus();
 }
 } // namespace AclTransformer
\ No newline at end of file
diff --git a/core/ops/add_norm/add_norm_torch_runner.h b/core/ops/add_norm/add_norm_torch_runner.h
index 3be5c40c..5a2b3ab3 100644
--- a/core/ops/add_norm/add_norm_torch_runner.h
+++ b/core/ops/add_norm/add_norm_torch_runner.h
@@ -23,7 +23,9 @@ class AddNormTorchRunner : public Runner {
 public:
     AddNormTorchRunner(const AddNormParam &param);
     virtual ~AddNormTorchRunner();
-    AsdOps::Status Execute(Handle &handle, VariantPack &variantPack) override;
+
+protected:
+    AsdOps::Status ExecuteImpl(Handle &handle, VariantPack &variantPack) override;
 
 private:
     AddNormParam param_;
diff --git a/core/ops/ffn/ffn_operation.cpp b/core/ops/ffn/ffn_operation.cpp
index 75db261c..c26d9db3 100644
--- a/core/ops/ffn/ffn_operation.cpp
+++ b/core/ops/ffn/ffn_operation.cpp
@@ -33,7 +33,7 @@ FfnOperation::FfnOperation(const FfnParam &param) : Operation("FfnOperation"), p
 FfnOperation::~FfnOperation() {}
 
 AsdOps::Status FfnOperation::InferShape(const AsdOps::SVector<AsdOps::Tensor> &inTensors,
-                                        std::vector<AsdOps::TensorDesc> &outTensorDescs)
+                                        AsdOps::SVector<AsdOps::TensorDesc> &outTensorDescs)
 {
     if (inTensors.size() != 3) {
         return AsdOps::Status::FailStatus(1, "inTensorDescs size is not 3");
@@ -47,8 +47,8 @@ AsdOps::Status FfnOperation::InferShape(const AsdOps::SVector<AsdOps::Tensor> &i
     return AsdOps::Status::OkStatus();
 }
 
-bool FfnOperation::IsConsistent(const std::vector<AsdOps::TensorDesc> &inTensorDescs,
-                                std::vector<AsdOps::TensorDesc> &outTensorDescs) const
+bool FfnOperation::IsConsistent(const AsdOps::SVector<AsdOps::TensorDesc> &inTensorDescs,
+                                AsdOps::SVector<AsdOps::TensorDesc> &outTensorDescs) const
 {
     ASDOPS_CHECK_TRUE(inTensorDescs.size() == static_cast<size_t>(DIM_3), return false);
     ASDOPS_CHECK_TRUE(outTensorDescs.size() == static_cast<size_t>(DIM_1), return false);
diff --git a/core/ops/ffn/ffn_ops_runner.cpp b/core/ops/ffn/ffn_ops_runner.cpp
index 7a0fd165..e17cdde2 100644
--- a/core/ops/ffn/ffn_ops_runner.cpp
+++ b/core/ops/ffn/ffn_ops_runner.cpp
@@ -19,7 +19,9 @@
 #include <asdops/params/matmul.h>
 
 namespace AclTransformer {
-FfnOpsRunner::FfnOpsRunner(const FfnParam &param) : OpsRunner("FfnOpsRunner"), param_(param)
+FfnOpsRunner::FfnOpsRunner(const FfnParam &param) : OpsRunner("FfnOpsRunner"), param_(param) {}
+
+AsdOps::Status FfnOpsRunner::SetupKernelGraph(const VariantPack &variantPack)
 {
     kernelGraph_.inTensors.resize(3);
     AsdOps::Tensor &aTensor = kernelGraph_.inTensors[0];
@@ -63,9 +65,11 @@ FfnOpsRunner::FfnOpsRunner(const FfnParam &param) : OpsRunner("FfnOpsRunner"), p
     addNode.outTensors = {&addOutTensor};
 
     geluNode.opDesc = {0, "ActivationOperation",
-                     AsdOps::OpParam::Activation({AsdOps::OpParam::Activation::ACTIVATION_GELU})};
+                       AsdOps::OpParam::Activation({AsdOps::OpParam::Activation::ACTIVATION_GELU})};
     geluNode.inTensors = {&addOutTensor};
     geluNode.outTensors = {&operationOutTensor};
+
+    return AsdOps::Status::OkStatus();
 }
 
 FfnOpsRunner::~FfnOpsRunner() {}
diff --git a/core/ops/ffn/ffn_ops_runner.h b/core/ops/ffn/ffn_ops_runner.h
index c0f0195d..c1c93117 100644
--- a/core/ops/ffn/ffn_ops_runner.h
+++ b/core/ops/ffn/ffn_ops_runner.h
@@ -24,6 +24,9 @@ public:
     FfnOpsRunner(const FfnParam &param);
     virtual ~FfnOpsRunner();
 
+protected:
+    AsdOps::Status SetupKernelGraph(const VariantPack &variantPack) override;
+
 private:
     FfnParam param_;
 };
diff --git a/core/ops/ffn/ffn_torch_runner.cpp b/core/ops/ffn/ffn_torch_runner.cpp
index 1bbb6bea..d8d32a11 100644
--- a/core/ops/ffn/ffn_torch_runner.cpp
+++ b/core/ops/ffn/ffn_torch_runner.cpp
@@ -19,6 +19,7 @@
 #include <asdops/utils/rt/rt.h>
 #include <asdops/utils/log/log.h>
 #include "acltransformer/utils/tensor_util.h"
+#include "acltransformer/utils/tensor_cache.h"
 
 namespace AclTransformer {
 FfnTorchRunner::FfnTorchRunner(const FfnParam &param) : Runner("FfnTorchRunner"), param_(param)
@@ -28,20 +29,18 @@ FfnTorchRunner::FfnTorchRunner(const FfnParam &param) : Runner("FfnTorchRunner")
 
 FfnTorchRunner::~FfnTorchRunner() {}
 
-AsdOps::Status FfnTorchRunner::Execute(Handle &handle, VariantPack &variantPack)
+AsdOps::Status FfnTorchRunner::ExecuteImpl(Handle &handle, VariantPack &variantPack)
 {
     if (variantPack.inTensors.size() != 3) {
         return AsdOps::Status::FailStatus(1, "FfnTorchRunner inTensor num error!");
     }
-    at::Tensor atInTensorA = AsdOpsTensor2AtTensor(handle, variantPack.inTensors[0]);
-    at::Tensor atInTensorWeight = AsdOpsTensor2AtTensor(handle, variantPack.inTensors[1]);
-    at::Tensor atInTensorBias = AsdOpsTensor2AtTensor(handle, variantPack.inTensors[2]);
-    at::Tensor outTensor = at::gelu(at::linear(atInTensorA, atInTensorWeight, atInTensorBias)).contiguous();
-    int ret = AsdRtMemCopyAsync(variantPack.outTensors[0].data, variantPack.outTensors[0].dataSize,
-                                outTensor.storage().data_ptr().get(), variantPack.outTensors[0].dataSize,
-                                ASDRT_MEMCOPY_DEVICE_TO_DEVICE, handle.stream);
+    at::Tensor *atInTensorA = AsdOps::GetSingleton<TensorCache>().GetTensor(variantPack.inTensors[0].data);
+    at::Tensor *atInTensorWeight = AsdOps::GetSingleton<TensorCache>().GetTensor(variantPack.inTensors[1].data);
+    at::Tensor *atInTensorBias = AsdOps::GetSingleton<TensorCache>().GetTensor(variantPack.inTensors[2].data);
+    at::Tensor *atOutTensor = AsdOps::GetSingleton<TensorCache>().GetTensor(variantPack.outTensors[0].data);
+
+    *atOutTensor = at::gelu(at::linear(*atInTensorA, *atInTensorWeight, *atInTensorBias)).contiguous();
 
-    ASD_LOG_IF(ret != 0, ERROR) << "AsdRtMemCopy fail";
     return AsdOps::Status::OkStatus();
 }
 } // namespace AclTransformer
\ No newline at end of file
diff --git a/core/ops/ffn/ffn_torch_runner.h b/core/ops/ffn/ffn_torch_runner.h
index c53c074a..5b816f8d 100644
--- a/core/ops/ffn/ffn_torch_runner.h
+++ b/core/ops/ffn/ffn_torch_runner.h
@@ -23,8 +23,10 @@ class FfnTorchRunner : public Runner {
 public:
     FfnTorchRunner(const FfnParam &param);
     virtual ~FfnTorchRunner();
-    AsdOps::Status Execute(Handle &handle, VariantPack &variantPack) override;
 
+protected:
+    AsdOps::Status ExecuteImpl(Handle &handle, VariantPack &variantPack) override;
+
 private:
     FfnParam param_;
 };
diff --git a/core/ops/linear/linear_operation.cpp b/core/ops/linear/linear_operation.cpp
index 9adf3ea6..4b2c59cd 100644
--- a/core/ops/linear/linear_operation.cpp
+++ b/core/ops/linear/linear_operation.cpp
@@ -34,7 +34,7 @@ LinearOperation::LinearOperation(const LinearParam &param) : Operation("LinearOp
 LinearOperation::~LinearOperation() {}
 
 AsdOps::Status LinearOperation::InferShape(const AsdOps::SVector<AsdOps::Tensor> &inTensors,
-                                           std::vector<AsdOps::TensorDesc> &outTensorDescs)
+                                           AsdOps::SVector<AsdOps::TensorDesc> &outTensorDescs)
 {
     // in * weight + bias
     // in[0,1] + weight[1]
@@ -50,8 +50,8 @@ AsdOps::Status LinearOperation::InferShape(const AsdOps::SVector<AsdOps::Tensor>
     return AsdOps::Status::OkStatus();
 }
 
-bool LinearOperation::IsConsistent(const std::vector<AsdOps::TensorDesc> &inTensorDescs,
-                                   std::vector<AsdOps::TensorDesc> &outTensorDescs) const
+bool LinearOperation::IsConsistent(const AsdOps::SVector<AsdOps::TensorDesc> &inTensorDescs,
+                                   AsdOps::SVector<AsdOps::TensorDesc> &outTensorDescs) const
 {
     ASDOPS_CHECK_TRUE(inTensorDescs.size() == static_cast<size_t>(DIM_3), return false);
     ASDOPS_CHECK_TRUE(outTensorDescs.size() == static_cast<size_t>(DIM_1), return false);
diff --git a/core/ops/linear/linear_ops_runner.cpp b/core/ops/linear/linear_ops_runner.cpp
index a5d0424b..9c996272 100644
--- a/core/ops/linear/linear_ops_runner.cpp
+++ b/core/ops/linear/linear_ops_runner.cpp
@@ -28,7 +28,7 @@ LinearOpsRunner::LinearOpsRunner(LinearParam &param) : OpsRunner("LinearOpsRunne
 
 LinearOpsRunner::~LinearOpsRunner() {}
 
-AsdOps::Status LinearOpsRunner::Setup(VariantPack &variantPack)
+AsdOps::Status LinearOpsRunner::SetupKernelGraph(const VariantPack &variantPack)
 {
     VariantPack newVariantPack;
     ConvertNewVariantPack(variantPack, newVariantPack);
@@ -85,14 +85,14 @@ AsdOps::Status LinearOpsRunner::Setup(VariantPack &variantPack)
     addNode.inTensors = {&transdata2ResultTensor, &biasTensor};
     addNode.outTensors = {&resultTensor};
 
-    return OpsRunner::Setup(newVariantPack);
+    return AsdOps::Status::OkStatus();
 }
 
-AsdOps::Status LinearOpsRunner::Execute(Handle &handle, VariantPack &variantPack)
+AsdOps::Status LinearOpsRunner::ExecuteImpl(Handle &handle, VariantPack &variantPack)
 {
     VariantPack newVariantPack;
     ConvertNewVariantPack(variantPack, newVariantPack);
-    return OpsRunner::Execute(handle, newVariantPack);
+    return OpsRunner::ExecuteImpl(handle, newVariantPack);
 }
 
 void LinearOpsRunner::ConvertNewVariantPack(const VariantPack &variantPack, VariantPack &newVariantPack)
diff --git a/core/ops/linear/linear_ops_runner.h b/core/ops/linear/linear_ops_runner.h
index 37fa0a3c..6e4b86a0 100644
--- a/core/ops/linear/linear_ops_runner.h
+++ b/core/ops/linear/linear_ops_runner.h
@@ -23,8 +23,10 @@ class LinearOpsRunner : public OpsRunner {
 public:
     LinearOpsRunner(LinearParam &param);
     virtual ~LinearOpsRunner();
-    AsdOps::Status Setup(VariantPack &variantPack) override;
-    AsdOps::Status Execute(Handle &handle, VariantPack &variantPack) override;
+
+protected:
+    AsdOps::Status SetupKernelGraph(const VariantPack &variantPack) override;
+    AsdOps::Status ExecuteImpl(Handle &handle, VariantPack &variantPack) override;
 
 private:
     void ConvertNewVariantPack(const VariantPack &variantPack, VariantPack &newVariantPack);
diff --git a/core/ops/linear/linear_torch_runner.cpp b/core/ops/linear/linear_torch_runner.cpp
index 122b5bf9..50e0b7ca 100644
--- a/core/ops/linear/linear_torch_runner.cpp
+++ b/core/ops/linear/linear_torch_runner.cpp
@@ -28,20 +28,12 @@ LinearTorchRunner::LinearTorchRunner(LinearParam &param) : Runner("LinearTorchRu
 
 LinearTorchRunner::~LinearTorchRunner() {}
 
-AsdOps::Status LinearTorchRunner::Execute(Handle &handle, VariantPack &variantPack)
+AsdOps::Status LinearTorchRunner::ExecuteImpl(Handle &handle, VariantPack &variantPack)
 {
     if (variantPack.inTensors.size() != 3) {
         return AsdOps::Status::FailStatus(1, "LinearTorchRunner inTensor num error!");
     }
 
-    // at::Tensor atInTensorA = AsdOpsTensor2AtTensor(handle, variantPack.inTensors[0]);
-    // at::Tensor atInTensorWeight = AsdOpsTensor2AtTensor(handle, variantPack.inTensors[1]);
-    // at::Tensor atInTensorWeightias = AsdOpsTensor2AtTensor(handle, variantPack.inTensors[2]);
-    // int ret = AsdRtMemCopyAsync(variantPack.outTensors[0].data, variantPack.outTensors[0].dataSize,
-    //                             outputTensor.storage().data_ptr().get(), variantPack.outTensors[0].dataSize,
-    //                             ASDRT_MEMCOPY_DEVICE_TO_DEVICE, handle.stream);
-    // ASD_LOG_IF(ret != 0, ERROR) << GetName() << " AsdRtMemCopy fail";
-
     at::Tensor *atInTensorA =
         AsdOps::GetSingleton<AclTransformer::TensorCache>().GetTensor(variantPack.inTensors[0].data);
     at::Tensor *atInTensorWeight =
@@ -52,7 +44,6 @@ AsdOps::Status LinearTorchRunner::Execute(Handle &handle, VariantPack &variantPa
         AsdOps::GetSingleton<AclTransformer::TensorCache>().GetTensor(variantPack.outTensors[0].data);
     at::Tensor outputTensor = at::linear(*atInTensorA, *atInTensorWeight, *atInTensorWeightias).contiguous();
     *atResult = outputTensor;
-    ASD_LOG(INFO) << GetName() << " use cache tensor";
 
     return AsdOps::Status::OkStatus();
 }
diff --git a/core/ops/linear/linear_torch_runner.h b/core/ops/linear/linear_torch_runner.h
index af1f3227..b21ad7fe 100644
--- a/core/ops/linear/linear_torch_runner.h
+++ b/core/ops/linear/linear_torch_runner.h
@@ -23,7 +23,9 @@ class LinearTorchRunner : public Runner {
 public:
     LinearTorchRunner(LinearParam &param);
     virtual ~LinearTorchRunner();
-    AsdOps::Status Execute(Handle &handle, VariantPack &variantPack) override;
+
+protected:
+    AsdOps::Status ExecuteImpl(Handle &handle, VariantPack &variantPack) override;
 
 private:
     LinearParam param_;
diff --git a/core/ops/self_attention/self_attention_operation.cpp b/core/ops/self_attention/self_attention_operation.cpp
index 419e2540..29fade09 100644
--- a/core/ops/self_attention/self_attention_operation.cpp
+++ b/core/ops/self_attention/self_attention_operation.cpp
@@ -27,7 +27,7 @@ SelfAttentionOperation::SelfAttentionOperation(const SelfAttentionParam &param)
 SelfAttentionOperation::~SelfAttentionOperation() {}
 
 AsdOps::Status SelfAttentionOperation::InferShape(const AsdOps::SVector<AsdOps::Tensor> &inTensors,
-                                                  std::vector<AsdOps::TensorDesc> &outTensorDescs)
+                                                  AsdOps::SVector<AsdOps::TensorDesc> &outTensorDescs)
 {
     if (inTensors.size() != 4) {
-        return AsdOps::Status::FailStatus(1, "inTensorDescs size is not 2");
+        return AsdOps::Status::FailStatus(1, "inTensorDescs size is not 4");
diff --git a/core/ops/self_attention/self_attention_ops_runner.cpp b/core/ops/self_attention/self_attention_ops_runner.cpp
index 60061cb2..7337a4d4 100644
--- a/core/ops/self_attention/self_attention_ops_runner.cpp
+++ b/core/ops/self_attention/self_attention_ops_runner.cpp
@@ -26,5 +26,10 @@ SelfAttentionOpsRunner::SelfAttentionOpsRunner(const SelfAttentionParam &param)
     ASD_LOG(INFO) << "SelfAttentionOperation::SelfAttentionOperation called";
 }
 
+AsdOps::Status SelfAttentionOpsRunner::SetupKernelGraph(const VariantPack &variantPack)
+{
+    return AsdOps::Status::OkStatus();
+}
+
 SelfAttentionOpsRunner::~SelfAttentionOpsRunner() {}
 } // namespace AclTransformer
diff --git a/core/ops/self_attention/self_attention_ops_runner.h b/core/ops/self_attention/self_attention_ops_runner.h
index 91be308a..831f098a 100644
--- a/core/ops/self_attention/self_attention_ops_runner.h
+++ b/core/ops/self_attention/self_attention_ops_runner.h
@@ -24,6 +24,9 @@ public:
     SelfAttentionOpsRunner(const SelfAttentionParam &param);
     virtual ~SelfAttentionOpsRunner();
 
+protected:
+    AsdOps::Status SetupKernelGraph(const VariantPack &variantPack) override;
+
 private:
     SelfAttentionParam param_;
 };
diff --git a/core/ops/self_attention/self_attention_torch_runner.cpp b/core/ops/self_attention/self_attention_torch_runner.cpp
index ac3884ee..5db00ddc 100644
--- a/core/ops/self_attention/self_attention_torch_runner.cpp
+++ b/core/ops/self_attention/self_attention_torch_runner.cpp
@@ -19,6 +19,7 @@
 #include <asdops/utils/rt/rt.h>
 #include <ATen/ATen.h>
 #include <cmath>
+#include "acltransformer/utils/tensor_cache.h"
 
 namespace AclTransformer {
 SelfAttentionTorchRunner::SelfAttentionTorchRunner(const SelfAttentionParam &param)
@@ -29,16 +30,16 @@ SelfAttentionTorchRunner::SelfAttentionTorchRunner(const SelfAttentionParam &par
 
 SelfAttentionTorchRunner::~SelfAttentionTorchRunner() {}
 
-AsdOps::Status SelfAttentionTorchRunner::Execute(Handle &handle, VariantPack &variantPack)
+AsdOps::Status SelfAttentionTorchRunner::ExecuteImpl(Handle &handle, VariantPack &variantPack)
 {
     // 384, 32, 1024 -> 384, 32, 1024
     ASD_LOG(INFO) << "headNum:" << this->param_.headNum << "   dk:" << this->param_.dk;
-    torch::Tensor mixedQuery = AsdOpsTensor2AtTensor(handle, variantPack.inTensors[0]);
+    torch::Tensor mixedQuery = *AsdOps::GetSingleton<TensorCache>().GetTensor(variantPack.inTensors[0].data);
     mixedQuery = mixedQuery.view({mixedQuery.sizes()[0], mixedQuery.sizes()[1] * this->param_.headNum,
                                   mixedQuery.sizes()[2] / this->param_.headNum});
     mixedQuery = torch::transpose(mixedQuery, 0, 1);
-    torch::Tensor mixedKey = AsdOpsTensor2AtTensor(handle, variantPack.inTensors[1]);
-    torch::Tensor mixedValue = AsdOpsTensor2AtTensor(handle, variantPack.inTensors[2]);
+    torch::Tensor mixedKey = *AsdOps::GetSingleton<TensorCache>().GetTensor(variantPack.inTensors[1].data);
+    torch::Tensor mixedValue = *AsdOps::GetSingleton<TensorCache>().GetTensor(variantPack.inTensors[2].data);
     mixedValue = mixedValue.view({mixedValue.sizes()[0], mixedValue.sizes()[1] * this->param_.headNum,
                                   mixedValue.sizes()[2] / this->param_.headNum});
     mixedValue = torch::transpose(mixedValue, 0, 1);
@@ -46,7 +47,7 @@ AsdOps::Status SelfAttentionTorchRunner::Execute(Handle &handle, VariantPack &va
         {mixedKey.sizes()[0], mixedKey.sizes()[1] * this->param_.headNum, mixedKey.sizes()[2] / this->param_.headNum});
     mixedKey = mixedKey.permute({1, 2, 0});
 
-    torch::Tensor attention_mask = AsdOpsTensor2AtTensor(handle, variantPack.inTensors[3]);
+    torch::Tensor attention_mask = *AsdOps::GetSingleton<TensorCache>().GetTensor(variantPack.inTensors[3].data);
 
     double scal = 1 / sqrt(this->param_.dk);
     torch::Tensor attentionScores = torch::bmm(mixedQuery, mixedKey).contiguous();
@@ -65,10 +66,9 @@ AsdOps::Status SelfAttentionTorchRunner::Execute(Handle &handle, VariantPack &va
                               contextLayer.sizes()[2] * this->param_.headNum})
                        .contiguous();
 
-    int ret = AsdRtMemCopyAsync(variantPack.outTensors[0].data, variantPack.outTensors[0].dataSize,
-                                contextLayer.storage().data_ptr().get(), variantPack.outTensors[0].dataSize,
-                                ASDRT_MEMCOPY_DEVICE_TO_DEVICE, handle.stream);
-    ASD_LOG_IF(ret != 0, ERROR) << "AsdRtMemCopy fail";
+    torch::Tensor *atOutTensor = AsdOps::GetSingleton<TensorCache>().GetTensor(variantPack.outTensors[0].data);
+    *atOutTensor = contextLayer;
+
     return AsdOps::Status::OkStatus();
 }
 } // namespace AclTransformer
\ No newline at end of file
diff --git a/core/ops/self_attention/self_attention_torch_runner.h b/core/ops/self_attention/self_attention_torch_runner.h
index 24735920..4e675d22 100644
--- a/core/ops/self_attention/self_attention_torch_runner.h
+++ b/core/ops/self_attention/self_attention_torch_runner.h
@@ -23,7 +23,9 @@ class SelfAttentionTorchRunner : public Runner {
 public:
     SelfAttentionTorchRunner(const SelfAttentionParam &param);
     virtual ~SelfAttentionTorchRunner();
-    AsdOps::Status Execute(Handle &handle, VariantPack &variantPack) override;
+
+protected:
+    AsdOps::Status ExecuteImpl(Handle &handle, VariantPack &variantPack) override;
 
 private:
     SelfAttentionParam param_;
diff --git a/core/utils/tensor_util.cpp b/core/utils/tensor_util.cpp
index 17e5d4c5..9050f462 100644
--- a/core/utils/tensor_util.cpp
+++ b/core/utils/tensor_util.cpp
@@ -22,7 +22,7 @@
 #include "acltransformer/utils/tensor_cache.h"
 
 namespace AclTransformer {
-void GetTensorDescs(const std::vector<AsdOps::Tensor> &tensors, std::vector<AsdOps::TensorDesc> &tensorDescs)
+void GetTensorDescs(const std::vector<AsdOps::Tensor> &tensors, AsdOps::SVector<AsdOps::TensorDesc> &tensorDescs)
 {
     tensorDescs.resize(tensors.size());
     for (size_t i = 0; i < tensors.size(); ++i) {
@@ -30,16 +30,27 @@ void GetTensorDescs(const std::vector<AsdOps::Tensor> &tensors, std::vector<AsdO
     }
 }
 
-uint64_t CalcTensorDataSize(const AsdOps::Tensor &tensor)
+uint64_t CalcTensorDataSize(const AsdOps::Tensor &tensor) { return CalcTensorDataSize(tensor.desc); }
+
+uint64_t CalcTensorDataSize(const AsdOps::TensorDesc &tensorDesc)
 {
+    if (tensorDesc.dims.size() == 0) {
+        return 0;
+    }
+
     uint64_t dataItemSize = 0;
-    switch (tensor.desc.dtype) {
+    switch (tensorDesc.dtype) {
     case AsdOps::TENSOR_DTYPE_FLOAT: dataItemSize = sizeof(float); break;
     case AsdOps::TENSOR_DTYPE_FLOAT16: dataItemSize = 2; break;
-    default: ASD_LOG(ERROR) << "not support dtype:" << tensor.desc.dtype;
+    default: ASD_LOG(ERROR) << "not support dtype:" << tensorDesc.dtype;
+    }
+
+    int64_t elementCount = 1;
+    for (auto i : tensorDesc.dims) {
+        elementCount *= i;
     }
 
-    return dataItemSize * tensor.Numel();
+    return dataItemSize * elementCount;
 }
 
 static at::IntArrayRef IntArrayRef(const AsdOps::SVector<int64_t> &src)
diff --git a/examples/torch/ops/operation/operation_creator.cpp b/examples/torch/operation/operation_creator.cpp
similarity index 68%
rename from examples/torch/ops/operation/operation_creator.cpp
rename to examples/torch/operation/operation_creator.cpp
index 472cc2fb..2428c3e0 100644
--- a/examples/torch/ops/operation/operation_creator.cpp
+++ b/examples/torch/operation/operation_creator.cpp
@@ -20,6 +20,8 @@
 #include "acltransformer/ops/add_operation.h"
 #include "acltransformer/ops/add_norm_operation.h"
 #include "acltransformer/ops/linear_operation.h"
+#include "acltransformer/ops/ffn_operation.h"
+#include "acltransformer/ops/self_attention_operation.h"
 
 using OperationCreateFunc = std::function<AclTransformer::Operation *(const Json::Value &paramJson)>;
 
@@ -39,15 +41,32 @@ AclTransformer::Operation *AddNormOperationCreate(const Json::Value &paramJson)
 
 AclTransformer::Operation *LinearOperationCreate(const Json::Value &paramJson)
 {
-    AclTransformer::LinearParam linearParam;
-    linearParam.transposeA = paramJson["transposeA"].asBool();
-    linearParam.transposeB = paramJson["transposeB"].asBool();
-    return new AclTransformer::LinearOperation(linearParam);
+    AclTransformer::LinearParam param;
+    param.transposeA = paramJson["transposeA"].asBool();
+    param.transposeB = paramJson["transposeB"].asBool();
+    return new AclTransformer::LinearOperation(param);
+}
+
+AclTransformer::Operation *FfnOperationCreate(const Json::Value &paramJson)
+{
+    AclTransformer::FfnParam param;
+    return new AclTransformer::FfnOperation(param);
+}
+
+AclTransformer::Operation *SelfAttentionOperationCreate(const Json::Value &paramJson)
+{
+    AclTransformer::SelfAttentionParam param;
+    param.transKey = paramJson["transKey"].asBool();
+    param.dk = paramJson["dk"].asInt();
+    param.headNum = paramJson["headNum"].asInt();
+    return new AclTransformer::SelfAttentionOperation(param);
 }
 
 std::map<std::string, OperationCreateFunc> g_funcMap = {{"AddOperation", &AddOperationCreate},
                                                         {"AddNormOperation", &AddNormOperationCreate},
-                                                        {"LinearOperation", &LinearOperationCreate}};
+                                                        {"LinearOperation", &LinearOperationCreate},
+                                                        {"FfnOperation", &FfnOperationCreate},
+                                                        {"SelfAttentionOperation", &SelfAttentionOperationCreate}};
 
 AclTransformer::Operation *CreateOperation(const std::string &opName, const std::string &param)
 {
diff --git a/examples/torch/ops/operation/operation_creator.h b/examples/torch/operation/operation_creator.h
similarity index 100%
rename from examples/torch/ops/operation/operation_creator.h
rename to examples/torch/operation/operation_creator.h
diff --git a/examples/torch/ops/operation/operation_torch.cpp b/examples/torch/operation/operation_torch.cpp
similarity index 58%
rename from examples/torch/ops/operation/operation_torch.cpp
rename to examples/torch/operation/operation_torch.cpp
index 84d523cc..86c991f4 100644
--- a/examples/torch/ops/operation/operation_torch.cpp
+++ b/examples/torch/operation/operation_torch.cpp
@@ -17,6 +17,7 @@
 #include <asdops/utils/log/log.h>
 #include <asdops/utils/rt/rt.h>
 #include "acltransformer/utils/tensor_util.h"
+#include "acltransformer/utils/tensor_cache.h"
 #include "acltransformer/config.h"
 #include "examples/utils/example_utils.h"
 #include "operation_creator.h"
@@ -27,29 +28,42 @@ OperationTorch::~OperationTorch() {}
 
 void OperationTorch::Test() { ASD_LOG(INFO) << "OperationTorch::Test called"; }
 
-void OperationTorch::Execute(std::string opName, std::string param, std::vector<torch::Tensor> atInTensors,
-                             std::vector<torch::Tensor> atOutTensors)
+std::vector<torch::Tensor> OperationTorch::Execute(std::string opName, std::string param,
+                                                   std::vector<torch::Tensor> atInTensors)
 {
+    for (auto &inTensor : atInTensors) {
+        inTensor = inTensor.contiguous();
+    }
+
+    std::vector<torch::Tensor> atOutTensors;
+
     AclTransformer::Operation *operation = CreateOperation(opName, param);
     if (operation == nullptr) {
         ASD_LOG(ERROR) << "create operation fail, json:" << param;
-        return;
+        return atOutTensors;
     }
 
-        delete operation;
+    ExecuteOperation(operation, atInTensors, atOutTensors);
+
+    delete operation;
+    return atOutTensors;
 }
 
-void OperationTorch::ExecuteOperation(AclTransformer::Operation *operation, std::vector<torch::Tensor> atInTensors,
-                                      std::vector<torch::Tensor> atOutTensors)
+void OperationTorch::ExecuteOperation(AclTransformer::Operation *operation, std::vector<torch::Tensor> &atInTensors,
+                                      std::vector<torch::Tensor> &atOutTensors)  // out-param, filled below
 {
     AclTransformer::Handle handle = {GetCurrentStream()};
     AclTransformer::VariantPack variantPack;
     for (size_t i = 0; i < atInTensors.size(); ++i) {
-        atInTensors.at(i) = atInTensors.at(i).contiguous();
         variantPack.inTensors.push_back(AtTensor2AsdTensor(atInTensors.at(i)));
+        AsdOps::GetSingleton<AclTransformer::TensorCache>().AddTensor(atInTensors.at(i).data_ptr(), &atInTensors.at(i));
     }
+    // Each input is now registered in TensorCache under its data pointer; outputs are created and registered next.
+    CreateAtOutTensors(operation, variantPack.inTensors, atOutTensors);
     for (size_t i = 0; i < atOutTensors.size(); ++i) {
         variantPack.outTensors.push_back(AtTensor2AsdTensor(atOutTensors.at(i)));
+        AsdOps::GetSingleton<AclTransformer::TensorCache>().AddTensor(atOutTensors.at(i).data_ptr(),
+                                                                      &atOutTensors.at(i));  // address of a vector element
     }
 
     AsdOps::Status st = operation->Setup(variantPack);
@@ -60,6 +74,7 @@ void OperationTorch::ExecuteOperation(AclTransformer::Operation *operation, std:
 
     variantPack.workspaceSize = operation->GetWorkspaceSize();
     ASD_LOG(ERROR) << operation->GetName() << " GetWorkspaceSize:" << variantPack.workspaceSize;
+    // Workspace is allocated only for a non-zero size. NOTE(review): the size above is logged at ERROR level.
     if (variantPack.workspaceSize > 0) {
         int st = AsdRtMemMallocDevice((void **)&variantPack.workspace, variantPack.workspaceSize, ASDRT_MEM_DEFAULT);
         if (st != ASDRT_SUCCESS) {
@@ -84,6 +99,39 @@ void OperationTorch::ExecuteOperation(AclTransformer::Operation *operation, std:
         variantPack.workspace = nullptr;
         variantPack.workspaceSize = 0;
     }
+    // Unregister every tensor added above. NOTE(review): confirm error paths outside these hunks also reach this.
+    for (size_t i = 0; i < atInTensors.size(); ++i) {
+        AsdOps::GetSingleton<AclTransformer::TensorCache>().DeleteTensor(atInTensors.at(i).data_ptr());
+    }
+    for (size_t i = 0; i < atOutTensors.size(); ++i) {
+        AsdOps::GetSingleton<AclTransformer::TensorCache>().DeleteTensor(atOutTensors.at(i).data_ptr());
+    }
+}
+
+void OperationTorch::CreateAtOutTensors(AclTransformer::Operation *operation,
+                                        const AsdOps::SVector<AsdOps::Tensor> &inTensors,
+                                        std::vector<torch::Tensor> &atOutTensors)  // out-param, resized below
+{
+    AsdOps::SVector<AsdOps::TensorDesc> outTensorDescs;
+    operation->InferShape(inTensors, outTensorDescs);  // NOTE(review): any result/status of this call is not checked
+    // One zero-filled tensor is built per inferred descriptor, using that descriptor's dims.
+    atOutTensors.resize(outTensorDescs.size());
+    for (size_t i = 0; i < outTensorDescs.size(); ++i) {
+        at::TensorOptions options = at::TensorOptions();  // dtype stays at the default unless matched below
+        if (outTensorDescs.at(i).dtype == AsdOps::TENSOR_DTYPE_FLOAT) {
+            options = options.dtype(at::kFloat);
+        } else if (outTensorDescs.at(i).dtype == AsdOps::TENSOR_DTYPE_FLOAT16) {
+            options = options.dtype(at::kHalf);
+        }  // NOTE(review): any other descriptor dtype silently keeps the default dtype
+        at::Tensor newTensor =  // created without an explicit device, then moved by the .to() call below
+            at::zeros(at::IntArrayRef(outTensorDescs.at(i).dims.data(), outTensorDescs.at(i).dims.size()), options);
+#ifdef TORCH_18
+        newTensor = newTensor.to(at::Device(at::DeviceType::XLA));  // TORCH_18 builds use the XLA device type
+#else
+        newTensor = newTensor.to(at::Device(at::kPrivateUse1));  // other builds use the PrivateUse1 device type
+#endif
+        atOutTensors.at(i) = newTensor.contiguous();
+    }
 }
 
 TORCH_LIBRARY(OperationTorch, m)
diff --git a/examples/torch/ops/operation/operation_torch.h b/examples/torch/operation/operation_torch.h
similarity index 72%
rename from examples/torch/ops/operation/operation_torch.h
rename to examples/torch/operation/operation_torch.h
index 36fd3130..57758fda 100644
--- a/examples/torch/ops/operation/operation_torch.h
+++ b/examples/torch/operation/operation_torch.h
@@ -25,13 +25,14 @@ public:
     OperationTorch();
     ~OperationTorch();
     void Test();
-    void Execute(std::string opName, std::string param, std::vector<torch::Tensor> inTensors,
-                 std::vector<torch::Tensor> outTensors);
+    std::vector<torch::Tensor> Execute(std::string opName, std::string param, std::vector<torch::Tensor> inTensors);
     c10::intrusive_ptr<OperationTorch> clone() const { return c10::make_intrusive<OperationTorch>(); }
 
 private:
-    void ExecuteOperation(AclTransformer::Operation *operation, std::vector<torch::Tensor> atInTensors,
-                          std::vector<torch::Tensor> atOutTensors);
+    void ExecuteOperation(AclTransformer::Operation *operation, std::vector<torch::Tensor> &atInTensors,
+                          std::vector<torch::Tensor> &atOutTensors);
+    void CreateAtOutTensors(AclTransformer::Operation *operation, const AsdOps::SVector<AsdOps::Tensor> &inTensors,
+                            std::vector<torch::Tensor> &atOutTensors);
 };
 
 #endif
\ No newline at end of file
diff --git a/examples/torch/ops/operation/test_add_norm_operation_rand.py b/examples/torch/operation/test_add_norm_operation.py
similarity index 81%
rename from examples/torch/ops/operation/test_add_norm_operation_rand.py
rename to examples/torch/operation/test_add_norm_operation.py
index eaca7e6b..f43eb412 100644
--- a/examples/torch/ops/operation/test_add_norm_operation_rand.py
+++ b/examples/torch/operation/test_add_norm_operation.py
@@ -32,24 +32,23 @@ class TestAddNormal(unittest.TestCase):
     def test_2d(self):
         operation = torch.classes.OperationTorch.OperationTorch()
         operation.test()
-        a = torch.rand(2, 3).npu()
-        b = torch.rand(2, 3).npu()
-        normWeight = torch.rand(3).npu()
-        normBias = torch.rand(3).npu()
+        a = torch.rand(2, 3).npu().half()  # all four inputs are fp16 tensors moved with .npu()
+        b = torch.rand(2, 3).npu().half()
+        normWeight = torch.rand(3).npu().half()
+        normBias = torch.rand(3).npu().half()
 
-        result = torch.zeros(2, 3).npu()
-        operation.execute("AddNormOperation", json.dumps(
-            {"layerNormEps": 1e-12}), [a, b, normWeight, normBias], [result])
+        results = operation.execute("AddNormOperation", json.dumps(  # outputs come back as a sequence
+            {"layerNormEps": 1e-12}), [a, b, normWeight, normBias])
 
         layer_norm = torch.nn.LayerNorm([3]).npu()
         layer_norm.load_state_dict({"weight": normWeight, "bias": normBias})
         golden_result = layer_norm(a + b)
 
-        print("result:" + str(result))
+        print("result:" + str(results[0]))  # the single output is at index 0
         print("golden_result:" + str(golden_result))
 
         self.assertTrue(torch.allclose(
-            result, golden_result, rtol=0.02, atol=0.02))
+            results[0], golden_result, rtol=0.02, atol=0.02))  # NOTE(review): golden LayerNorm is built with default dtype; confirm it matches fp16
 
 
 if __name__ == '__main__':
diff --git a/examples/torch/ops/add_norm/test_add_norm_operation_torch.py b/examples/torch/operation/test_add_norm_operation_data.py
similarity index 100%
rename from examples/torch/ops/add_norm/test_add_norm_operation_torch.py
rename to examples/torch/operation/test_add_norm_operation_data.py
diff --git a/examples/torch/ops/add/test_add_operation_torch.py b/examples/torch/operation/test_add_operation.py
similarity index 72%
rename from examples/torch/ops/add/test_add_operation_torch.py
rename to examples/torch/operation/test_add_operation.py
index ab4e82b8..18a2bf20 100644
--- a/examples/torch/ops/add/test_add_operation_torch.py
+++ b/examples/torch/operation/test_add_operation.py
@@ -30,18 +30,20 @@ torch.classes.load_library(LIB_PATH)
 
 class TestNormal(unittest.TestCase):
     def test_2d(self):
-        operation = torch.classes.AddOperationTorch.AddOperationTorch()
+        operation = torch.classes.OperationTorch.OperationTorch()  # one wrapper class; the op is named in execute()
         operation.test()
-        a = torch.rand(2, 3).npu()
-        b = torch.rand(2, 3).npu()
-        print("a:" + str(a))
-        print("b:" + str(b))
-        c = operation.execute(a, b)
-        golden_c = a + b
-        print("c:" + str(c))
-        print("golden_c:" + str(golden_c))
-
-        self.assertTrue(torch.allclose(c, golden_c, rtol=0.02, atol=0.02))
+        a = torch.rand(2, 3).npu().half()  # fp16 operands moved with .npu()
+        b = torch.rand(2, 3).npu().half()
+        # execute(operation name, JSON parameter string, input tensors) returns the outputs as an indexable sequence.
+        results = operation.execute("AddOperation", "{\"scale\": 1}", [a, b])
+        # Reference value: plain tensor addition of the same operands.
+        golden_result = a + b
+
+        print("results:", results[0])
+        print("golden_result:", str(golden_result))
+
+        self.assertTrue(torch.allclose(
+            results[0], golden_result, rtol=0.02, atol=0.02))
 
 
 if __name__ == '__main__':
diff --git a/examples/torch/ops/operation/test_linear_operation_torch_rand.py b/examples/torch/operation/test_linear_operation.py
similarity index 75%
rename from examples/torch/ops/operation/test_linear_operation_torch_rand.py
rename to examples/torch/operation/test_linear_operation.py
index bfb76290..cb0ffbb1 100644
--- a/examples/torch/ops/operation/test_linear_operation_torch_rand.py
+++ b/examples/torch/operation/test_linear_operation.py
@@ -36,20 +36,13 @@ class TestNormal(unittest.TestCase):
         b = torch.rand(1024, 1024).npu().half()
         c = torch.rand(1024).npu().half()
 
-        if len(a.size()) == 3:
-            result = torch.zeros(a.size()[0], a.size()[
-                                 1], b.size()[0]).npu().half()
-        else:
-            result = torch.zeros(
-                {a.size()[0], b.size()[0]}, a.options()).npu().half()
-        print(result.size())
-        operation.execute("LinearOperation", '{"transposeA":false, "transposeB":true}', [
-                          a, b, c], [result])
+        results = operation.execute("LinearOperation", '{"transposeA":false, "transposeB":true}', [
+            a, b, c])
 
         golden_result = torch.matmul(a, torch.transpose(b, 0, 1)) + c
 
         self.assertTrue(torch.allclose(
-            result, golden_result, rtol=0.02, atol=0.02))
+            results[0], golden_result, rtol=0.02, atol=0.02))
 
 
 if __name__ == '__main__':
diff --git a/examples/torch/ops/linear/test_linear_operation_torch.py b/examples/torch/operation/test_linear_operation_data.py
similarity index 100%
rename from examples/torch/ops/linear/test_linear_operation_torch.py
rename to examples/torch/operation/test_linear_operation_data.py
diff --git a/examples/torch/ops/add/add_operation_torch.cpp b/examples/torch/ops/add/add_operation_torch.cpp
deleted file mode 100644
index f82b25fe..00000000
--- a/examples/torch/ops/add/add_operation_torch.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "add_operation_torch.h"
-#include <asdops/utils/log/log.h>
-#include "acltransformer/ops/add_operation.h"
-#include "examples/utils/example_utils.h"
-
-AddOperationTorch::AddOperationTorch()
-{
-    ASD_LOG(INFO) << "AddOperationTorch::AddOperationTorch";
-    AclTransformer::AddParam param;
-    operation_ = new AclTransformer::AddOperation(param);
-}
-
-AddOperationTorch::~AddOperationTorch()
-{
-    if (operation_) {
-        delete operation_;
-        operation_ = nullptr;
-    }
-}
-
-void AddOperationTorch::Test() { ASD_LOG(INFO) << "AddOperationTorch::Test called"; }
-
-torch::Tensor AddOperationTorch::Execute(torch::Tensor a, torch::Tensor b)
-{
-    ASD_LOG(INFO) << "AddOperationTorch::Execute start";
-    torch::Tensor resultTensor = at::zeros(a.sizes(), a.options());
-    ExecuteOperation(operation_, {&a, &b}, {&resultTensor});
-    ASD_LOG(INFO) << "AddOperationTorch::Execute end";
-    return resultTensor;
-}
-
-TORCH_LIBRARY(AddOperationTorch, m)
-{
-    m.class_<AddOperationTorch>("AddOperationTorch")
-        .def(torch::init<>())
-        .def("test", &AddOperationTorch::Test)
-        .def("execute", &AddOperationTorch::Execute);
-}
\ No newline at end of file
diff --git a/examples/torch/ops/add/add_operation_torch.h b/examples/torch/ops/add/add_operation_torch.h
deleted file mode 100644
index 2bef193a..00000000
--- a/examples/torch/ops/add/add_operation_torch.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef ADD_OPERATION_TORCH_H
-#define ADD_OPERATION_TORCH_H
-#include <torch/script.h>
-#include <torch/custom_class.h>
-
-namespace AclTransformer {
-class AddOperation;
-}
-
-class AddOperationTorch : public torch::CustomClassHolder {
-public:
-    AddOperationTorch();
-    ~AddOperationTorch();
-    void Test();
-    torch::Tensor Execute(torch::Tensor a, torch::Tensor b);
-    c10::intrusive_ptr<AddOperationTorch> clone() const { return c10::make_intrusive<AddOperationTorch>(); }
-
-private:
-    AclTransformer::AddOperation *operation_ = nullptr;
-};
-
-#endif
\ No newline at end of file
diff --git a/examples/torch/ops/add_norm/add_norm_operation_torch.cpp b/examples/torch/ops/add_norm/add_norm_operation_torch.cpp
deleted file mode 100644
index 358f7f21..00000000
--- a/examples/torch/ops/add_norm/add_norm_operation_torch.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "add_norm_operation_torch.h"
-#include <asdops/utils/log/log.h>
-#include "acltransformer/ops/add_norm_operation.h"
-#include "examples/utils/example_utils.h"
-
-AddNormOperationTorch::AddNormOperationTorch()
-{
-    ASD_LOG(INFO) << "AddNormOperationTorch::AddNormOperationTorch";
-    AclTransformer::AddNormParam param;
-    operation_ = new AclTransformer::AddNormOperation(param);
-}
-
-AddNormOperationTorch::~AddNormOperationTorch()
-{
-    if (operation_) {
-        delete operation_;
-        operation_ = nullptr;
-    }
-}
-
-void AddNormOperationTorch::Test() { ASD_LOG(INFO) << "AddNormOperationTorch::Test called"; }
-
-torch::Tensor AddNormOperationTorch::Execute(torch::Tensor a, torch::Tensor b, torch::Tensor normWeight,
-                                             torch::Tensor normBias)
-{
-    a = a.contiguous();
-    b = b.contiguous();
-    normWeight = normWeight.contiguous();
-    normBias = normBias.contiguous();
-    ASD_LOG(INFO) << "AddNormOperationTorch::Execute start, a.device.type:" << a.device().type();
-    torch::Tensor resultTensor = at::zeros(a.sizes(), a.options()).contiguous();
-    ExecuteOperation(operation_, {&a, &b, &normWeight, &normBias}, {&resultTensor});
-    ASD_LOG(INFO) << "AddNormOperationTorch::Execute end";
-    return resultTensor;
-    // at::Tensor addResultTensor = at::add(a, b);
-    // return at::layer_norm(addResultTensor, normWeight.sizes(), normWeight, normBias, 1e-12);
-}
-
-TORCH_LIBRARY(AddNormOperationTorch, m)
-{
-    m.class_<AddNormOperationTorch>("AddNormOperationTorch")
-        .def(torch::init<>())
-        .def("test", &AddNormOperationTorch::Test)
-        .def("execute", &AddNormOperationTorch::Execute);
-}
\ No newline at end of file
diff --git a/examples/torch/ops/add_norm/add_norm_operation_torch.h b/examples/torch/ops/add_norm/add_norm_operation_torch.h
deleted file mode 100644
index a3ab19bc..00000000
--- a/examples/torch/ops/add_norm/add_norm_operation_torch.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef ADD_NORM_OPERATION_TORCH_H
-#define ADD_NORM_OPERATION_TORCH_H
-#include <torch/script.h>
-#include <torch/custom_class.h>
-
-namespace AclTransformer {
-class AddNormOperation;
-}
-
-class AddNormOperationTorch : public torch::CustomClassHolder {
-public:
-    AddNormOperationTorch();
-    ~AddNormOperationTorch();
-    void Test();
-    torch::Tensor Execute(torch::Tensor a, torch::Tensor b, torch::Tensor normWeight, torch::Tensor normBias);
-    c10::intrusive_ptr<AddNormOperationTorch> clone() const { return c10::make_intrusive<AddNormOperationTorch>(); }
-
-private:
-    AclTransformer::AddNormOperation *operation_ = nullptr;
-};
-
-#endif
\ No newline at end of file
diff --git a/examples/torch/ops/add_norm/test_add_norm_operation_rand.py b/examples/torch/ops/add_norm/test_add_norm_operation_rand.py
deleted file mode 100644
index 283c0b48..00000000
--- a/examples/torch/ops/add_norm/test_add_norm_operation_rand.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import unittest
-import os
-import json
-import torch
-import torch_npu
-
-
-ACLTRANSFORMER_HOME_PATH = os.environ.get("ACLTRANSFORMER_HOME_PATH")
-if ACLTRANSFORMER_HOME_PATH is None:
-    raise RuntimeError(
-        "env ACLTRANSFORMER_HOME_PATH not exist, source set_env.sh")
-
-LIB_PATH = os.path.join(ACLTRANSFORMER_HOME_PATH,
-                        "examples/libacltransformer_torch.so")
-torch.classes.load_library(LIB_PATH)
-
-
-class TestAddNormal(unittest.TestCase):
-    def test_2d(self):
-        operation = torch.classes.AddNormOperationTorch.AddNormOperationTorch()
-        operation.test()
-        a = torch.rand(2, 3).npu()
-        b = torch.rand(2, 3).npu()
-        normWeight = torch.rand(3).npu()
-        normBias = torch.rand(3).npu()
-        print("a:" + str(a))
-        print("b:" + str(b))
-        print("normWeight:" + str(normWeight))
-        print("normBias:" + str(normBias))
-        c = operation.execute(a, b, normWeight, normBias)
-        layer_norm = torch.nn.LayerNorm([3]).npu()
-        layer_norm.load_state_dict({"weight": normWeight, "bias": normBias})
-
-        golden_c = layer_norm(a + b)
-        print("c:" + str(c))
-        print("golden_c:" + str(golden_c))
-
-        self.assertTrue(torch.allclose(c, golden_c, rtol=0.02, atol=0.02))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/examples/torch/ops/ffn/ffn_operation_torch.cpp b/examples/torch/ops/ffn/ffn_operation_torch.cpp
deleted file mode 100644
index 94d184fb..00000000
--- a/examples/torch/ops/ffn/ffn_operation_torch.cpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "ffn_operation_torch.h"
-#include <asdops/utils/log/log.h>
-#include "acltransformer/ops/ffn_operation.h"
-#include "examples/utils/example_utils.h"
-
-FfnOperationTorch::FfnOperationTorch()
-{
-    AclTransformer::FfnParam param;
-    operation_ = new AclTransformer::FfnOperation(param);
-    ASD_LOG(INFO) << "FfnOperationTorch::FfnOperationTorch";
-}
-
-FfnOperationTorch::~FfnOperationTorch()
-{
-    if (operation_) {
-        delete operation_;
-        operation_ = nullptr;
-    }
-}
-
-void FfnOperationTorch::Test() { ASD_LOG(INFO) << "FfnOperationTorch::Test called"; }
-
-torch::Tensor FfnOperationTorch::Execute(torch::Tensor a, torch::Tensor b, torch::Tensor c)
-{
-    a = a.contiguous();
-    b = b.contiguous();
-    c = c.contiguous();
-    torch::Tensor resultTensor;
-    if (a.sizes().size() == 3) {
-        resultTensor = at::empty({a.sizes()[0], a.sizes()[1], b.sizes()[0]}, a.options()); // to do  shape
-    } else {
-        resultTensor = at::empty({a.sizes()[0], b.sizes()[0]}, a.options());
-    }
-    resultTensor = resultTensor.contiguous();
-
-    // at::Tensor outputTensor = at::linear(a, b, c);
-    // d = at::gelu(outputTensor);
-
-    ExecuteOperation(operation_, {&a, &b, &c}, {&resultTensor});
-    ASD_LOG(INFO) << "FfnOperationTorch::Execute end";
-    return resultTensor;
-}
-
-TORCH_LIBRARY(FfnOperationTorch, m)
-{
-    m.class_<FfnOperationTorch>("FfnOperationTorch")
-        .def(torch::init<>())
-        .def("test", &FfnOperationTorch::Test)
-        .def("execute", &FfnOperationTorch::Execute);
-}
\ No newline at end of file
diff --git a/examples/torch/ops/ffn/ffn_operation_torch.h b/examples/torch/ops/ffn/ffn_operation_torch.h
deleted file mode 100644
index 3cd05dae..00000000
--- a/examples/torch/ops/ffn/ffn_operation_torch.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef FFN_OPERATION_TORCH_H
-#define FFN_OPERATION_TORCH_H
-#include <torch/script.h>
-#include <torch/custom_class.h>
-#include <acltransformer/ops/ffn_operation.h>
-
-class FfnOperationTorch : public torch::CustomClassHolder {
-public:
-    FfnOperationTorch();
-    ~FfnOperationTorch();
-    void Test();
-    torch::Tensor Execute(torch::Tensor a, torch::Tensor b, torch::Tensor c);
-    c10::intrusive_ptr<FfnOperationTorch> clone() const { return c10::make_intrusive<FfnOperationTorch>(); }
-
-private:
-    AclTransformer::FfnOperation *operation_ = nullptr;
-};
-
-#endif
\ No newline at end of file
diff --git a/examples/torch/ops/ffn/test_ffn_operation_torch.py b/examples/torch/ops/ffn/test_ffn_operation_torch.py
deleted file mode 100644
index 9355f055..00000000
--- a/examples/torch/ops/ffn/test_ffn_operation_torch.py
+++ /dev/null
@@ -1,83 +0,0 @@
-# Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import torch_npu
-import torch
-import json
-import unittest
-
-import sys
-import os
-
-sys.path.append(os.path.abspath(
-    os.path.join(os.path.dirname(__file__), "../..")))
-from tensor_testcase import TensorTestCase
-
-ACLTRANSFORMER_HOME_PATH = os.environ.get("ACLTRANSFORMER_HOME_PATH")
-if ACLTRANSFORMER_HOME_PATH is None:
-    raise RuntimeError(
-        "env ACLTRANSFORMER_HOME_PATH not exist, source set_env.sh")
-
-LIB_PATH = os.path.join(ACLTRANSFORMER_HOME_PATH,
-                        "examples/libacltransformer_torch.so")
-torch.classes.load_library(LIB_PATH)
-
-
-# class TestNormal(unittest.TestCase):
-#     def test_2d(self):
-#         operation = torch.classes.FfnOperationTorch.FfnOperationTorch()
-#         operation.test()
-#         a = torch.rand(5, 5).npu()
-#         b = torch.rand(5, 5).npu()
-#         c = torch.rand(5, 5).npu()
-#         print("a:" + str(a))
-#         print("b:" + str(b))
-#         print("c:" + str(c))
-#         d = operation.execute(a, b, c)
-#         golden_d = torch.nn.functional.gelu(torch.matmul(a, torch.transpose(b, 0, 1)) + c)
-#         print("d:" + str(d))
-#         print("golden_d:" + str(golden_d))
-
-#         self.assertTrue(torch.allclose(d, golden_d, rtol=0.02, atol=0.02))
-
-
-class TestBert(unittest.TestCase):
-    def test_2d(self):
-        operation = torch.classes.FfnOperationTorch.FfnOperationTorch()
-        operation.test()
-        testcase = TensorTestCase('LinearActivation', in_tensor_num=3)
-        for i in range(1, 101):
-            testcase.read(i)
-            in_tensors = testcase.get_in_tensors()
-            out_tensors = testcase.get_out_tensors()
-            a = in_tensors[0].npu()
-            b = in_tensors[1].npu()
-            c = in_tensors[2].npu()
-            print(a.size())
-            print(b.size())
-            print(c.size())
-            golden_d = out_tensors[0].npu()
-            d = operation.execute(a, b, c)
-            # d = torch.nn.functional.gelu(torch.nn.functional.linear(a, b, c))
-            print("d:" + str(d))
-            print("golden_d:" + str(golden_d))
-            print("d:" + str(d.size()))
-            print("golden_d:" + str(golden_d.size()))
-
-            self.assertTrue(torch.allclose(d, golden_d, rtol=0.02, atol=0.02))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/examples/torch/ops/linear/linear_operation_torch.cpp b/examples/torch/ops/linear/linear_operation_torch.cpp
deleted file mode 100644
index 12874ae1..00000000
--- a/examples/torch/ops/linear/linear_operation_torch.cpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "linear_operation_torch.h"
-#include <torch/torch.h>
-#include <asdops/utils/log/log.h>
-#include "acltransformer/ops/linear_operation.h"
-#include "examples/utils/example_utils.h"
-#include <json/json.h>
-#include "acltransformer/utils/tensor_cache.h"
-
-LinearOperationTorch::LinearOperationTorch(std::string param) : param_(param)
-{
-    ASD_LOG(INFO) << "LinearOperationTorch::LinearOperationTorch";
-    Json::Reader paramReader;
-    Json::Value paramJson;
-    if (!paramReader.parse(param, paramJson)) {
-        ASD_LOG(ERROR) << "json parse error";
-    }
-    AclTransformer::LinearParam linearParam;
-    linearParam.transposeA = paramJson["transposeA"].asBool();
-    linearParam.transposeB = paramJson["transposeB"].asBool();
-    operation_ = new AclTransformer::LinearOperation(linearParam);
-}
-
-LinearOperationTorch::~LinearOperationTorch()
-{
-    if (operation_) {
-        delete operation_;
-        operation_ = nullptr;
-    }
-}
-
-void LinearOperationTorch::Test() { ASD_LOG(INFO) << "LinearOperationTorch::Test called"; }
-
-torch::Tensor LinearOperationTorch::Execute(torch::Tensor a, torch::Tensor b, torch::Tensor c)
-{
-    a = a.contiguous();
-    b = b.contiguous();
-    c = c.contiguous();
-    ASD_LOG(INFO) << "LinearOperationTorch::Execute start";
-    ASD_LOG(INFO) << "LinearOperationTorch inTensors[a].options:" << a.options() << ", data:" << a.data_ptr();
-    ASD_LOG(INFO) << "LinearOperationTorch inTensors[b].options:" << b.options() << ", data:" << b.data_ptr();
-    ASD_LOG(INFO) << "LinearOperationTorch inTensors[c].options:" << c.options() << ", data:" << c.data_ptr();
-
-    torch::save(b.to(at::Device(at::kCPU)).contiguous(), "b.pth");
-    ASD_LOG(INFO) << "LinearOperationTorch save b.pth";
-    torch::Tensor resultTensor;
-    if (a.sizes().size() == 3) {
-        resultTensor = at::zeros({a.sizes()[0], a.sizes()[1], b.sizes()[0]}, a.options()).contiguous();
-    } else {
-        resultTensor = at::zeros({a.sizes()[0], b.sizes()[0]}, a.options()).contiguous();
-    }
-    ExecuteOperation(operation_, {&a, &b, &c}, {&resultTensor});
-    ASD_LOG(INFO) << "LinearOperationTorch::Execute end";
-    return resultTensor;
-
-    // return at::linear(a, b, c);
-}
-
-TORCH_LIBRARY(LinearOperationTorch, m)
-{
-    m.class_<LinearOperationTorch>("LinearOperationTorch")
-        .def(torch::init<std::string>())
-        .def("test", &LinearOperationTorch::Test)
-        .def("execute", &LinearOperationTorch::Execute);
-}
\ No newline at end of file
diff --git a/examples/torch/ops/linear/linear_operation_torch.h b/examples/torch/ops/linear/linear_operation_torch.h
deleted file mode 100644
index f66d8543..00000000
--- a/examples/torch/ops/linear/linear_operation_torch.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef LINEAR_OPERATION_TORCH_H
-#define LINEAR_OPERATION_TORCH_H
-#include <torch/script.h>
-#include <torch/custom_class.h>
-#include <acltransformer/ops/linear_operation.h>
-
-class LinearOperationTorch : public torch::CustomClassHolder {
-public:
-    LinearOperationTorch(std::string param);
-    ~LinearOperationTorch();
-    void Test();
-    torch::Tensor Execute(torch::Tensor a, torch::Tensor b, torch::Tensor c);
-    c10::intrusive_ptr<LinearOperationTorch> clone() const { return c10::make_intrusive<LinearOperationTorch>(param_); }
-
-private:
-    AclTransformer::LinearOperation *operation_ = nullptr;
-    std::string param_;
-};
-
-#endif
\ No newline at end of file
diff --git a/examples/torch/ops/linear/test_linear_operation_torch_rand.py b/examples/torch/ops/linear/test_linear_operation_torch_rand.py
deleted file mode 100644
index 992aed30..00000000
--- a/examples/torch/ops/linear/test_linear_operation_torch_rand.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import unittest
-import os
-import json
-import torch
-import torch_npu
-
-
-ACLTRANSFORMER_HOME_PATH = os.environ.get("ACLTRANSFORMER_HOME_PATH")
-if ACLTRANSFORMER_HOME_PATH is None:
-    raise RuntimeError(
-        "env ACLTRANSFORMER_HOME_PATH not exist, source set_env.sh")
-
-LIB_PATH = os.path.join(ACLTRANSFORMER_HOME_PATH,
-                        "examples/libacltransformer_torch.so")
-torch.classes.load_library(LIB_PATH)
-
-
-class TestNormal(unittest.TestCase):
-    def test_2d(self):
-        param = '{"transposeA":false,"transposeB":true}'
-        operation = torch.classes.LinearOperationTorch.LinearOperationTorch(param)
-        operation.test()
-        a = torch.rand(384, 32, 1024).npu()
-        b = torch.rand(1024, 1024).npu()
-        c = torch.rand(1024).npu()
-
-        d = operation.execute(a, b, c)
-
-        golden_d = torch.matmul(a, torch.transpose(b, 0, 1)) + c
-
-        print("d:" + str(d.size()))
-        print("golden_d:" + str(golden_d.size()))
-
-        self.assertTrue(torch.allclose(d, golden_d, rtol=0.02, atol=0.02))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/examples/torch/ops/self_attention/self_attention_operation_torch.cpp b/examples/torch/ops/self_attention/self_attention_operation_torch.cpp
deleted file mode 100644
index 56d4aec3..00000000
--- a/examples/torch/ops/self_attention/self_attention_operation_torch.cpp
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "self_attention_operation_torch.h"
-#include <asdops/utils/log/log.h>
-#include "acltransformer/ops/self_attention_operation.h"
-#include "acltransformer/utils/tensor_util.h"
-#include "examples/utils/example_utils.h"
-#include <json/json.h>
-
-SelfAttentionOperationTorch::SelfAttentionOperationTorch(std::string param) : param_(param)
-{
-    ASD_LOG(INFO) << "SelfAttentionOperationTorch::SelfAttentionOperationTorch";
-    Json::Reader paramReader;
-    Json::Value paramJson;
-    if (!paramReader.parse(param, paramJson)) {
-        ASD_LOG(ERROR) << "json parse error";
-    }
-    AclTransformer::SelfAttentionParam selfAttentionParam;
-    selfAttentionParam.transKey = paramJson["transKey"].asBool();
-    selfAttentionParam.dk = paramJson["dk"].asInt();
-    selfAttentionParam.headNum = paramJson["headNum"].asInt();
-    this->selfAttentionParam_ = selfAttentionParam;
-    operation_ = new AclTransformer::SelfAttentionOperation(selfAttentionParam);
-}
-
-SelfAttentionOperationTorch::~SelfAttentionOperationTorch()
-{
-    if (operation_) {
-        delete operation_;
-        operation_ = nullptr;
-    }
-}
-
-void SelfAttentionOperationTorch::Test() { ASD_LOG(INFO) << "SelfAttentionOperationTorch::Test called"; }
-
-torch::Tensor SelfAttentionOperationTorch::Execute(torch::Tensor query, torch::Tensor key, torch::Tensor value,
-                                                   torch::Tensor attentionMask)
-{
-    query = query.contiguous();
-    key = key.contiguous();
-    value = value.contiguous();
-    attentionMask = attentionMask.contiguous();
-    torch::Tensor resultTensor = torch::zeros(query.sizes(), query.options()).contiguous();
-    ExecuteOperation(operation_, {&query, &key, &value, &attentionMask}, {&resultTensor});
-    ASD_LOG(INFO) << "SelfAttentionOperationTorch::Execute end";
-    return resultTensor;
-}
-
-TORCH_LIBRARY(SelfAttentionOperationTorch, m)
-{
-    m.class_<SelfAttentionOperationTorch>("SelfAttentionOperationTorch")
-        .def(torch::init<std::string>())
-        .def("test", &SelfAttentionOperationTorch::Test)
-        .def("execute", &SelfAttentionOperationTorch::Execute);
-}
\ No newline at end of file
diff --git a/examples/torch/ops/self_attention/self_attention_operation_torch.h b/examples/torch/ops/self_attention/self_attention_operation_torch.h
deleted file mode 100644
index ed7ad744..00000000
--- a/examples/torch/ops/self_attention/self_attention_operation_torch.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef SELF_ATTENTION_OPERATION_TORCH_H
-#define SELF_ATTENTION_OPERATION_TORCH_H
-#include <torch/script.h>
-#include <torch/custom_class.h>
-#include "acltransformer/ops/self_attention_operation.h"
-
-namespace AclTransformer {
-class SelfAttentionOperation;
-}
-
-class SelfAttentionOperationTorch : public torch::CustomClassHolder {
-public:
-    SelfAttentionOperationTorch(std::string param);
-    ~SelfAttentionOperationTorch();
-    void Test();
-    torch::Tensor Execute(torch::Tensor aquery, torch::Tensor key, torch::Tensor value, torch::Tensor attentionMask);
-    c10::intrusive_ptr<SelfAttentionOperationTorch> clone() const { return c10::make_intrusive<SelfAttentionOperationTorch>(param_); }
-
-private:
-    AclTransformer::SelfAttentionOperation *operation_ = nullptr;
-    std::string param_;
-    AclTransformer::SelfAttentionParam selfAttentionParam_;
-};
-
-#endif
\ No newline at end of file
diff --git a/examples/torch/ops/self_attention/test_self_attention_operation_torch.py b/examples/torch/ops/self_attention/test_self_attention_operation_torch.py
deleted file mode 100644
index a6bfcb40..00000000
--- a/examples/torch/ops/self_attention/test_self_attention_operation_torch.py
+++ /dev/null
@@ -1,75 +0,0 @@
-# Copyright(C) 2023. Huawei Technologies Co.,Ltd. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import unittest
-import os
-import json
-import torch
-import torch_npu
-import sys
-sys.path.append('../..')
-from tensor_testcase import TensorTestCase
-
-ACLTRANSFORMER_HOME_PATH = os.environ.get("ACLTRANSFORMER_HOME_PATH")
-if ACLTRANSFORMER_HOME_PATH is None:
-    raise RuntimeError(
-        "env ACLTRANSFORMER_HOME_PATH not exist, source set_env.sh")
-
-LIB_PATH = os.path.join(ACLTRANSFORMER_HOME_PATH,
-                        "examples/libacltransformer_torch.so")
-torch.classes.load_library(LIB_PATH)
-
-
-# class TestNormal(unittest.TestCase):
-#     def test_2d(self):
-#         param = '{"transKey":true,"dk":64,"headNum":16}'
-#         operation = torch.classes.SelfAttentionOperationTorch.SelfAttentionOperationTorch(param)
-#         operation.test()
-#         query = torch.rand(32, 384, 1024).npu()
-#         key = torch.rand(32, 384, 1024).npu()
-#         value = torch.rand(32, 384, 1024).npu()
-#         mask = torch.rand(32, 1, 1, 384).npu()
-#         result = operation.execute(query, key, value, mask)
-#         print("result:" + str(result))
-
-
-class TestBert(unittest.TestCase):
-    def test_2d(self):
-        param = '{"transKey":false,"dk":64,"headNum":16}'
-        operation = torch.classes.SelfAttentionOperationTorch.SelfAttentionOperationTorch(param)
-        operation.test()
-        testcase = TensorTestCase('BertSelfAttention', in_tensor_num=7, out_tensor_num=6)
-        testcase.read(1)
-        in_tensors = testcase.get_in_tensors()
-        out_tensors = testcase.get_out_tensors()
-        query = in_tensors[4].npu()
-        key = in_tensors[5].npu()
-        value = in_tensors[6].npu()
-        mask = in_tensors[3].npu()
-        print(query.size())
-        print(key.size())
-        print(value.size())
-        print(mask.size())
-        d = operation.execute(query, key, value, mask)
-        # d = d.view(32, 384, 1024)
-        golden_d = out_tensors[0].npu()
-        print("d:" + str(d.size()))
-        print("golden_d:" + str(golden_d.size()))
-        print("d:" + str(d))
-        print("golden_d:" + str(golden_d))
-
-        self.assertTrue(torch.allclose(d, golden_d, rtol=0.02, atol=0.02))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/tests/unittest/ops/add/test_add.cpp b/tests/unittest/ops/add/test_add.cpp
index 278931ca..f44931c7 100644
--- a/tests/unittest/ops/add/test_add.cpp
+++ b/tests/unittest/ops/add/test_add.cpp
@@ -23,9 +23,9 @@ TEST(TestAddOperation, InferShape)
 {
     AclTransformer::AddParam param;
     AclTransformer::AddOperation op(param);
-    std::vector<AsdOps::TensorDesc> inTensorDescs = {{AsdOps::TENSOR_DTYPE_FLOAT, AsdOps::TENSOR_FORMAT_ND, {1, 2}},
+    AsdOps::SVector<AsdOps::TensorDesc> inTensorDescs = {{AsdOps::TENSOR_DTYPE_FLOAT, AsdOps::TENSOR_FORMAT_ND, {1, 2}},
                                                      {AsdOps::TENSOR_DTYPE_FLOAT, AsdOps::TENSOR_FORMAT_ND, {1, 2}}};
-    std::vector<AsdOps::TensorDesc> outTensorDescs;
+    AsdOps::SVector<AsdOps::TensorDesc> outTensorDescs;
     op.InferShape(inTensorDescs, outTensorDescs);
     ASSERT_EQ(outTensorDescs.size(), 1);
     EXPECT_EQ(outTensorDescs.at(0).dtype, AsdOps::TENSOR_DTYPE_FLOAT);
-- 
Gitee