diff --git a/tf_adapter/kernels/amct_ascend_anti_quant.cc b/tf_adapter/kernels/amct_ascend_anti_quant.cc new file mode 100644 index 0000000000000000000000000000000000000000..a731b9d76597c49506172ccd55cff2cd21ea4202 --- /dev/null +++ b/tf_adapter/kernels/amct_ascend_anti_quant.cc @@ -0,0 +1,46 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/op_kernel.h" + +using namespace tensorflow; + +template +class AscendAntiQuantOp : public OpKernel { +public: + explicit AscendAntiQuantOp(OpKernelConstruction* context) : OpKernel(context){} + + ~AscendAntiQuantOp(){} + + void Compute(OpKernelContext* context) override{} +}; + +REGISTER_KERNEL_BUILDER( + Name("AscendAntiQuant").Device(tensorflow::DEVICE_CPU).TypeConstraint("T"), + AscendAntiQuantOp); diff --git a/tf_adapter/kernels/amct_ascend_dequant.cc b/tf_adapter/kernels/amct_ascend_dequant.cc new file mode 100644 index 0000000000000000000000000000000000000000..f9713593181a467059faa8181fff90190c569707 --- /dev/null +++ b/tf_adapter/kernels/amct_ascend_dequant.cc @@ -0,0 +1,46 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/op_kernel.h" + +using namespace tensorflow; + +template +class AscendDequantOp : public OpKernel { +public: + explicit AscendDequantOp(OpKernelConstruction* context) : OpKernel(context){} + + ~AscendDequantOp(){} + + void Compute(OpKernelContext* context) override{} +}; + +REGISTER_KERNEL_BUILDER( + Name("AscendDequant").Device(tensorflow::DEVICE_CPU).TypeConstraint("T"), + AscendDequantOp); diff --git a/tf_adapter/kernels/amct_ascend_quant.cc b/tf_adapter/kernels/amct_ascend_quant.cc new file mode 100644 index 0000000000000000000000000000000000000000..95ea17b63b5eca97b3715961949dd93fbce35b5d --- /dev/null +++ b/tf_adapter/kernels/amct_ascend_quant.cc @@ -0,0 +1,46 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/op_kernel.h" + +using namespace tensorflow; + +template +class AscendQuantOp : public OpKernel { +public: + explicit AscendQuantOp(OpKernelConstruction* context) : OpKernel(context){} + + ~AscendQuantOp(){} + + void Compute(OpKernelContext* context) override {} +}; + +REGISTER_KERNEL_BUILDER( + Name("AscendQuant").Device(tensorflow::DEVICE_CPU).TypeConstraint("T"), + AscendQuantOp); diff --git a/tf_adapter/kernels/amct_ascend_weight_quant.cc b/tf_adapter/kernels/amct_ascend_weight_quant.cc new file mode 100644 index 0000000000000000000000000000000000000000..f81533e5091c05705bc8c891fcb40ea674c5595d --- /dev/null +++ b/tf_adapter/kernels/amct_ascend_weight_quant.cc @@ -0,0 +1,46 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/op_kernel.h" + +using namespace tensorflow; + +template +class AscendWeightQuantOp : public OpKernel { +public: + explicit AscendWeightQuantOp(OpKernelConstruction* context) : OpKernel(context){} + + ~AscendWeightQuantOp(){} + + void Compute(OpKernelContext* context) override{} +}; + +REGISTER_KERNEL_BUILDER( + Name("AscendWeightQuant").Device(tensorflow::DEVICE_CPU).TypeConstraint("T"), + AscendWeightQuantOp); diff --git a/tf_adapter/kernels/decode_image_ops.cc b/tf_adapter/kernels/decode_image_ops.cc index 57e998be269c00dfe70903b94ef62e9475dc51a3..b7d033e36d136f097090fcf30c2ee577267a09e4 100644 --- a/tf_adapter/kernels/decode_image_ops.cc +++ b/tf_adapter/kernels/decode_image_ops.cc @@ -25,7 +25,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "ExternalSoftDp.h" +#include "soft_dp/ExternalSoftDp.h" #include "tensorflow/core/framework/op_kernel.h" #include "tf_adapter/util/plugin_load_manager.h" #include diff --git a/tf_adapter/kernels/geop_dataset_op.cc b/tf_adapter/kernels/geop_dataset_op.cc index 5092362493e9b64009cf89b126bcdd4e5ff2f9ff..f1c06ef6eebb83e330d4e7ae27bfc071759f96e8 100644 --- a/tf_adapter/kernels/geop_dataset_op.cc +++ b/tf_adapter/kernels/geop_dataset_op.cc @@ -34,7 +34,10 @@ namespace data { namespace { class GEOPDatasetOp : public DatasetOpKernel { public: - explicit GEOPDatasetOp(OpKernelConstruction *ctx) : DatasetOpKernel(ctx), f_handle_(kInvalidHandle) { + explicit GEOPDatasetOp(OpKernelConstruction *ctx) + : DatasetOpKernel(ctx), + f_handle_(kInvalidHandle), + lib_(nullptr) { FunctionMetadata::Params params; OP_REQUIRES_OK(ctx, FunctionMetadata::Create(ctx, "f", params, &func_metadata_)); } @@ -84,6 +87,9 @@ class GEOPDatasetOp : public DatasetOpKernel { string DebugString() const override { return "GEOPDatasetOp::Dataset"; } + GEOPDatasetOp *op_kernel_; + std::string tf_session_; + protected: Status AsGraphDefInternal(SerializationContext *ctx, DatasetGraphDefBuilder *b, Node **output) const override { return Status::OK(); @@ -168,8 +174,6 @@ class GEOPDatasetOp : public DatasetOpKernel { private: mutex mu_; }; - GEOPDatasetOp *op_kernel_; - std::string tf_session_; }; std::shared_ptr func_metadata_ = nullptr; FunctionLibraryRuntime::Handle f_handle_; diff --git a/tf_adapter/kernels/geop_npu.cc b/tf_adapter/kernels/geop_npu.cc index 17ac5fef214e9479c5efd836b066cd36266a1814..a1218c02341c251ef16d0cb839bc6cbd17b8a609 100644 --- a/tf_adapter/kernels/geop_npu.cc +++ b/tf_adapter/kernels/geop_npu.cc @@ -34,6 +34,7 @@ limitations under the License. #include #include #include +#include #include #include #include @@ -57,9 +58,11 @@ limitations under the License. 
#include "framework/common/ge_inner_error_codes.h" #include "framework/common/types.h" +#include "framework/omg/omg_inner_types.h" #include "framework/omg/parser/model_parser.h" #include "framework/omg/parser/parser_api.h" #include "framework/omg/parser/parser_factory.h" +#include "framework/omg/parser/parser_inner_ctx.h" #include "ge/ge_api.h" #include "ge/ge_api_types.h" #include "tdt/tdt_host_interface.h" @@ -116,8 +119,8 @@ Status BuildOutputTensorInfo(OpKernelContext *ctx, std::vector(static_cast(dst_ptr) + SECUREC_MEM_MAX_LEN); src_ptr = static_cast(static_cast(src_ptr) + SECUREC_MEM_MAX_LEN); } + REQUIRES_NOT_NULL(dst_ptr); + REQUIRES_NOT_NULL(src_ptr); auto err = memcpy_s(dst_ptr, left_size, src_ptr, left_size); if (err != EOK) { LOG(ERROR) << "[GEOP] Outputs mem copy failed, index:" << i << ", errret:" << err - << ", dst_ptr:" << (int64_t) dst_ptr << ", dst_size:" << left_size - << ", src_ptr:" << (int64_t) src_ptr << ", src_size:" << left_size; + << ", dst_ptr:" << (uintptr_t)dst_ptr << ", dst_size:" << left_size + << ", src_ptr:" << (uintptr_t)src_ptr << ", src_size:" << left_size; return errors::InvalidArgument("Outputs mem copy failed, index:", i); } } @@ -160,9 +165,10 @@ const int kMaxCacheNum = 10; const int kFatalSleepTime = 3000; GeOp::GeOp(OpKernelConstruction *ctx) - : AsyncOpKernel(ctx), init_flag_(false), build_flag_(false), shape_flag_(false), add_graph_flag_(false), - sess_init_flag_(false), compute_graph_empty_(false), data_format_(""), graph_id_(0), cache_graph_id_(1), - is_initialized_graph_(false), need_iteration_(false), tf_session_(""), ge_session_(nullptr), job_type_("") { + : AsyncOpKernel(ctx), init_flag_(false), build_flag_(false), add_graph_flag_(false), + sess_init_flag_(false), compute_graph_empty_(false), data_format_(""), graph_id_(0), + is_initialized_graph_(false), need_iteration_(false), tf_session_(""), ge_session_(nullptr), + job_type_(""), is_host_graph_(false), is_train_graph_(false) { Initialize(ctx); } @@ -192,7 +198,16 @@ void GeOp::Initialize(OpKernelConstruction *ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("_NpuOptimizer", &sess_config)); std::map init_options = NpuAttrs::GetInitOptions(ctx); std::map pass_options = NpuAttrs::GetPassOptions(ctx); + iteration_per_loop_ = std::atoi(pass_options["iterations_per_loop"].c_str()); job_type_ = pass_options["job"]; + mstune_mode_ = init_options["ge.buildMode"]; + work_path_ = init_options["ge.tuningPath"]; + if (!mstune_mode_.empty() && !work_path_.empty()) { + handle_ = mmDlopen("libmstune_train.so", MMPA_RTLD_NOW); + OP_REQUIRES(ctx, handle_ != nullptr, errors::InvalidArgument("libmstune_train.so dlopen failed, ", mmDlerror())); + tuning_api_ = (MsTuningFunc)mmDlsym(handle_, const_cast("MsGradientTuning")); + OP_REQUIRES(ctx, tuning_api_ != nullptr, errors::InvalidArgument("dlsym MsGradientTuning API failed, ", mmDlerror())); + } if (GePlugin::GetInstance()->IsGlobal()) { LOG(INFO) << "[GEOP] GePlugin global, skip GePlugin init"; } else { @@ -227,15 +242,17 @@ void GeOp::Finalize() { if (!SessionManager::GetInstance().IsGeSessionExist()) { if (!GePlugin::GetInstance()->IsGlobal()) { - Status save_ret = GenerateReport::GetInstance()->SaveUnsupportedInfo(); - if (save_ret != Status::OK()) { - LOG(WARNING) << "[GEOP] Save check report failed."; - } GePlugin::GetInstance()->Finalize(); LOG(INFO) << "[GEOP] GePlugin Finalize success"; } else { LOG(INFO) << "[GEOP] GePlugin global, skip GePlugin Finalize"; } + if (!GenerateReport::GetInstance()->SaveUnsupportedInfo().ok()) { + LOG(WARNING) << "[GEOP] Save 
check report failed."; + } + if (!mstune_mode_.empty() && !work_path_.empty()) { + (void)mmDlclose(handle_); + } } } } @@ -244,9 +261,9 @@ void GeOp::Finalize() { return; } -int GeOp::InitRebuildFlag() { +int GeOp::InitRebuildFlag(uint32_t cache_graph_id) { if (!build_flag_) { - LOG(INFO) << "[GEOP] tf session " << tf_session_ << ", graph id: " << cache_graph_id_ + LOG(INFO) << "[GEOP] tf session " << tf_session_ << ", graph id: " << cache_graph_id << " does not build yet, no need to check rebuild"; return 0; } @@ -254,18 +271,18 @@ int GeOp::InitRebuildFlag() { LOG(ERROR) << "[GEOP] GE session is nullptr"; return -1; } - if (!ge_session_->IsGraphNeedRebuild(cache_graph_id_)) { - LOG(INFO) << "[GEOP] tf session " << tf_session_ << ", graph id: " << cache_graph_id_ << " no need to rebuild"; + if (!ge_session_->IsGraphNeedRebuild(cache_graph_id)) { + LOG(INFO) << "[GEOP] tf session " << tf_session_ << ", graph id: " << cache_graph_id << " no need to rebuild"; return 0; } - LOG(INFO) << "[GEOP] The graph need rebuild, graph id " << cache_graph_id_; + LOG(INFO) << "[GEOP] The graph need rebuild, graph id " << cache_graph_id; // The graph need to rebuild, remove it from GE first. - LOG(INFO) << "[GEOP] tf session: " << tf_session_ << ", graph id: " << cache_graph_id_; - auto ret = ge_session_->RemoveGraph(cache_graph_id_); + LOG(INFO) << "[GEOP] tf session: " << tf_session_ << ", graph id: " << cache_graph_id; + auto ret = ge_session_->RemoveGraph(cache_graph_id); if (ret != ge::SUCCESS) { - LOG(ERROR) << "[GEOP] Failed to remove graph " << cache_graph_id_ << " from ge, error code " << ret; + LOG(ERROR) << "[GEOP] Failed to remove graph " << cache_graph_id << " from ge, error code " << ret; return -1; } @@ -316,46 +333,39 @@ void GeOp::ClearGraphIdCount(std::string &tf_session) { if (it != session_and_graph_id_map_.end()) { session_and_graph_id_map_.erase(it); } } -void GeOp::CacheShapeChangeGraphs() { +void GeOp::GetExecGraphId(OpKernelContext *ctx, uint32_t &cache_graph_id, + std::vector input_shapes) { size_t num = cache_graphs_.size(); - std::pair, uint32_t>::iterator, bool> ret; - uint32_t tmp_graph_id = 0; - if (num >= kMaxCacheNum) { - LOG(INFO) << "[GEOP] the cache vector size is : " << num << " , begin erase the least uesed"; - std::sort(graph_counts_.begin(), graph_counts_.end(), CmpValue); - uint32_t erased_graph_id = cache_graphs_[graph_counts_[0].first]; - cache_graphs_.erase(graph_counts_[0].first); - graph_counts_.erase(graph_counts_.begin()); - ge::Status status = ge_session_->RemoveGraph(erased_graph_id); - if (status != ge::SUCCESS) { LOG(WARNING) << "[GEOP] GE Remove Graph failed, ret : " << ToString(status); } - ret = cache_graphs_.insert(std::make_pair(inputs_shape_string_, erased_graph_id)); - tmp_graph_id = erased_graph_id; - } else { - ret = cache_graphs_.insert(std::make_pair(inputs_shape_string_, graph_id_ + num)); - tmp_graph_id = graph_id_ + num; - } - if (ret.second) { - build_flag_ = false; - compute_graph_empty_ = false; - graph_counts_.push_back(std::make_pair(inputs_shape_string_, 1)); - cache_graph_id_ = tmp_graph_id; - } else { + if (cache_graphs_.find(input_shapes) != cache_graphs_.end()) { for (auto &graph_count : graph_counts_) { - if (graph_count.first == inputs_shape_string_) { + if (graph_count.first == input_shapes) { graph_count.second += 1; break; } } - cache_graph_id_ = cache_graphs_[inputs_shape_string_]; + cache_graph_id = cache_graphs_[input_shapes]; build_flag_ = true; - shape_flag_ = false; + } else { + if (num >= kMaxCacheNum) { + LOG(INFO) << 
"[GEOP] the cache vector size is : " << num << " , begin erase the least uesed"; + std::sort(graph_counts_.begin(), graph_counts_.end(), CmpValue); + uint32_t erased_graph_id = cache_graphs_[graph_counts_[0].first]; + cache_graphs_.erase(graph_counts_[0].first); + graph_counts_.erase(graph_counts_.begin()); + ge::Status status = ge_session_->RemoveGraph(erased_graph_id); + if (status != ge::SUCCESS) { LOG(WARNING) << "[GEOP] GE Remove Graph failed, ret : " << ToString(status); } + cache_graph_id = erased_graph_id; + } else { + cache_graph_id = graph_id_ + num; + } + build_flag_ = false; + compute_graph_empty_ = false; } } void GeOp::ComputeAsync(OpKernelContext *ctx, DoneCallback done) { // ctx is not nullptr OP_REQUIRES_ASYNC(ctx, init_flag_, errors::InvalidArgument("GeOp not Initialize success."), done); - // ge ge session if (!sess_init_flag_) { if (job_type_ != "localhost") { // in ps mode : ctx->session_handle() is empty @@ -371,7 +381,6 @@ void GeOp::ComputeAsync(OpKernelContext *ctx, DoneCallback done) { { mutex_lock lock{mu_}; bool res = IncrementGraphIdCount(tf_session_, graph_id_); - cache_graph_id_ = graph_id_; if (!res || graph_id_ < kInvalidGraphId) { OP_REQUIRES_ASYNC(ctx, false, errors::Unavailable("Get ge session failed."), done); return; @@ -393,36 +402,32 @@ void GeOp::ComputeAsync(OpKernelContext *ctx, DoneCallback done) { LOG(INFO) << "[GEOP] Begin GeOp::ComputeAsync" << ", kernel_name:" << geop_name << ", num_inputs:" << num_inputs << ", num_outputs:" << ctx->num_outputs(); int64 startTime = InferShapeUtil::GetCurrentTimestap(); + std::vector input_shapes; + for (int i = 0; i < ctx->num_inputs(); i++) { + input_shapes.push_back(ctx->input(i).shape().DebugString()); + } - if (!build_flag_) { - // record input shape - inputs_shape_string_.clear(); - for (uint32_t i = 0; i < num_inputs; i++) { - Tensor input(ctx->input(i)); - inputs_shape_string_.push_back(input.shape().DebugString()); - } - cache_graphs_.insert(std::make_pair(inputs_shape_string_, cache_graph_id_)); - graph_counts_.push_back(std::make_pair(inputs_shape_string_, 1)); - } else if (inputs_shape_string_.size() == num_inputs) { - for (uint32_t i = 0; i < num_inputs; i++) { - if (inputs_shape_string_.at(i) == ctx->input(i).shape().DebugString()) { - continue; - } else { - // input_shape change, build GEOP for one more time - inputs_shape_string_.at(i) = ctx->input(i).shape().DebugString(); - shape_flag_ = true; - } + // if input shapes changed, cache graphs + uint32_t cache_graph_id; + bool is_set_dynamic_config = !sess_options_["ge.inputShape"].empty() && !sess_options_["ge.dynamicDims"].empty(); + bool is_tuning = !mstune_mode_.empty() && !work_path_.empty(); + if (is_set_dynamic_config && is_tuning) { + LOG(FATAL) << "dynamic input config can not use with mstuning."; + } else if (is_set_dynamic_config && !is_tuning) { + cache_graph_id = graph_id_; + if (InitRebuildFlag(cache_graph_id) != 0) { + OP_REQUIRES_ASYNC(ctx, false, errors::Internal("Failed to check rebuild flag"), done); + return; } - if (shape_flag_) { CacheShapeChangeGraphs(); } + } else if (!is_set_dynamic_config && is_tuning) { + cache_graph_id = graph_id_; } else { - build_flag_ = false; - compute_graph_empty_ = false; - } - - auto ret = InitRebuildFlag(); - if (ret != 0) { - OP_REQUIRES_ASYNC(ctx, false, errors::Unavailable("Failed to check rebuild flag"), done); - return; + // if set dynamic input config, do not cache graphs. 
+ GetExecGraphId(ctx, cache_graph_id, input_shapes); + if (InitRebuildFlag(cache_graph_id) != 0) { + OP_REQUIRES_ASYNC(ctx, false, errors::Internal("Failed to check rebuild flag"), done); + return; + } } if (!build_flag_) { @@ -464,7 +469,7 @@ void GeOp::ComputeAsync(OpKernelContext *ctx, DoneCallback done) { LOG(INFO) << "[GEOP] In GEOP computeAsync, kernel_name:" << geop_name << " ,TFadapter cost time: [" << ((endTime - startTime) / kMicrosToMillis) << " ms]"; LOG(INFO) << "[GEOP] TFadpter process graph success, GE parser begin, kernel_name:" << geop_name - << " ,tf session: " << tf_session_ << " ,graph id :" << cache_graph_id_; + << " ,tf session: " << tf_session_ << " ,graph id :" << cache_graph_id; // parser, tensorflow graph to ge graph std::shared_ptr model_parser = domi::ModelParserFactory::Instance()->CreateModelParser(domi::FrameworkType::TENSORFLOW); @@ -486,8 +491,8 @@ void GeOp::ComputeAsync(OpKernelContext *ctx, DoneCallback done) { return nullptr; } // get infershape - Graph subgraph(OpRegistry::Global()); - Status status = InferShapeUtil::getSubGraphFromFunctionDef(*func_def, &subgraph); + Graph subgraph(flib_def); + Status status = InferShapeUtil::GetSubGraphFromFunctionDef(*flib_def, *func_def, &subgraph); if (status != Status::OK()) { LOG(ERROR) << "[GEOP] Get subgraph from functiondef fail."; return nullptr; @@ -512,9 +517,6 @@ void GeOp::ComputeAsync(OpKernelContext *ctx, DoneCallback done) { } subgraph.ToGraphDef(sub_graph_def.get()); - // change function op to subgraph type - ChangeFunctionOpToSubgraph(*sub_graph_def.get(), *flib_def); - unique_ptr graph_def_out(std::move(sub_graph_def)); char *need_print = getenv("PRINT_MODEL"); @@ -531,9 +533,10 @@ void GeOp::ComputeAsync(OpKernelContext *ctx, DoneCallback done) { reinterpret_cast(&ori_graph_def), build_sub_graph, compute_graph); OP_REQUIRES_ASYNC(ctx, status == ge::SUCCESS, errors::Internal("graph parse failed, domi_ret : ", ToString(status)), done); + domi::GetContext().format = ge::GetParserContext().format; LOG(INFO) << "[GEOP] Tensorflow graph parse to ge graph success, kernel_name:" << geop_name - << " ,tf session: " << tf_session_ << " ,graph id: " << cache_graph_id_; + << " ,tf session: " << tf_session_ << " ,graph id: " << cache_graph_id; size_t nodes = compute_graph->GetAllNodesSize(); if (nodes == 0) { @@ -542,41 +545,86 @@ void GeOp::ComputeAsync(OpKernelContext *ctx, DoneCallback done) { int64 endTime = InferShapeUtil::GetCurrentTimestap(); LOG(INFO) << "[GEOP] End GeOp::ComputeAsync, compute_graph is empty, kernel_name:" << geop_name << ", ret_status:" << ToString(ge::SUCCESS) << " , tf session: " << tf_session_ - << " ,graph id: " << cache_graph_id_ << " [" << ((endTime - startTime) / kMicrosToMillis) << " ms]"; + << " ,graph id: " << cache_graph_id << " [" << ((endTime - startTime) / kMicrosToMillis) << " ms]"; done(); return; } // convert to ge::graph ge::Graph ge_graph = ge::GraphUtils::CreateGraphFromComputeGraph(compute_graph); - ge_graph.SetNeedIteration(this->need_iteration_); + if (iteration_per_loop_ > 1) { + ge_graph.SetNeedIteration(this->need_iteration_); + } + + if (is_host_graph_) { + LOG(INFO) << "[GEOP] set graph option."; + graph_options_["ge.exec.placement"] = "HOST"; + } + if (is_tuning) { + if (!is_train_graph_) { + LOG(INFO) << "[GEOP] in tune mode, nontraining graphs should be cache."; + OP_REQUIRES_ASYNC(ctx, SessionManager::GetInstance().CacheGeGraphs(ge_session_, ge_graph), + errors::Internal("[GEOP] cache ge session failed."), done); + build_flag_ = true; + 
BuildOutTensorInfo(ctx); + done(); + return; + } else { + LOG(INFO) << "[GEOP] in tune mode, training graph handled by tools."; + uint32_t device_id = 0; + OP_REQUIRES_OK_ASYNC(ctx, GetEnvDeviceID(device_id), done); + std::map tune_options = {{"work_path", work_path_}, + {"job_type", mstune_mode_}, + {"devices", std::to_string(device_id)}}; + std::vector ge_graphs; + OP_REQUIRES_ASYNC(ctx, SessionManager::GetInstance().GetGeGraphs(ge_session_, ge_graphs), + errors::Internal("[GEOP] get ge session nontraining graphs failed."), done); + MsTuneStatus tune_ret = (*tuning_api_)(ge_graph, ge_graphs, ge_session_, tune_options); + OP_REQUIRES_ASYNC(ctx, tune_ret == MSTUNE_SUCCESS, errors::Internal("[GEOP] exec msTuning func failed."), done); + LOG(INFO) << "[GEOP] msTuning success."; + build_flag_ = true; + BuildOutTensorInfo(ctx); + done(); + return; + } + } // call ge session addGraph api - status = ge_session_->AddGraph(cache_graph_id_, ge_graph); + status = ge_session_->AddGraph(cache_graph_id, ge_graph, graph_options_); if (status != ge::SUCCESS) { std::this_thread::sleep_for(std::chrono::milliseconds(kFatalSleepTime)); LOG(FATAL) << "[GEOP] call ge session add graph failed, kernel: " << geop_name << " ,tf session: " << tf_session_ - << ", graph id: " << cache_graph_id_; + << ", graph id: " << cache_graph_id; OP_REQUIRES_ASYNC(ctx, status == ge::SUCCESS, errors::Unavailable("[GEOP] GE session add graph failed, domi_ret : ", ToString(status)), done); } else { add_graph_flag_ = true; LOG(INFO) << "[GEOP] Add graph to ge session success, kernel_name:" << geop_name - << " ,tf session: " << tf_session_ << " ,graph id:" << cache_graph_id_; + << " ,tf session: " << tf_session_ << " ,graph id:" << cache_graph_id; } - build_flag_ = true; + if (!is_set_dynamic_config) { + cache_graphs_.insert(std::make_pair(input_shapes, cache_graph_id)); + graph_counts_.push_back(std::make_pair(input_shapes, 1)); + } } else { if (compute_graph_empty_) { int64 endTime = InferShapeUtil::GetCurrentTimestap(); LOG(INFO) << "[GEOP] End GeOp::ComputeAsync, compute_graph is empty, kernel_name:" << geop_name << ", ret_status:" << ToString(ge::SUCCESS) << " , tf session: " << tf_session_ - << " ,graph id: " << cache_graph_id_ << " [" << ((endTime - startTime) / kMicrosToMillis) << " ms]"; + << " ,graph id: " << cache_graph_id << " [" << ((endTime - startTime) / kMicrosToMillis) << " ms]"; done(); return; } } + if (is_tuning) { + LOG(INFO) << "in mstune mode, the graph only executes once, the remaining steps return directly."; + BuildOutTensorInfo(ctx); + done(); + return; + } + int64 run_start_time = InferShapeUtil::GetCurrentTimestap(); auto callback = [done, ctx, run_start_time](ge::Status ge_status, std::vector &outputs) { if (ge_status == ge::SUCCESS) { @@ -601,36 +649,24 @@ void GeOp::ComputeAsync(OpKernelContext *ctx, DoneCallback done) { OP_REQUIRES_OK_ASYNC(ctx, (BuildInputTensorInfo(ctx, inputs)), done); LOG(INFO) << "[GEOP] Call ge session RunGraphAsync, kernel_name:" << geop_name << " ,tf session: " << tf_session_ - << " ,graph id: " << cache_graph_id_; + << " ,graph id: " << cache_graph_id; // call ge session runGraphAsync api - ge::Status status = ge_session_->RunGraphAsync(cache_graph_id_, inputs, callback); + ge::Status status = ge_session_->RunGraphAsync(cache_graph_id, inputs, callback); if (status != ge::SUCCESS) { std::this_thread::sleep_for(std::chrono::milliseconds(kFatalSleepTime)); LOG(FATAL) << "[GEOP] call ge session RunGraphAsync Failed, kernel:" << geop_name << " ,tf session: " << tf_session_ - << " ,graph 
id: " << cache_graph_id_; + << " ,graph id: " << cache_graph_id; } OP_REQUIRES_ASYNC(ctx, status == ge::SUCCESS, errors::Unavailable("ge session run graph failed, ret_status:", ToString(status)), done); int64 endTime = InferShapeUtil::GetCurrentTimestap(); LOG(INFO) << "[GEOP] End GeOp::ComputeAsync, kernel_name:" << geop_name << ", ret_status:" << ToString(status) - << " ,tf session: " << tf_session_ << " ,graph id: " << cache_graph_id_ << " [" + << " ,tf session: " << tf_session_ << " ,graph id: " << cache_graph_id << " [" << ((endTime - startTime) / kMicrosToMillis) << " ms]"; return; } -void GeOp::ChangeFunctionOpToSubgraph(GraphDef &sub_graph_def, const FunctionLibraryDefinition &flib_def) { - std::vector function_names = flib_def.ListFunctionNames(); - for (NodeDef &node_def : *sub_graph_def.mutable_node()) { - for (string func_name : function_names) { - if (node_def.op() == func_name) { - node_def.set_op(SubGraph); - LOG(INFO) << "Node " << node_def.name() << " change op type from " << func_name << " to " << SubGraph; - } - } - } -} - void GeOp::AddNodeAttrs(Node *node, bool &is_initialize) { // Add dp custom kernel label if (node->type_string() == "IteratorGetNext") { node->AddAttr("_kernel", "dp"); } @@ -643,6 +679,13 @@ void GeOp::AddNodeAttrs(Node *node, bool &is_initialize) { this->need_iteration_ = true; LOG(INFO) << "subgraph has iteration op."; } + if (node->name() == "var_in_host") { + is_host_graph_ = true; + LOG(INFO) << "[GeOp] variable subgraph is initialized in host."; + } + if (node->name().find("_Allreduce") != string::npos) { + is_train_graph_ = true; + } // clear device info && attr node_def.set_device(""); if (node_def.op() == "Const") { @@ -660,10 +703,31 @@ void GeOp::BuildGraphDef(OpKernelContext *ctx, DoneCallback done, const Function Graph graph(OpRegistry::Global()); OP_REQUIRES_OK_ASYNC(ctx, InferShapeUtil::InferShape(input_vec, &flib_def, &func_def, &graph), done); + bool is_tuning = !mstune_mode_.empty() && !work_path_.empty(); for (Node *node : graph.nodes()) { AddNodeAttrs(node, is_initialize); // Add Input&Output Desc into NodeDef OP_REQUIRES_OK_ASYNC(ctx, this->GenerateDesc(node), done); + if (is_tuning) { + // output handle + NodeDef &node_def = const_cast(node->def()); + if (node->type_string() == "_Retval") { + int index = node_def.attr().at("index").i(); + // format: AttrValue.list(ListValue).func(repeated NameAttrList) + NameAttrList desc_attr = node_def.attr().at(INPUT_DESC).list().func(0); + + std::vector dims; + int dim_num = desc_attr.attr().at(SERIALIZE_SHAPE).list().i_size(); + for (int t = 0; t < dim_num; t++) { + int64 dim_i = (int64_t) desc_attr.attr().at(SERIALIZE_SHAPE).list().i(t); + if (dim_i < 0) { dim_i = 1; } + dims.push_back(dim_i); + } + + TensorShape out_shape(dims); + outputs_shape_.insert(std::map::value_type(index, out_shape)); + } + } } graph.ToGraphDef(&graph_def); @@ -700,6 +764,17 @@ Status GeOp::BuildInputTensorInfo(OpKernelContext *ctx, std::vectornum_outputs(); + // populate outputs + for (int i = 0; i < num_outputs; i++) { + TensorShape out_shape = outputs_shape_.at(i); + Tensor *tensor = nullptr; + TF_RETURN_IF_ERROR(ctx->allocate_output(i, out_shape, &tensor)); + } + return Status::OK(); +} + // For each NodeDef, Create Input&Output Desc(shape,format,dataType) Status GeOp::GenerateDesc(Node *&node) { REQUIRES_NOT_NULL(node); diff --git a/tf_adapter/kernels/geop_npu.h b/tf_adapter/kernels/geop_npu.h index 6cfb6f339558d52152db2f4a56d8b01bdcb2e041..f205c7be7eda795f972d61b82d9999574c349825 100644 --- 
a/tf_adapter/kernels/geop_npu.h +++ b/tf_adapter/kernels/geop_npu.h @@ -38,9 +38,12 @@ limitations under the License. #include "ge/ge_api_types.h" #include "graph/tensor.h" #include "graph/utils/graph_utils.h" +#include "toolchain/tuning_tool/tune_api.h" #include namespace tensorflow { +using MsTuningFunc = MsTuneStatus (*)(ge::Graph &, std::vector &, ge::Session *, + std::map &); class GeOp : public AsyncOpKernel { public: explicit GeOp(OpKernelConstruction *ctx); @@ -60,12 +63,12 @@ class GeOp : public AsyncOpKernel { const FunctionDef &func_def, const std::vector &input_vec, GraphDef &graph_def, bool &is_initialize); - // Find and change op type to SubGraph - void ChangeFunctionOpToSubgraph(GraphDef &graph_def, const FunctionLibraryDefinition &flib_def); - // prepare input tensor Status BuildInputTensorInfo(OpKernelContext *ctx, std::vector &inputs); + // prepare output tensor + Status BuildOutTensorInfo(OpKernelContext *ctx); + // create input and output desc for NodeDef Status GenerateDesc(Node *&node); @@ -74,7 +77,7 @@ class GeOp : public AsyncOpKernel { private: void AddNodeAttrs(Node *node, bool &is_initialize); - int InitRebuildFlag(); + int InitRebuildFlag(uint32_t cache_graph_id); bool IncrementGraphIdCount(std::string &tf_session, uint32_t &graph_id); @@ -82,7 +85,8 @@ class GeOp : public AsyncOpKernel { void ClearGraphIdCount(std::string &tf_session); - void CacheShapeChangeGraphs(); + void GetExecGraphId(OpKernelContext *ctx, uint32_t &cache_graph_id, + std::vector input_shapes); private: static const std::string INPUT_DESC; @@ -96,7 +100,6 @@ class GeOp : public AsyncOpKernel { bool init_flag_; bool build_flag_; - bool shape_flag_; bool add_graph_flag_; bool sess_init_flag_; bool compute_graph_empty_; @@ -104,17 +107,24 @@ class GeOp : public AsyncOpKernel { NameAttrList function_; std::string data_format_; uint32_t graph_id_; - uint32_t cache_graph_id_; bool is_initialized_graph_; bool need_iteration_; std::string tf_session_; ge::Session *ge_session_; std::string job_type_; - std::vector inputs_shape_string_; std::map, uint32_t> cache_graphs_; std::vector, uint32_t>> graph_counts_; std::map sess_options_; static std::unordered_map session_and_graph_id_map_; + uint32_t iteration_per_loop_; + bool is_host_graph_; + std::map graph_options_; + string work_path_; + string mstune_mode_; + std::map outputs_shape_; + bool is_train_graph_; + void *handle_; + MsTuningFunc tuning_api_; }; } // namespace tensorflow #endif // TENSORFLOW_KERNELS_GEOP_NPU_H_ diff --git a/tf_adapter/kernels/hccl_ops.cc b/tf_adapter/kernels/hccl_ops.cc index ee95b4a5517fdbb754ae32a0ddd8298d2aedfa42..30f8d9dff862edb91584d11984442ba4c38db5a2 100644 --- a/tf_adapter/kernels/hccl_ops.cc +++ b/tf_adapter/kernels/hccl_ops.cc @@ -81,4 +81,28 @@ class HcomReceiveOpKernel : public OpKernel { }; REGISTER_KERNEL_BUILDER(Name("HcomReceive").Device(DEVICE_CPU), HcomReceiveOpKernel); -} // namespace tensorflow + +class HcomRemoteReadOpKernel : public OpKernel { +public: + explicit HcomRemoteReadOpKernel(OpKernelConstruction* context) : OpKernel(context) {} + ~HcomRemoteReadOpKernel() {} + void Compute(OpKernelContext* context) override + { + LOG(INFO) << "HcomRemoteReadOpKernel Compute."; + } +}; + +REGISTER_KERNEL_BUILDER(Name("HcomRemoteRead").Device(DEVICE_CPU), HcomRemoteReadOpKernel); + +class HcomRemoteWriteKernel : public OpKernel { +public: + explicit HcomRemoteWriteKernel(OpKernelConstruction* context) : OpKernel(context) {} + ~HcomRemoteWriteKernel() {} + void Compute(OpKernelContext* context) override + { + 
LOG(INFO) << "HcomRemoteWriteKernel Compute."; + } +}; + +REGISTER_KERNEL_BUILDER(Name("HcomRemoteWrite").Device(DEVICE_CPU), HcomRemoteWriteKernel); +} // namespace tensorflow diff --git a/tf_adapter/kernels/host_queue_dataset_op.cc b/tf_adapter/kernels/host_queue_dataset_op.cc index b7d51c3331a6575579af5fba5a47c0b44c49fc89..798b59024ac4c72a3a478635c949848639b9fc5b 100644 --- a/tf_adapter/kernels/host_queue_dataset_op.cc +++ b/tf_adapter/kernels/host_queue_dataset_op.cc @@ -37,6 +37,7 @@ limitations under the License. #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/util/env_var.h" #include "tf_adapter/common/common.h" +#include "tf_adapter/util/npu_attrs.h" #include #include #include @@ -49,7 +50,6 @@ namespace { using namespace std; using namespace tdt; -const static int kMaxDeviceId = 7; const static uint32_t kMaxValue = 128; // total memory usage controlled below 2G const uint64_t kTotalBytes = 2147483648; @@ -67,27 +67,16 @@ class HostQueueDatasetOp : public DatasetOpKernel { OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_)); LOG(INFO) << "Start to init tdt."; - string lib_path = "libdatatransfer.so"; - handle_ = dlopen(lib_path.c_str(), RTLD_NOW); - OP_REQUIRES(ctx, handle_ != nullptr, errors::InvalidArgument("libdatatransfer.so dlopen failed.")); - init_api_ = (InitFunc) dlsym(handle_, "TdtHostInit"); - push_api_ = (PushDataFunc) dlsym(handle_, "TdtHostPushData"); - destroy_api_ = (DestroyFunc) dlsym(handle_, "TdtHostDestroy"); - OP_REQUIRES(ctx, init_api_ != nullptr && push_api_ != nullptr && destroy_api_ != nullptr, - errors::InvalidArgument("dlsym tdt API failed.")); - int64 id = -1; - OP_REQUIRES_OK(ctx, ReadInt64FromEnvVar("DEVICE_ID", -1, &id)); - - OP_REQUIRES(ctx, id >= 0 && id <= kMaxDeviceId, errors::InvalidArgument("device_id should be in [0, 7].")); - uint32_t u_id = (uint32_t) id; - int32_t tdt_status = (*init_api_)(u_id); + uint32_t device_id = 0; + OP_REQUIRES_OK(ctx, GetEnvDeviceID(device_id)); + int32_t tdt_status = TdtHostInit(device_id); OP_REQUIRES(ctx, tdt_status == 0, errors::InvalidArgument("Tdt client init failed.")); tdt_release = false; } ~HostQueueDatasetOp() { LOG(INFO) << "Start to destroy tdt."; if (!tdt_release) { - int32_t tdt_status = (*destroy_api_)(); + int32_t tdt_status = TdtHostDestroy(); if (tdt_status != 0) { LOG(ERROR) << "Tdt client close failed."; } else { @@ -95,12 +84,6 @@ class HostQueueDatasetOp : public DatasetOpKernel { tdt_release = true; } } - if (handle_ != nullptr) { - dlclose(handle_); - LOG(INFO) << "dlclose handle finish."; - } else { - LOG(INFO) << "handle is null."; - } } void MakeDataset(OpKernelContext *ctx, DatasetBase **output) override { std::vector inputs; @@ -110,7 +93,7 @@ class HostQueueDatasetOp : public DatasetOpKernel { OP_REQUIRES_OK(ctx, GetDatasetFromVariantTensor(ctx->input(i), &input)); inputs.push_back(input); } - *output = new (nothrow) Dataset(ctx, this, inputs, channel_name_, output_types_, output_shapes_); + *output = new (nothrow) Dataset(ctx, inputs, channel_name_, output_types_, output_shapes_); OP_REQUIRES(ctx, *output != nullptr, errors::InvalidArgument("Data process host queue dataset op: new dataset failed.")); } @@ -118,11 +101,10 @@ class HostQueueDatasetOp : public DatasetOpKernel { private: class Dataset : public DatasetBase { public: - Dataset(OpKernelContext *ctx, HostQueueDatasetOp *op_kernel, const std::vector &inputs, - const string &channelName, const DataTypeVector 
&outputTypes, - const vector &outputShapes) - : DatasetBase(DatasetContext(ctx)), op_kernel_(op_kernel), inputs_(inputs), channel_name_(channelName), - output_types_(outputTypes), output_shapes_(outputShapes) { + Dataset(OpKernelContext *ctx, const std::vector &inputs, const string &channelName, + const DataTypeVector &outputTypes, const vector &outputShapes) + : DatasetBase(DatasetContext(ctx)), inputs_(inputs), channel_name_(channelName), output_types_(outputTypes), + output_shapes_(outputShapes) { for (const auto &input : inputs_) { input->Ref(); } } @@ -130,8 +112,6 @@ class HostQueueDatasetOp : public DatasetOpKernel { for (const auto &input : inputs_) { input->Unref(); } } - HostQueueDatasetOp *kernel() const { return op_kernel_; } - unique_ptr MakeIteratorInternal(const string &prefix) const override { return unique_ptr(new (nothrow) Iterator({this, strings::StrCat(prefix, "::HostQueue")})); } @@ -205,7 +185,7 @@ class HostQueueDatasetOp : public DatasetOpKernel { mutex_lock lck(mu_); for (auto &tensor : args) { if (tensor.TotalBytes() > UINT64_MAX - total_bytes_) { - LOG(ERROR) << "The size of tensor is too big"; + LOG(ERROR) << "the size of tensor is too big"; buffer_element.host_thread_finished = true; buffer_.push_back(std::move(buffer_element)); cond_var_.notify_all(); @@ -221,18 +201,20 @@ class HostQueueDatasetOp : public DatasetOpKernel { } void SendDataThread(const std::shared_ptr &ctx) { vector args; - while (!cancelled_) { + while (true) { { mutex_lock lck(mu_); - if (buffer_.empty()) { + while (!cancelled_ && !finish_send_ && buffer_.empty()) { RecordStop(ctx.get()); cond_var_.wait(lck); RecordStart(ctx.get()); } - } - - { - mutex_lock l(mu_); + if (cancelled_ || finish_send_) { + LOG(INFO) << "Host queue " << dataset()->channel_name_ + << " push data thread exit with cancelled: " << cancelled_ << ", finished:" << finish_send_ + << " when wait data."; + return; + } if (buffer_.front().host_thread_finished) { std::vector items; DataItem end_item; @@ -241,16 +223,17 @@ class HostQueueDatasetOp : public DatasetOpKernel { LOG(INFO) << "Push data finish, end_of_sequence_ is true."; } else { end_item.dataType_ = TDT_ABNORMAL; - LOG(ERROR) << "Get data failed."; + LOG(ERROR) << "Get data failed " << buffer_.front().status.ToString(); } items.emplace_back(end_item); - int32_t tdt_status = (*(dataset()->kernel()->push_api_))(dataset()->channel_name_, items); - if (tdt_status != 0) { LOG(ERROR) << "Push the end data to tdt failed."; } + int32_t tdt_status = TdtHostPushData(dataset()->channel_name_, items); + if (tdt_status != 0) { LOG(INFO) << "End training as tdt host push end data failed " << tdt_status; } cancelled_ = true; cond_var_.notify_all(); return; } else { args = buffer_.front().value; + buffer_.pop_front(); } } @@ -269,7 +252,8 @@ class HostQueueDatasetOp : public DatasetOpKernel { std::shared_ptr(const_cast(tensor.tensor_data().data()), [](void *elem) {}); } else if (tensor.dtype() == DT_STRING) { if (tensor.dims() != 0) { - LOG(ERROR) << "Input of DT_STRING type should be scalar, current dims:" + LOG(ERROR) << "input of DT_STRING type should be scalar," + " current dims:" << tensor.dims(); mutex_lock lck(mu_); cancelled_ = true; @@ -280,30 +264,29 @@ class HostQueueDatasetOp : public DatasetOpKernel { data_item.dataLen_ = value.size(); data_item.dataPtr_ = std::shared_ptr(const_cast(value.data()), [](void *elem) {}); } else { - LOG(ERROR) << "Unexpected data type."; + LOG(ERROR) << "Unexpected data type " << DataTypeString(tensor.dtype()); mutex_lock lck(mu_); 
cancelled_ = true; cond_var_.notify_all(); return; } items.push_back(data_item); - // total_bytes is smaller than total_bytes_ total_bytes += tensor.TotalBytes(); } // call tdt interface - int32_t tdt_status = (*(dataset()->kernel()->push_api_))(dataset()->channel_name_, items); - if (tdt_status != 0 || cancelled_ || finish_send_) { + int32_t tdt_status = TdtHostPushData(dataset()->channel_name_, items); + if (tdt_status != 0) { + LOG(INFO) << "End training as tdt host push data failed " << tdt_status; mutex_lock lck(mu_); cancelled_ = true; - LOG(INFO) << "End training."; cond_var_.notify_all(); return; } - mutex_lock lck(mu_); - buffer_.pop_front(); - // total_bytes is smaller than total_bytes_ - total_bytes_ -= total_bytes; - cond_var_.notify_all(); + { + mutex_lock lck(mu_); + total_bytes_ -= total_bytes; + cond_var_.notify_all(); + } } } @@ -383,14 +366,16 @@ class HostQueueDatasetOp : public DatasetOpKernel { condition_variable cond_var_; string prefix_end_; std::deque buffer_ GUARDED_BY(mu_); - std::unique_ptr receive_thread_ GUARDED_BY(mu_); - std::unique_ptr send_thread_ GUARDED_BY(mu_); bool cancelled_ GUARDED_BY(mu_) = false; bool finish_send_ GUARDED_BY(mu_) = false; bool host_thread_finished_ GUARDED_BY(mu_) = false; uint64_t total_bytes_ GUARDED_BY(mu_) = 0; + // The following two threads must be the first members to be destructed, because tensorflow::Thread does not provide + // an explicit join function. If a thread is destructed after other members, such as buffer_, it will access the + // already destructed buffer_ when it joins, resulting in an unknown error. + std::unique_ptr receive_thread_ GUARDED_BY(mu_); + std::unique_ptr send_thread_ GUARDED_BY(mu_); }; - HostQueueDatasetOp *op_kernel_; const std::vector inputs_; std::string channel_name_; const DataTypeVector output_types_; @@ -399,10 +384,6 @@ class HostQueueDatasetOp : public DatasetOpKernel { std::string channel_name_; DataTypeVector output_types_; vector output_shapes_; - void *handle_; - InitFunc init_api_; - PushDataFunc push_api_; - DestroyFunc destroy_api_; }; REGISTER_KERNEL_BUILDER(Name("HostQueueDataset").Device(DEVICE_CPU), HostQueueDatasetOp); diff --git a/tf_adapter/kernels/npu_aicore_ops.cc b/tf_adapter/kernels/npu_aicore_ops.cc new file mode 100644 index 0000000000000000000000000000000000000000..dad767d8e5774760082d609331d4482dc6936bfd --- /dev/null +++ b/tf_adapter/kernels/npu_aicore_ops.cc @@ -0,0 +1,116 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. +Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. foss@huawei.com + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/platform/logging.h" +#include "tf_adapter/common/common.h" + +namespace tensorflow { +template +class FastGeluOp : public tensorflow::OpKernel { + public: + explicit FastGeluOp(tensorflow::OpKernelConstruction *context) + : OpKernel(context) {} + ~FastGeluOp() {} + void Compute(tensorflow::OpKernelContext *context) override { + // Grab the input tensor + CHECK_NOT_NULL(context); + const Tensor &input_tensor = context->input(0); + + // Create an output tensor + Tensor *output_tensor = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor.shape(), + &output_tensor)); + } +}; + +REGISTER_KERNEL_BUILDER( + Name("FastGelu") +. +Device(tensorflow::DEVICE_CPU) +.TypeConstraint("T"), +FastGeluOp); + +REGISTER_KERNEL_BUILDER( + Name("FastGelu") +. +Device(tensorflow::DEVICE_CPU) +.TypeConstraint("T"), +FastGeluOp); + +REGISTER_KERNEL_BUILDER( + Name("FastGelu") +. +Device(tensorflow::DEVICE_CPU) +.TypeConstraint("T"), +FastGeluOp); + +template +class FastGeluGradOp : public tensorflow::OpKernel { + public: + explicit FastGeluGradOp(tensorflow::OpKernelConstruction *context) + : OpKernel(context) {} + ~FastGeluGradOp() {} + void Compute(tensorflow::OpKernelContext *context) override { + // Grab the grad input tensor + CHECK_NOT_NULL(context); + const Tensor &grad_input_tensor = context->input(0); + auto grad_input = grad_input_tensor.flat(); + + // Grab the input tensor + const Tensor &input_tensor = context->input(1); + auto input = input_tensor.flat(); + + OP_REQUIRES( + context, grad_input.size() == input.size(), + errors::InvalidArgument("grad_input size is not equal input size")); + + // Create an output tensor + Tensor *grad_output_tensor = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, grad_input_tensor.shape(), + &grad_output_tensor)); + } +}; + +REGISTER_KERNEL_BUILDER( + Name("FastGeluGrad") +. +Device(tensorflow::DEVICE_CPU) +.TypeConstraint("T"), +FastGeluGradOp); + +REGISTER_KERNEL_BUILDER( + Name("FastGeluGrad") +. +Device(tensorflow::DEVICE_CPU) +.TypeConstraint("T"), +FastGeluGradOp); + +REGISTER_KERNEL_BUILDER( + Name("FastGeluGrad") +. 
+Device(tensorflow::DEVICE_CPU) +.TypeConstraint("T"), +FastGeluGradOp); +} // namespace tensorflow + + diff --git a/tf_adapter/kernels/unique_parallel_ops.cc b/tf_adapter/kernels/unique_parallel_ops.cc index 3bc3601548e5a9c30843cf212b7befbe9c2a6e61..5bb1c395d4f1a2245fa9cebb6ae3df44dbc35583 100755 --- a/tf_adapter/kernels/unique_parallel_ops.cc +++ b/tf_adapter/kernels/unique_parallel_ops.cc @@ -63,7 +63,7 @@ class UniqueParallelOp : public OpKernel { unique_map_vec.resize(cpu_nums); std::function shards = [&](int64 total, int cur) { for (TIndex i = 0; i < total; i++){ - if ((input_vec(i) & (cpu_nums-1)) == cur) { + if ((input_vec(i) & 15) == cur) { if (unique_map_vec[cur].find(input_vec(i)) == unique_map_vec[cur].end()) { unique_map_vec[cur][input_vec(i)] = count_num++; } @@ -88,7 +88,7 @@ class UniqueParallelOp : public OpKernel { } private: void ParallelFor(tensorflow::thread::ThreadPool& thread_work, - int64 total, int cpu_nums, std::function& fn) { + int64 total, const int cpu_nums, std::function& fn) { CHECK_GE(total, 0); CHECK_EQ(total, (int64)(Eigen::Index)total); if (total <= 1 || cpu_nums == 1) { @@ -108,28 +108,21 @@ class UniqueParallelOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("Unique") - .TypeConstraint("T") - .TypeConstraint("out_idx") - .Device(DEVICE_CPU) - .Label("parallel"), - UniqueParallelOp); -REGISTER_KERNEL_BUILDER(Name("Unique") - .TypeConstraint("T") - .TypeConstraint("out_idx") - .Device(DEVICE_CPU) - .Label("parallel"), - UniqueParallelOp); -REGISTER_KERNEL_BUILDER(Name("Unique") - .TypeConstraint("T") - .TypeConstraint("out_idx") - .Device(DEVICE_CPU) - .Label("parallel"), - UniqueParallelOp); -REGISTER_KERNEL_BUILDER(Name("Unique") - .TypeConstraint("T") - .TypeConstraint("out_idx") - .Device(DEVICE_CPU) - .Label("parallel"), - UniqueParallelOp); +/*lint -e665*/ +#define REGISTER_UNIQUE_PARALLEL(type) \ + REGISTER_KERNEL_BUILDER(Name("Unique") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("out_idx") \ + .Label("parallel"), \ + UniqueParallelOp); \ + REGISTER_KERNEL_BUILDER(Name("Unique") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("out_idx") \ + .Label("parallel"), \ + UniqueParallelOp); +TF_CALL_INTEGRAL_TYPES(REGISTER_UNIQUE_PARALLEL); +#undef REGISTER_UNIQUE_PARALLEL +/*lint +e665*/ } \ No newline at end of file diff --git a/tf_adapter/ops/amct_ops.cc b/tf_adapter/ops/amct_ops.cc new file mode 100644 index 0000000000000000000000000000000000000000..31fffb3f8f2c8784e3cd1b5cd712ef8d80248dec --- /dev/null +++ b/tf_adapter/ops/amct_ops.cc @@ -0,0 +1,77 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { +REGISTER_OP("AscendQuant") + .Attr("T: {float16, float32, float64}") + .Attr("quant_bits: int = 8") + .Attr("scale: float") + .Attr("offset: float") + .Input("x: T") + .Output("y: T") + .SetIsStateful() + .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { + c->set_output(0, c->input(0)); + return Status::OK(); }); + +REGISTER_OP("AscendWeightQuant") + .Attr("T: {float16, float32, float64}") + .Input("x: int8") + .Input("offset_w: int8") + .Output("y: T") + .SetIsStateful() + .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { + c->set_output(0, c->input(0)); + return Status::OK(); }); + +REGISTER_OP("AscendDequant") + .Attr("T: {float16, float32, float64}") + .Attr("ksize: list(int)") + .Attr("data_format: string = 'NHWC'") + .Input("x: T") + .Input("deq_scale: uint64") + .Output("y: T") + .SetIsStateful() + .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { + c->set_output(0, c->input(0)); + return Status::OK(); }); + +REGISTER_OP("AscendAntiQuant") + .Attr("T: {float16, float32, float64}") + .Attr("scale: float") + .Attr("offset: float") + .Input("x: T") + .Output("y: T") + .SetIsStateful() + .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { + c->set_output(0, c->input(0)); + return Status::OK(); }); +} // namespace tensorflow diff --git a/tf_adapter/ops/hccl_ops.cc b/tf_adapter/ops/hccl_ops.cc index ae09c65ea48beef17d593ac33d5244f6bf4829b5..fb6d6f0a650b627e0a1717bb7a92931312b8458c 100644 --- a/tf_adapter/ops/hccl_ops.cc +++ b/tf_adapter/ops/hccl_ops.cc @@ -61,7 +61,7 @@ group: all devices of the group participating in this reduction. 
REGISTER_OP("HcomAllGather") .Input("input: T") .Output("output: T") - .Attr("T: {int8, int16, int32, float16, float32}") + .Attr("T: {int8, int16, int32, float16, float32, int64, uint64}") .Attr("group: string") .Attr("rank_size: int") .SetIsStateful() @@ -154,7 +154,7 @@ REGISTER_OP("HcomReduceScatter") REGISTER_OP("HcomSend") .Input("input: T") - .Attr("T: {int8, int16, int32, float16, float32}") + .Attr("T: {int8, int16, int32, float16, float32, int64, uint64}") .Attr("group: string") .Attr("sr_tag: int") .Attr("dest_rank: int") @@ -166,7 +166,7 @@ REGISTER_OP("HcomSend") REGISTER_OP("HcomReceive") .Output("output: T") - .Attr("T: {int8, int16, int32, float16, float32}") + .Attr("T: {int8, int16, int32, float16, float32, int64, uint64}") .Attr("shape: shape") .Attr("group: string") .Attr("sr_tag: int") @@ -176,4 +176,30 @@ REGISTER_OP("HcomReceive") .Doc(R"doc( )doc"); + +REGISTER_OP("HcomRemoteRead") + .Input("remote: T") + .Output("local: dtype") + .Attr("T: {int64, uint64}") + .Attr("dtype: {int8, int16, int32, float16, float32, int64, uint64}") + .SetIsStateful() + .SetShapeFn([](shape_inference::InferenceContext* c) { + c->set_output(0, c->UnknownShape()); // 第一维shape确定,第二维unknown + return Status::OK(); + }) + .Doc(R"doc( + +)doc"); + +REGISTER_OP("HcomRemoteWrite") + .Input("remote: T") + .Input("local: dtype") + .Attr("T: {int64, uint64}") + .Attr("dtype: {int8, int16, int32, float16, float32, int64, uint64}") + .SetIsStateful() + .SetShapeFn(shape_inference::NoOutputs) + .Doc(R"doc( + +)doc"); + } // namespace tensorflow diff --git a/tf_adapter/ops/npu_aicore_ops.cc b/tf_adapter/ops/npu_aicore_ops.cc new file mode 100644 index 0000000000000000000000000000000000000000..2023c2456be391afcbf7519ce509639b6408f8b7 --- /dev/null +++ b/tf_adapter/ops/npu_aicore_ops.cc @@ -0,0 +1,34 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. +Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. foss@huawei.com + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { +REGISTER_OP("FastGelu") + .Input("features: T") + .Output("activations: T") + .Attr("T: realnumbertype") + .SetShapeFn(tensorflow::shape_inference::UnchangedShape); + +REGISTER_OP("FastGeluGrad") + .Input("gradients: T") + .Input("features: T") + .Output("backprops: T") + .Attr("T: realnumbertype") + .SetShapeFn(tensorflow::shape_inference::MergeBothInputsShapeFn); +} // namespace tensorflow diff --git a/tf_adapter/ops/npu_cpu_ops.cc b/tf_adapter/ops/npu_cpu_ops.cc index 42d6d17c43a0c51098f003785aa3e1519e092dcd..82f79a670792f6b7f3c3e3aff9150a69aef55709 100644 --- a/tf_adapter/ops/npu_cpu_ops.cc +++ b/tf_adapter/ops/npu_cpu_ops.cc @@ -36,8 +36,9 @@ using shape_inference::ShapeHandle; REGISTER_OP("EmbeddingRankId") .Input("addr_table: uint64") - .Input("index: uint32") + .Input("index: T") .Output("rank_id: uint64") + .Attr("T: {int64,int32,uint64}") .Attr("row_memory: int = 320") .Attr("mode: string = 'mod' ") .SetAllowsUninitializedInput() diff --git a/tf_adapter/optimizers/om_partition_subgraphs_pass.cc b/tf_adapter/optimizers/om_partition_subgraphs_pass.cc index c013267ef434af7cf2e9f738ebdde6a5127b640c..f07f91d3a76f2ce896f52b72d56eacf35b8a4fdc 100644 --- a/tf_adapter/optimizers/om_partition_subgraphs_pass.cc +++ b/tf_adapter/optimizers/om_partition_subgraphs_pass.cc @@ -47,7 +47,6 @@ limitations under the License. #include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/public/session_options.h" #include "tf_adapter/common/common.h" -#include "tf_adapter/util/generate_report.h" #include "tf_adapter/util/infershape_util.h" #include "tf_adapter/util/npu_attrs.h" #include "tf_adapter/util/npu_ops_identifier.h" @@ -220,15 +219,10 @@ bool IsWhiteListSupport(const string &op_name, bool mix_compile_mode, const stri auto identifier = NpuOpsIdentifier::GetInstance(mix_compile_mode); - bool ans = (identifier->IsNpuSupported(op_name, node_name)) && !EndsWith(op_name, suffix_op) - && !EndsWith(op_name, suffix_op_v2) && !(op_name == "Const") && !(op_name == "_Arg") && !(op_name == "_Retval") - && !(op_name == "StringJoin"); + bool ans = (identifier->IsNpuSupported(op_name, node_name)) && !EndsWith(op_name, suffix_op) && + !EndsWith(op_name, suffix_op_v2) && !(op_name == "Const") && !(op_name == "_Arg") && + !(op_name == "_Retval") && !(op_name == "StringJoin"); if (!ans) { - GenerateReport::Details infos; - static const std::string message = "This op can only excute on host"; - infos.code = GenerateReport::NotSupport; - infos.message = message; - GenerateReport::GetInstance()->AddUnSupportedInfo(node_name, op_name, infos); auto ret = not_support_nodes.insert(op_name); if (ret.second) { LOG(INFO) << "node: " << op_name << " is not in white list, " @@ -265,25 +259,20 @@ Status SetIteratorShardName(Node *node) { return Status::OK(); } -bool IsWithoutNpuScope(Node *node) { +bool IsWithoutNpuScope(const NodeDef &node_def) { if (!compile_mode) { return false; } - bool is_npu_compile = false; - Status status = GetNodeAttr(node->attrs(), ATTR_VALUE_SCOPE_NAME, &is_npu_compile); - if (status.ok() && is_npu_compile) { return true; } + if (node_def.attr().count(ATTR_VALUE_SCOPE_NAME)) { return node_def.attr().at(ATTR_VALUE_SCOPE_NAME).b(); } return false; } -bool IsWithoutNpuScope(NodeDef &node_def) { - if (!compile_mode) { return 
false; } - if (node_def.attr().count(ATTR_VALUE_SCOPE_NAME)) { return node_def.attr().at(ATTR_VALUE_SCOPE_NAME).b(); } - return false; +bool IsWithoutNpuScope(Node *node) { + return IsWithoutNpuScope(node->def()); } // Make sure we don't recurse infinitely on recursive functions. const int kMaxRecursionDepth = 10; bool IsNpuSupportingFunc(const string &func_name, FunctionLibraryDefinition *func_lib, int depth) { - LOG(INFO) << "function name is " << func_name << ", depth is " << depth; if (func_lib == nullptr) { LOG(ERROR) << "func lib is nullptr, function name is " << func_name; return false; @@ -294,13 +283,12 @@ bool IsNpuSupportingFunc(const string &func_name, FunctionLibraryDefinition *fun } const FunctionDef *func_def = func_lib->Find(func_name); if (func_def == nullptr) { - LOG(ERROR) << "func def is nullptr, function name is " << func_name; return false; } for (NodeDef node_def : func_def->node_def()) { if (node_def.op() == "Const") { LOG(INFO) << "Const in func can dump"; - } else if (!IsWhiteListSupport(node_def.op(), compile_mode, node_def.name()) || IsWithoutNpuScope(node_def)) { + } else if (!IsNpuSupportingNode(node_def, compile_mode, func_lib)) { return false; } for (const auto &item : node_def.attr()) { @@ -322,6 +310,17 @@ bool IsNpuSupportingFunc(Node *node, FunctionLibraryDefinition *func_lib, int de return true; } +bool IsNpuSupportingNode(const NodeDef &node_def, bool mix_compile_mode, FunctionLibraryDefinition *func_lib) { + if (IsWithoutNpuScope(node_def)) { return false; } + if (IsWhiteListSupport(node_def.op(), mix_compile_mode, node_def.name())) { return true; } + if (IsNpuSupportingFunc(node_def.op(), func_lib, 0)) { return true; } + return false; +} + +bool IsNpuSupportingNode(Node *node, bool mix_compile_mode, FunctionLibraryDefinition *func_lib) { + return IsNpuSupportingNode(node->def(), mix_compile_mode, func_lib); +} + Status FindNpuSupportCandidates(const Graph &graph, OrderedNodeSet *candidates, FunctionLibraryDefinition *func_lib, bool enableDP, bool mix_compile_mode) { int64 startTime = InferShapeUtil::GetCurrentTimestap(); @@ -364,54 +363,27 @@ Status FindNpuSupportCandidates(const Graph &graph, OrderedNodeSet *candidates, OrderedNodeSet outSet; for (Node *node : sortedNodes) { // 0 is function depth - if (!IsNpuSupportingFunc(node, func_lib, 0)) { - GenerateReport::Details infos; - static const std::string message = "This function node is not supported in npu."; - infos.code = GenerateReport::NotSupport; - infos.message = message; - GenerateReport::GetInstance()->AddUnSupportedInfo(node, infos); - continue; - } + if (!IsNpuSupportingFunc(node, func_lib, 0)) { continue; } if (!node->IsOp()) { // Ship Sink/Source nodes. 
- GenerateReport::Details infos; - static const std::string message = "Sink/Source is not compute node."; - infos.code = GenerateReport::NotSupport; - infos.message = message; - GenerateReport::GetInstance()->AddUnSupportedInfo(node, infos); continue; } if (enableDP && (node->type_string() == "Iterator" || node->type_string() == "IteratorV2" || node->type_string() == "IteratorGetNext")) { - bool is_sink = false; if (node->type_string() == "IteratorGetNext") { for (Node *n : node->in_nodes()) { REQUIRES_NOT_NULL(n); LOG(INFO) << node->name() << " has in nodes " << n->name(); - if (n->type_string() == "Iterator" || n->type_string() == "IteratorV2") { - is_sink = true; - candidates->insert(node); - } + if (n->type_string() == "Iterator" || n->type_string() == "IteratorV2") { candidates->insert(node); } } } if (node->type_string() == "Iterator" || node->type_string() == "IteratorV2") { for (Node *n : node->out_nodes()) { REQUIRES_NOT_NULL(n); LOG(INFO) << node->name() << " has in nodes " << n->name(); - if (n->type_string() == "IteratorGetNext") { - is_sink = true; - candidates->insert(node); - } + if (n->type_string() == "IteratorGetNext") { candidates->insert(node); } } } - if (!is_sink) { - GenerateReport::Details infos; - static const std::string message = - "Only if Iterator/IteratorV2 connect to IteratorGetNext, will them be excuted on npu."; - infos.code = GenerateReport::ScenarioProblems; - infos.message = message; - GenerateReport::GetInstance()->AddUnSupportedInfo(node, infos); - } } else { // Const down when it need down if (node->type_string() == "Const") { @@ -419,31 +391,20 @@ Status FindNpuSupportCandidates(const Graph &graph, OrderedNodeSet *candidates, for (auto edge : node->in_edges()) { REQUIRES_NOT_NULL(edge); REQUIRES_NOT_NULL(edge->src()); - if (edge->IsControlEdge() && edge->src()->name() != "_SOURCE" - && IsWhiteListSupport(edge->src()->type_string(), mix_compile_mode, edge->src()->name()) - && !IsWithoutNpuScope(edge->src())) { + if (edge->IsControlEdge() && edge->src()->name() != "_SOURCE" && + IsNpuSupportingNode(edge->src(), compile_mode, func_lib)) { candidates->insert(node); ctrlEdgeNum++; break; } } - GenerateReport::Details infos; - static const std::string message = "This node is not satisfy the needs of Const excuted on npu."; - infos.code = GenerateReport::ScenarioProblems; - infos.message = message; - GenerateReport::GetInstance()->AddUnSupportedInfo(node, infos); if (ctrlEdgeNum >= 1) { continue; } } // normal needed down op - if (IsWhiteListSupport(node->type_string(), mix_compile_mode, node->name()) && !IsWithoutNpuScope(node)) { + if (IsNpuSupportingNode(node, compile_mode, func_lib)) { candidates->insert(node); } else { outSet.insert(node); - GenerateReport::Details infos; - static const std::string message = "This node is not supported on npu"; - infos.code = GenerateReport::NotSupport; - infos.message = message; - GenerateReport::GetInstance()->AddUnSupportedInfo(node, infos); } } } @@ -465,11 +426,6 @@ Status FindNpuSupportCandidates(const Graph &graph, OrderedNodeSet *candidates, if (unsupportedFrames.find(cfInfo.frame_name) != unsupportedFrames.end()) { outSet.insert(*it); it = candidates->erase(it); - GenerateReport::Details infos; - static const std::string message = "This node is will not be excuted on npu in mix_compile_mode"; - infos.code = GenerateReport::ScenarioProblems; - infos.message = message; - GenerateReport::GetInstance()->AddUnSupportedInfo(*it, infos); } else { ++it; } @@ -482,10 +438,7 @@ Status FindNpuSupportCandidates(const 
Graph &graph, OrderedNodeSet *candidates, auto node = *iter; if (mix_compile_mode && (node->type_string() == "Where")) { bool isInitializedGraph = InferShapeUtil::IsInitializedGraph(node); - if (isInitializedGraph) { - candidates->insert(node); - GenerateReport::GetInstance()->DeleteUnSupportedInfo(node); - } + if (isInitializedGraph) { candidates->insert(node); } } outSet.erase(iter); @@ -497,11 +450,6 @@ Status FindNpuSupportCandidates(const Graph &graph, OrderedNodeSet *candidates, if (IsRefType(dtypeDst) && candidates->count(edge->dst()) > 0) { candidates->erase(edge->dst()); outSet.insert(edge->dst()); - GenerateReport::Details infos; - static const std::string message = "This node is will not be excuted on npu because of REF input"; - infos.code = GenerateReport::ScenarioProblems; - infos.message = message; - GenerateReport::GetInstance()->AddUnSupportedInfo(edge->dst(), infos); LOG(INFO) << "Remove node : " << edge->dst()->name() << " from candidates, because of node : " << node->name() << " REF input."; continue; @@ -509,14 +457,7 @@ Status FindNpuSupportCandidates(const Graph &graph, OrderedNodeSet *candidates, if (dtypeDst == DT_STRING || dtypeDst == DT_RESOURCE) { if (edge->dst()->type_string() == "Assert") { continue; } if (node->type_string() == "Const") { continue; } - if (candidates->erase(edge->dst()) > 0) { - outSet.insert(edge->dst()); - GenerateReport::Details infos; - static const std::string message = "An unsinked node link to this node by DT_STRING/DT_RESOURCE edge"; - infos.code = GenerateReport::ScenarioProblems; - infos.message = message; - GenerateReport::GetInstance()->AddUnSupportedInfo(edge->dst(), infos); - } + if (candidates->erase(edge->dst()) > 0) { outSet.insert(edge->dst()); } } } } @@ -529,24 +470,12 @@ Status FindNpuSupportCandidates(const Graph &graph, OrderedNodeSet *candidates, if (IsRefType(dtypeDst) && candidates->count(edge->src()) > 0) { candidates->erase(edge->src()); outSet.insert(edge->src()); - GenerateReport::Details infos; - static const std::string message = "This node is will not be excuted on npu because of REF output"; - infos.code = GenerateReport::ScenarioProblems; - infos.message = message; - GenerateReport::GetInstance()->AddUnSupportedInfo(edge->dst(), infos); LOG(INFO) << "Remove node : " << edge->dst()->name() << " from candidates, because of node : " << node->name() << " REF Output."; continue; } if (dtypeDst == DT_STRING || dtypeDst == DT_RESOURCE) { - if (candidates->erase(edge->src()) > 0) { - outSet.insert(edge->src()); - GenerateReport::Details infos; - static const std::string message = "This node link to an unsinked node by DT_STRING/DT_RESOURCE edge"; - infos.code = GenerateReport::ScenarioProblems; - infos.message = message; - GenerateReport::GetInstance()->AddUnSupportedInfo(edge->dst(), infos); - } + if (candidates->erase(edge->src()) > 0) { outSet.insert(edge->src()); } } } } @@ -1135,6 +1064,11 @@ class OMSplitter { Status SetOptions(std::map npu_optimizer_options, std::map pass_options); + // GEOp node(s) in the output graph. Not owned. + // both point to the function call node. + Node *GEOpNodeInputs_; + Node *GEOpNodeOutputs_; + private: // The subgraph extracted from the input graph, suitable for being turned // into a FunctionDef. Inputs are fed by _Arg nodes, and outputs are @@ -1150,11 +1084,6 @@ class OMSplitter { // Name that is used for the GEOp node. string functionDefName_; - // GEOp node(s) in the output graph. Not owned. - // both point to the function call node. 
- Node *GEOpNodeInputs_; - Node *GEOpNodeOutputs_; - // Maps from source (producer node/slot) and destination // (consumer node/slot) tensors in the input graph to _Arg numbers in // the subgraph. @@ -1830,6 +1759,8 @@ Status OMPartitionSubgraphsPass::ProcessGraph(std::unique_ptr *graph, Fun } else { return Status::OK(); } + + LOG(INFO) << "OMPartition subgraph_" << std::to_string(graph_num) << " begin."; LOG(INFO) << "mix_compile_mode is " << (mix_compile_mode ? "True" : "False"); LOG(INFO) << "iterations_per_loop is " << iterations_per_loop; @@ -2009,7 +1940,7 @@ Status OMPartitionSubgraphsPass::ProcessGraph(std::unique_ptr *graph, Fun } TF_RETURN_IF_ERROR(OMSplitter::OMPartitionSubgraphsInFunctions( OMSplitter::PARTITION_SUB_GRAPH_ATTR, graph, graph_format_value, func_lib, all_options, pass_options)); - LOG(INFO) << "OMPartition subgraph_" << std::to_string(graph_num) << "SubgraphsInFunctions success."; + LOG(INFO) << "OMPartition subgraph_" << std::to_string(graph_num) << " SubgraphsInFunctions success."; FixupSourceAndSinkEdges(graph->get()); if (need_print != nullptr && strcmp("1", need_print) == 0) { diff --git a/tf_adapter/optimizers/om_partition_subgraphs_pass.h b/tf_adapter/optimizers/om_partition_subgraphs_pass.h index 890592028e637c085f06c4dbe980ebbd46f9073a..ca4e55b4caa7e80ac11e7a008737f3157e74d010 100644 --- a/tf_adapter/optimizers/om_partition_subgraphs_pass.h +++ b/tf_adapter/optimizers/om_partition_subgraphs_pass.h @@ -46,6 +46,9 @@ Status MarkForPartition(const GraphOptimizationPassOptions &options, int &cluste // functions to introduce. Status OMPartitionSubgraphsInFunctions(string groupAttribute, const GraphOptimizationPassOptions &options, string graph_format); + +bool IsNpuSupportingNode(const NodeDef &node_def, bool mix_compile_mode, FunctionLibraryDefinition *func_lib); +bool IsNpuSupportingNode(Node *node, bool mix_compile_mode, FunctionLibraryDefinition *func_lib); } // namespace OMSplitter class OMPartitionSubgraphsPass : public GraphOptimizationPass { diff --git a/tf_adapter/python/npu_bridge/__init__.py b/tf_adapter/python/npu_bridge/__init__.py index c3b77fe3540a03e3da26c9a1665300aee3988e60..d6c2996b3028cb67ede8ec6e2cee6454b699db9e 100644 --- a/tf_adapter/python/npu_bridge/__init__.py +++ b/tf_adapter/python/npu_bridge/__init__.py @@ -1,20 +1,6 @@ -# Copyright 2019-2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================ from npu_bridge.helper.helper import npu_bridge_handle from npu_bridge.helper.helper import version as __version__ from npu_bridge.helper import helper from npu_bridge.estimator.npu import npu_estimator from npu_bridge.hccl import hccl_ops -__all__ = [_s for _s in dir() if not _s.startswith('_')] +__all__ = [_s for _s in dir() if not _s.startswith('_')] \ No newline at end of file diff --git a/tf_adapter/python/npu_bridge/estimator/npu/keras_to_npu.py b/tf_adapter/python/npu_bridge/estimator/npu/keras_to_npu.py index 8a139a4108fb6a243b0c5a4597db24922bfd39be..9bafe38bca52f028dbd43f3eba13b6bace6c79d4 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/keras_to_npu.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/keras_to_npu.py @@ -1,17 +1,3 @@ -# Copyright 2019-2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ from __future__ import absolute_import from __future__ import division from __future__ import print_function diff --git a/tf_adapter/python/npu_bridge/estimator/npu/mnist_softmax_npu.py b/tf_adapter/python/npu_bridge/estimator/npu/mnist_softmax_npu.py index dc28cdb709dd24436926ff29b00f39fad9e23d04..e30f42f81c4f265a8869d1effba5df5898782cb9 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/mnist_softmax_npu.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/mnist_softmax_npu.py @@ -1,17 +1,4 @@ -# Copyright 2019-2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ + """Simple MNIST classifier example with npu and timelines. Note: Please see further comments in the document. diff --git a/tf_adapter/python/npu_bridge/estimator/npu/mnist_with_estimator.py b/tf_adapter/python/npu_bridge/estimator/npu/mnist_with_estimator.py index 6b87c0a5d767eed57c4847f375ddfac7edc5fc5f..fd13a7163156fee1ba99c2efdcc1ded087b10cc0 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/mnist_with_estimator.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/mnist_with_estimator.py @@ -1,17 +1,3 @@ -# Copyright 2019-2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ from __future__ import absolute_import from __future__ import division from __future__ import print_function diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_common.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_common.py index a4da6a612404d1cf5a12023a4110015dc61f3fe7..1ffb1a96104566ee2736829728dd4586b60623f5 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_common.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_common.py @@ -1,17 +1,3 @@ -# Copyright 2019-2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ """Inter-process communication using HCOM.""" from __future__ import absolute_import @@ -172,12 +158,6 @@ class NPUBasics(object): index = device_info.get('Index', None) util_lib.check_nonnegative_integer(index, 'Index') - dev_index = device_info.get('dev_index', None) - util_lib.check_nonnegative_integer(dev_index, 'dev_index') - - server_id = device_info.get('server_id', None) - util_lib.check_not_none(device_info, 'server_id') - # 2. Get the rank_table_file and check it. rank_table_file = data.get('rank_table_file', None) util_lib.check_not_none(rank_table_file, 'rank_table_file') @@ -190,7 +170,7 @@ class NPUBasics(object): local_checkpoint_dir = data.get('local_checkpoint_dir', None) # 5. Init the JobInfo. 
- device_info = DeviceInfo(index=str(index), server_id=server_id, dev_index=dev_index) + device_info = DeviceInfo(index=str(index)) job_info = JobInfo(device_info=device_info, rank_table_file=rank_table_file, local_checkpoint_dir=local_checkpoint_dir, rank_size=rank_size) return job_info @@ -210,12 +190,6 @@ class NPUBasics(object): if(identity == ""): identity = os.getenv('RANK_ID', "") - dev_index = os.getenv('DEVICE_ID') - if dev_index != None and dev_index.isdigit() and int(dev_index) <=7 and int(dev_index) >= 0: - dev_index = int(dev_index) - else: - raise RuntimeError("DEVICE_ID environment variable should in [0, 7]") - checkpoint_dir = os.getenv('LOCAL_CHECKPOINT_DIR', "") # cann't get rank_size from env, set to default 1 @@ -224,7 +198,7 @@ class NPUBasics(object): print("set rank_size to default 1") rank_size = 1 - device_info = DeviceInfo(index=str(identity), server_id="192.168.1.1", dev_index=int(dev_index)) + device_info = DeviceInfo(index=str(identity)) job_info = JobInfo(job_id=job_id, heartbeat_time=heartbeat, device_info=device_info, diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py index d3784439e11e57d0b5a925b3d55d1965a3643316..a411be885927435506e16a732cb08db8443c1c22 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py @@ -1,17 +1,3 @@ -# Copyright 2019-2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -21,6 +7,7 @@ from tensorflow.python.platform import gfile import os import re import json +from enum import Enum from npu_bridge.estimator.npu import util from tensorflow.python.estimator import run_config as run_config_lib from tensorflow.distribute.experimental import ParameterServerStrategy @@ -57,10 +44,16 @@ class NPURunConfig(run_config_lib.RunConfig): dump_config=None, stream_max_parallel_num=None, is_tailing_optimization=False, - horovod_mode = False, - graph_run_mode = 1, - op_debug_level = 0, - enable_scope_fusion_passes = None + horovod_mode=False, + graph_run_mode=1, + op_debug_level=0, + enable_scope_fusion_passes=None, + enable_exception_dump=0, + op_select_implmode=None, + optypelist_for_implmode=None, + dynamic_input_config=None, + mstune_mode=None, + work_path=None ): """ Constructs a NPUConfig. @@ -95,7 +88,7 @@ class NPURunConfig(run_config_lib.RunConfig): log_step_count_steps: The frequency, in number of global steps, that the global step/sec and the loss will be logged during training. enabel_data_pre_proc: This is the switch of data preprocess. - precision_mode: enable or disable mix precision. + precision_mode: if train, default is: allow_fp32_to_fp16; if inference, default is: force_fp16. 
variable_format_optimize: enable or disable variable format optimize while graph engineer optimize process. mix_compile_mode: This is the swith of mix_compile_mode. When the value is @@ -109,6 +102,13 @@ class NPURunConfig(run_config_lib.RunConfig): dump_config: The dump configuration. stream_max_parallel_num: Specify the degree of parallelism of the AICPU / AICORE engine to achieve parallel execution between AICPU / AICORE operators. + op_select_implmode: Select whether the operator is implemented with high precision + or high performance. + optypelist_for_implmode: Operator list. + dynamic_input_config: Dynamic dims configuration. + mstune_mode: Optimization task type. "1": model tune; "2": optune; + "3": model tune & optune; "4": gradient split tune. + work_path: Stores temporary files generated during optimization. """ # Check iterations_per_loop. @@ -164,6 +164,15 @@ class NPURunConfig(run_config_lib.RunConfig): experimental_distribute = None if tmp_cluster_spec and isinstance(distribute, ParameterServerStrategy): experimental_distribute = DistributeConfig(distribute, distribute, None) + util.check_nonnegative_integer(enable_exception_dump, "enable_exception_dump") + self.enable_exception_dump = enable_exception_dump + self._op_select_implmode = op_select_implmode + self._optypelist_for_implmode = optypelist_for_implmode + if dynamic_input_config is not None and not isinstance(dynamic_input_config, DynamicInputConfig): + raise ValueError('dynamic_input_config must be provided with type DynamicInputConfig') + self._dynamic_input_config = dynamic_input_config + self._mstune_mode = mstune_mode + self._work_path = work_path super(NPURunConfig, self).__init__( model_dir=model_dir, @@ -182,18 +191,23 @@ class ProfilingConfig(): def __init__(self, enable_profiling=False, - enable_options=[]): + enable_options=[], + fp_point=None, + bp_point=None): """ Constructs a ProfilingConfig. Args: enable_profiling: Enable profiling, default is False. enable_options: Profiling options, list of `training_trace` or `task_trace` or `op_trace`. + fp_point: Forward propagation first node name. + bp_point: Back propagation last node name. """ self._enable_profiling = enable_profiling self._enable_options = enable_options - + self._fp_point = fp_point + self._bp_point = bp_point class DumpConfig(): """Dump Config with NPU support.""" @@ -221,3 +235,26 @@ class DumpConfig(): self._dump_mode = dump_mode self._enable_dump_debug = enable_dump_debug self._dump_debug_mode = dump_debug_mode + +class NpuExecutePlacement(Enum): + """npu execute placement option.""" + ALL = "all" + CUBE = "cube" + VECTOR = "vector" + TAISHAN = "taishan" + DVPP = "dvpp" + HOST = "host" + +class DynamicInputConfig(): + """dynamic dims and input shape config with npu support""" + def __init__(self, input_shape, dynamic_dims): + """ + Constructs a DynamicInputConfig. + + Args: + input_shape: the network's input shapes. + dynamic_dims: This parameter corresponds to input_shape. + The dim value in dims corresponds to the parameter "-1" in input_shape. 
+ """ + self._input_shape = input_shape + self._dynamic_dims = dynamic_dims \ No newline at end of file diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py index fb66d8b5ac4896b217d19525b12864d8d09679ca..9480741c0451999fbaf019b5e4f6b33c1fc511dd 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py @@ -1,17 +1,3 @@ -# Copyright 2019-2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -566,6 +552,12 @@ class NPUEstimator(estimator_lib.Estimator): # check profiling ,and get valid options profiling_options = self.__check_profiling_options(config._profiling_config._enable_options) custom_op.parameter_map["profiling_options"].s = tf.compat.as_bytes(profiling_options) + if "task_trace" in profiling_options or "training_trace" in profiling_options: + if config._profiling_config._fp_point is None or config._profiling_config._bp_point is None: + logging.warning("profiling training_trace option should use with bp_point and fp_point") + else: + custom_op.parameter_map["bp_point"].s = tf.compat.as_bytes(config._profiling_config._bp_point) + custom_op.parameter_map["fp_point"].s = tf.compat.as_bytes(config._profiling_config._fp_point) else: # User disable profiling, custom_op.parameter_map["profiling_mode"].b = False @@ -580,6 +572,12 @@ class NPUEstimator(estimator_lib.Estimator): """ if config._precision_mode is not None: custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes(config._precision_mode) + else: + if config.graph_run_mode: + custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_fp32_to_fp16") + else: + custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("force_fp16") + custom_op.parameter_map["enable_reduce_precision"].b = config._enable_reduce_precision def __load__variable_format_optimize(self, config, custom_op): @@ -656,6 +654,43 @@ class NPUEstimator(estimator_lib.Estimator): custom_op.parameter_map["job"].s = tf.compat.as_bytes('localhost') custom_op.parameter_map["task_index"].i = 0 + def _load_op_performance_config(self, config, custom_op): + """Load _load_op_performance_config ,and add to custom_optimizers + Args: + config: NPURunConfig. + custom_op: Customer optimizers. + """ + if config._op_select_implmode is not None: + custom_op.parameter_map["op_select_implmode"].s = tf.compat.as_bytes(config._op_select_implmode) + if config._optypelist_for_implmode is not None: + custom_op.parameter_map["optypelist_for_implmode"].s = tf.compat.as_bytes(config._optypelist_for_implmode) + + def __load_dynamic_input_config(self, config, custom_op): + """Load dynamic input config,and add to custom_optimizers + Args: + config: NPURunConfig. + custom_op: Customer optimizers. 
+ """ + + if config._dynamic_input_config is not None \ + and config._dynamic_input_config._input_shape is not None \ + and config._dynamic_input_config._dynamic_dims is not None: + custom_op.parameter_map["input_shape"].s = tf.compat.as_bytes(config._dynamic_input_config._input_shape) + custom_op.parameter_map["dynamic_dims"].s = tf.compat.as_bytes(config._dynamic_input_config._dynamic_dims) + + def __load_mstune_config(self, config, custom_op): + """Load mstune config ,and add to custom_optimizers + Args: + config: NPURunConfig. + custom_op: Customer optimizers. + """ + if config._mstune_mode is not None: + custom_op.parameter_map["mstune_mode"].s = tf.compat.as_bytes(config._mstune_mode) + if config._work_path is not None: + custom_op.parameter_map["work_path"].s = tf.compat.as_bytes(config._work_path) + else: + raise ValueError('work_path must be set when use mstune_mode') + def __load_graph_optimizers(self, config): """Change the session config and load the graph optimizers: GradFusionOptimizer and OMPartitionSubgraphsPass.""" @@ -691,6 +726,7 @@ class NPUEstimator(estimator_lib.Estimator): custom_op.parameter_map["op_debug_level"].i = config.op_debug_level if config.enable_scope_fusion_passes is not None: custom_op.parameter_map["enable_scope_fusion_passes"].s = tf.compat.as_bytes(config.enable_scope_fusion_passes) + custom_op.parameter_map["enable_exception_dump"].i = config.enable_exception_dump # add profiling options to custom_op self.__load_profiling_options(config, custom_op) @@ -712,6 +748,13 @@ class NPUEstimator(estimator_lib.Estimator): self.__load_ps_mode_config(config, custom_op) + self._load_op_performance_config(config, custom_op) + + # add dynamic_input_config to custom_op + self.__load_dynamic_input_config(config, custom_op) + + self.__load_mstune_config(config, custom_op) + return config @@ -754,4 +797,4 @@ class NPUEstimator(estimator_lib.Estimator): break logging.warning('Using temporary folder as model directory: %s', model_dir) tf.io.gfile.mkdir(model_dir) - return model_dir \ No newline at end of file + return model_dir diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_hook.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_hook.py index 881532c2fc915a2175bf46706dcabeb2669e35a2..f1fc8bb697af5790fa4bb312a1d54524d2c2d26c 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_hook.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_hook.py @@ -1,17 +1,3 @@ -# Copyright 2019-2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================ import tensorflow as tf from six.moves import queue as Queue import threading @@ -159,12 +145,11 @@ def broadcast_global_variables(root_rank, index): op_list = [] for var in tf.global_variables(): # the input and out tensor of HCOMBroadcast interface are list - if "float" in var.dtype.name: - inputs = [var] - outputs=hccl_ops.broadcast(tensor=inputs,root_rank=root_rank) - if outputs is not None: - op_list.append(outputs[0].op) - op_list.append(tf.assign(var, outputs[0])) + inputs = [var] + outputs=hccl_ops.broadcast(tensor=inputs,root_rank=root_rank) + if outputs is not None: + op_list.append(outputs[0].op) + op_list.append(tf.assign(var, outputs[0])) return tf.group(op_list) diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_loss_scale_manager.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_loss_scale_manager.py index 75a3efdda170afc315436d0304dda2dd21633a6b..4f5f46cb4803781a75bca0abf0e09062caeb12ab 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_loss_scale_manager.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_loss_scale_manager.py @@ -12,20 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# -# Copyright 2019-2020 Huawei Technologies Co., Ltd. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. """LossScaleManager classes for mixed precision training.""" from __future__ import absolute_import from __future__ import division diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_loss_scale_optimizer.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_loss_scale_optimizer.py index 9a23778da231ee6c3f5d0a8860160e12280f44cb..0356f7f8f30cd167694e60880aef12da5cd6742e 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_loss_scale_optimizer.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_loss_scale_optimizer.py @@ -1,17 +1,4 @@ -# Copyright 2019-2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ + # Optimizer for mixed precision training for Davinci NPU. 
"""Loss scaling optimizer.""" diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_optimizer.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_optimizer.py index 23ea950bf1e4abaa1171b1b1272cd2269aaa1aa5..8e89daca98a451e20286c5a71dd6bc86734ea0c3 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_optimizer.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_optimizer.py @@ -1,17 +1,3 @@ -# Copyright 2019-2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ """ Optimizer that implements distributed gradient reduction for NPU. """ diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_plugin.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_plugin.py index c2bb1ae22d42f709b5151333067f7840122d6214..37dc689ecfee8d9125d85c836392b9f62f4ce56a 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_plugin.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_plugin.py @@ -1,19 +1,8 @@ -# Copyright 2019-2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================ +from hccl.manage.api import get_local_rank_size +from hccl.manage.api import get_rank_id from npu_bridge import tf_adapter from npu_bridge.estimator.npu import util +from tensorflow.python.platform import tf_logging as logging import json import os @@ -24,37 +13,44 @@ __option_exec_profiling_mode = str(tf_adapter.OPTION_EXEC_PROFILING_MODE) __option_exec_profiling_options = str(tf_adapter.OPTION_EXEC_PROFILING_OPTIONS) __option_graph_run_mode = str(tf_adapter.OPTION_GRAPH_RUN_MODE) __option_exec_option_exec_hccl_flag = str(tf_adapter.OPTION_EXEC_HCCL_FLAG) +__option_exec_fp_point = str(tf_adapter.OPTION_EXEC_PROFILING_FPPONIT_OPTIONS) +__option_exec_bp_point = str(tf_adapter.OPTION_EXEC_PROFILING_BPPONIT_OPTIONS) -def npu_global_init(graph_run_mode = 1, - op_debug_level = 0, - is_tailing_optimization = False, - enable_profiling = False, - enable_options = "training_trace", - auto_tune_mode = None, - precision_mode = None, - enable_scope_fusion_passes = None, - ): +def npu_resource_init(graph_run_mode = 1, + op_debug_level = 0, + enable_profiling = False, + enable_options = ["training_trace"], + auto_tune_mode = None, + precision_mode = None, + enable_scope_fusion_passes = None, + enable_exception_dump = 0, + fp_point = None, + bp_point = None, + mstune_mode = None, + work_path = None): util.check_nonnegative_integer(graph_run_mode, "graph_run_mode") if graph_run_mode > 1: raise ValueError('"graph_run_mode" value must be 0 or 1') - + util.check_nonnegative_integer(enable_exception_dump, "enable_exception_dump") util.check_nonnegative_integer(op_debug_level, "op_debug_level") - util.check_bool_type(is_tailing_optimization, "is_tailing_optimization") util.check_bool_type(enable_profiling, "enable_profiling") - graph_run_mode = str(graph_run_mode) - op_debug_level = str(op_debug_level) - is_tailing_optimization = str(util.convert_bool_to_int(is_tailing_optimization)) - enable_profiling = str(util.convert_bool_to_int(enable_profiling)) + enable_profiling = util.convert_bool_to_int(enable_profiling) init={} - init[__option_graph_run_mode] = graph_run_mode - init[__op_debug_level] = op_debug_level - init["ge.exec.isTailingOptimization"] = is_tailing_optimization - init[__option_exec_profiling_mode] = enable_profiling + init[__option_graph_run_mode] = str(graph_run_mode) + init[__op_debug_level] = str(op_debug_level) + init[__option_exec_profiling_mode] = str(enable_profiling) - if enable_profiling is True: - init[__option_exec_profiling_options] = str(util.check_profiling_options(enable_options)) + if enable_profiling: + enable_options = str(util.check_profiling_options(enable_options)) + init[__option_exec_profiling_options] = enable_options + if "task_trace" in enable_options or "training_trace" in enable_options: + if fp_point is None or bp_point is None: + logging.warning("profiling training_trace option should be used with bp_point and fp_point") + else: + init[__option_exec_fp_point] = str(fp_point) + init[__option_exec_bp_point] = str(bp_point) else: + init[__option_exec_profiling_options] = str("training_trace") @@ -63,20 +59,70 @@ def npu_global_init(graph_run_mode = 1, if precision_mode is not None: init["ge.exec.precision_mode"] = str(precision_mode) + else: + if graph_run_mode: + init["ge.exec.precision_mode"] = str("allow_fp32_to_fp16") + else: + init["ge.exec.precision_mode"] = str("force_fp16") if enable_scope_fusion_passes is not None: init[__option_exec_enable_scope_fusion_passes] = 
str(enable_scope_fusion_passes) - config_info = json.loads(os.environ.get('TF_CONFIG') or '{}') - task_env = config_info.get('task', {}) - task_type = task_env.get('type', None) - exec_hccl_flag = 1 - if task_type == 'evaluator': - exec_hccl_flag = 0 - init[__option_exec_option_exec_hccl_flag] = str(exec_hccl_flag) + init["ge.exec.enable_exception_dump"] = str(enable_exception_dump) + rank_size = os.getenv('RANK_SIZE') + if int(rank_size) > 1 and mstune_mode is not None: + mstune_mode = util.check_mstune_mode(mstune_mode) + if mstune_mode == "4": + mstune_mode = "tuning" + init["ge.buildMode"] = str(mstune_mode) + if work_path is not None: + init["ge.tuningPath"] = str(util.check_path(work_path)) + else: + raise ValueError('work_path must be set when using mstune_mode') init_options=tf_adapter.map_string_string(init) tf_adapter.PluginInit(init_options) -def npu_global_shutdown(): +def npu_resource_shutdown(): tf_adapter.PluginFinalize() + +def rdma_remote_init(remote_var_list, mem_size): + ''' + remote_var_list: embedding and opt var list. + mem_size: rdma pool memory size to be allocated. type: int + ''' + if not isinstance(remote_var_list, (tuple, list)): + raise ValueError('{} should be tuple or list'.format(remote_var_list)) + if not isinstance(mem_size, int): + raise ValueError('{} should be int'.format(mem_size)) + var_addr_list = [] + local_rank_size = get_local_rank_size() + rank_id = get_rank_id() + server_id = int(rank_id / local_rank_size) + for var in remote_var_list: + server_var = var[server_id] + host_var_info = tf_adapter.HostVarInfo() + host_var_info.base_addr = server_var[1] + host_var_info.var_size = server_var[2] + var_addr_list.append(host_var_info) + res = tf_adapter.RdmaInitAndRegister(var_addr_list, mem_size) + if res != 0: + raise RuntimeError('rdma init and register failed') + +def get_var_addr_and_size(var_name): + if not isinstance(var_name, str): + raise ValueError('{} should be str'.format(var_name)) + res = tf_adapter.GetVarAddrAndSize(var_name) + if res[0] != 0: + raise RuntimeError('{} get var addr and size failed'.format(var_name)) + return res[1], res[2] + +def malloc_shared_memory(var_name, shape, data_type): + tensor_info = tf_adapter.TensorInfo() + tensor_info.var_name = var_name + tensor_info.dims = tf_adapter.int64_vec(shape) + tensor_info.data_type = data_type + res = tf_adapter.MallocSharedMem(tensor_info) + if res[0] != 0: + raise RuntimeError('{} malloc shared memory failed'.format(var_name)) + return res[1], res[2] \ No newline at end of file diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_rnn.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_rnn.py index dae7d36a81c5f8218892601b9d4fa608737dbda0..dbd60742070c635dc2b343b67cafe6daac0161e7 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_rnn.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_rnn.py @@ -1,17 +1,3 @@ -# Copyright 2019-2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
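A minimal usage sketch (illustrative, not part of the patch) for the renamed npu_resource_init / npu_resource_shutdown entry points above; the fp_point/bp_point node names and the RANK_SIZE value are placeholder assumptions.

# Minimal sketch, assuming a single-device setup; node names below are placeholders.
import os
from npu_bridge.estimator.npu import npu_plugin

os.environ.setdefault("RANK_SIZE", "1")   # npu_resource_init reads RANK_SIZE before the mstune branch
npu_plugin.npu_resource_init(
    graph_run_mode=1,                     # 1: training graph, 0: inference graph
    enable_profiling=True,
    enable_options=["training_trace"],
    fp_point="conv2d/Conv2D",             # assumed first forward-propagation node
    bp_point="gradients/AddN",            # assumed last back-propagation node
    enable_exception_dump=1)
# ... build and run the TensorFlow graph here ...
npu_plugin.npu_resource_shutdown()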
-# ============================================================================ import tensorflow as tf def npu_dynamic_rnn(cell, diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_scope.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_scope.py index 9747ce7a8e7811d718784b65ec75d3184c32322b..72702e146f32e0bc4a1f3931e186b4a28ce30399 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_scope.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_scope.py @@ -1,32 +1,34 @@ -# Copyright 2019-2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ """ Config the non npu compilation scope for NPU in mix compute mode. """ import contextlib from tensorflow.core.framework import attr_value_pb2 from tensorflow.python.framework import ops +from tensorflow.python.util import compat +from npu_bridge.estimator.npu.npu_config import NpuExecutePlacement @contextlib.contextmanager def without_npu_compile_scope(): - ''' - Enable the non npu compilation of operators within the scope. - ''' - attrs = { - "_without_npu_compile" : attr_value_pb2.AttrValue(b=True) - } + ''' + Enable the non npu compilation of operators within the scope. + ''' + attrs = { + "_without_npu_compile" : attr_value_pb2.AttrValue(b=True) + } - with ops.get_default_graph()._attr_scope(attrs): - yield + with ops.get_default_graph()._attr_scope(attrs): + yield + +@contextlib.contextmanager +def npu_variable_scope(placement=NpuExecutePlacement.ALL): + ''' + Add the _variable_placement attr to nodes created within the scope. + ''' + if placement not in NpuExecutePlacement: + raise ValueError("placement value must be in NpuExecutePlacement's values") + + attrs = { + "_variable_placement" : attr_value_pb2.AttrValue(s=compat.as_bytes(placement.value)) + } + with ops.get_default_graph()._attr_scope(attrs): + yield \ No newline at end of file diff --git a/tf_adapter/python/npu_bridge/estimator/npu/util.py b/tf_adapter/python/npu_bridge/estimator/npu/util.py index a00df761b93116f28f949ca0ba2624f2ad477e84..73ceaa639e8bebe1d9cbf1b18f5aa0e3e5627948 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/util.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/util.py @@ -1,22 +1,9 @@ -# Copyright 2019-2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
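A minimal sketch (illustrative, not part of the patch) of how the two scopes above are expected to be used; the variable and op below are placeholders.

# Minimal sketch of npu_variable_scope and without_npu_compile_scope usage.
import tensorflow as tf
from npu_bridge.estimator.npu import npu_scope
from npu_bridge.estimator.npu.npu_config import NpuExecutePlacement

with npu_scope.npu_variable_scope(NpuExecutePlacement.HOST):
    # nodes created here carry the "_variable_placement" attr with value "host"
    table = tf.get_variable("embedding_table", shape=[1024, 64])

with npu_scope.without_npu_compile_scope():
    # nodes created here carry "_without_npu_compile" and are kept off the NPU
    debug_print = tf.print(table)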
-# ============================================================================ from __future__ import absolute_import from __future__ import division from __future__ import print_function import six +import os import tensorflow as tf from tensorflow.python.framework import ops from tensorflow.python.training import training_util @@ -86,7 +73,7 @@ def format_string(value, name): return str(value) -def check_profiling_options(self, profiling_options=[]): +def check_profiling_options(profiling_options=[]): """Check profiling options . Args: profiling_options: Profiling options. @@ -109,6 +96,40 @@ def check_profiling_options(self, profiling_options=[]): result = ":".join(profiling_options) return result +def check_path(path): + """Check path. + Args: + path: path. + Return: + real path + Raise: + if path does not exist, is not a directory, or lacks read and write permissions. + """ + if os.path.exists(path): + real_path = os.path.realpath(path) + if not os.path.isdir(real_path): + raise ValueError("path:%s is not a directory." %(path)) + if not os.access(real_path, os.R_OK | os.W_OK): + raise ValueError("path:%s does not have read and write permissions." %(path)) + else: + raise ValueError("path:%s does not exist." %(path)) + return real_path + +def check_mstune_mode(mstune_mode): + """Check mstune mode. + Args: + mstune_mode: Optimization task type. "1": model tune; "2": optune; + "3": model tune & optune; "4": gradient split tune. + Return: + mstune_mode + Raise: + If mstune_mode is null or not in ['1', '2', '3', '4']. + """ + mstune_modes = ['1', '2', '3', '4'] + if mstune_mode not in mstune_modes: + raise ValueError("mstune_mode is invalid, should be in ['1', '2', '3', '4']") + return mstune_mode + def register_func(var_name): ops.register_proto_function( '{}_{}'.format(_NPU_RUNCONFIG, var_name), @@ -217,4 +238,16 @@ class IterationPerLoop(): self._iterations_per_loop_var.load(iterations_per_loop - 1, session=sess) self._loop_cond_var.load(0, session=sess) self._const_zero.load(0, session=sess) - self._const_one.load(1, session=sess) \ No newline at end of file + self._const_one.load(1, session=sess) + +def variable_initializer_in_host(var_list): + """Returns an Op that initializes a list of variables. + If `var_list` is empty, however, the function still returns an Op that can + be run. That Op just has no effect. + Args: + var_list: List of `Variable` objects to initialize. + Returns: + An Op that runs the initializers of all the specified variables. + """ + return tf.initializers.variables(var_list, name='var_in_host') \ No newline at end of file diff --git a/tf_adapter/python/npu_bridge/estimator/npu_aicore_ops.py b/tf_adapter/python/npu_bridge/estimator/npu_aicore_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..ab1a6372a2164be27c61377bd54df003d2898917 --- /dev/null +++ b/tf_adapter/python/npu_bridge/estimator/npu_aicore_ops.py @@ -0,0 +1,46 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""All bert ops.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.util import loader +from tensorflow.python.platform import resource_loader +from tensorflow.python.framework import ops + + +from npu_bridge.helper import helper +npu_aicore_ops = helper.get_gen_ops(); + +@ops.RegisterGradient("FastGelu") +def _fast_gelu_grad(op, grad): + """The gradient for `fast_gelu`. + + Args: + op: The `fast_gelu` `Operation` that we are differentiating, which we can use + to find the inputs and outputs of the original op. + grad: Gradient with respect to the output of the `fast_gelu` op. + + Returns: + Gradients with respect to the input of `fast_gelu`. + """ + return [npu_aicore_ops.fast_gelu_grad(grad, op.inputs[0])] # List of one Tensor, since we have one input + +# go/tf-wildcard-import +#from tensorflow.python.util.tf_export import tf_export + diff --git a/tf_adapter/python/npu_bridge/estimator/npu_ops.py b/tf_adapter/python/npu_bridge/estimator/npu_ops.py index 83f3697c97ec896bbc2459a1d88bf23287cc349e..ab88bf56c4b1af5617839f3bdb6e4ec82030fc57 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu_ops.py +++ b/tf_adapter/python/npu_bridge/estimator/npu_ops.py @@ -12,20 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# -# Copyright 2019-2020 Huawei Technologies Co., Ltd. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. """Ops for collective operations implemented using hccl.""" from __future__ import absolute_import from __future__ import division diff --git a/tf_adapter/python/npu_bridge/estimator/npu_unary_ops.py b/tf_adapter/python/npu_bridge/estimator/npu_unary_ops.py index dd8b2e3cd923fec2796e58480f73ab8245f20fe8..a9b6733067146b6aeb844747025567f1d7c55583 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu_unary_ops.py +++ b/tf_adapter/python/npu_bridge/estimator/npu_unary_ops.py @@ -12,20 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# -# Copyright 2019-2020 Huawei Technologies Co., Ltd. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. """All bert ops.""" diff --git a/tf_adapter/python/npu_bridge/hccl/hccl_ops.py b/tf_adapter/python/npu_bridge/hccl/hccl_ops.py index a25d1c69134f48f0d45dc6ad3ac2d82e6df99259..970da13c3350259ba6530c7bce0bc0a37ff9afcc 100644 --- a/tf_adapter/python/npu_bridge/hccl/hccl_ops.py +++ b/tf_adapter/python/npu_bridge/hccl/hccl_ops.py @@ -1,17 +1,3 @@ -# Copyright 2019-2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ ## @file hccl_ops.py # HCCL operator API @@ -115,3 +101,22 @@ def receive(shape, data_type, sr_tag, src_rank, group="hccl_world_group"): sr_tag=sr_tag, src_rank=src_rank) return result + +## Provides the remote read function +# @param remote Remote memory info, shape(index_num, 3): [u64 remoteId, u64 remoteAddr, u64 dataLength] +# @param data_type Data type of the received tensor +# @return Local receive buffer, shape(index_num, dataLength/sizeof(data_type)) +def remote_read(tensorRemote, data_type): + result = gen_hccl_ops.hcom_remote_read( + remote=tensorRemote, + dtype=data_type) + return result + +## Provides the remote write function +# @param remote Remote memory info to write to, shape(index_num, 3): [u64 remoteId, u64 remoteAddr, u64 dataLength] +# @param local Local send buffer +def remote_write(tensorRemote, tensorLocal, data_type): + result = gen_hccl_ops.hcom_remote_write( + remote=tensorRemote, + local=tensorLocal) + return result \ No newline at end of file diff --git a/tf_adapter/python/npu_bridge/helper/helper.py b/tf_adapter/python/npu_bridge/helper/helper.py index 66dd0e27f748d32f1965759e4922768ca36b8467..64774fbd27ef56f51119ffff2c37b489631c9a15 100644 --- a/tf_adapter/python/npu_bridge/helper/helper.py +++ b/tf_adapter/python/npu_bridge/helper/helper.py @@ -1,17 +1,3 @@ -# Copyright 2019-2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
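A minimal sketch (illustrative, not part of the patch) driving the remote_read/remote_write wrappers above; the remote descriptors are placeholder values, each row being [remoteId, remoteAddr, dataLength] in uint64 as described in the translated comments.

# Minimal sketch of hcom remote read/write usage with placeholder descriptors.
import tensorflow as tf
from npu_bridge.hccl import hccl_ops

remote = tf.constant([[0, 0x1000, 256], [0, 0x2000, 256]], dtype=tf.uint64)  # shape (index_num, 3)
local_read = hccl_ops.remote_read(remote, tf.float32)   # shape (index_num, dataLength / sizeof(float32))
local_send = tf.zeros([2, 64], dtype=tf.float32)        # local buffer pushed to the remote side
write_op = hccl_ops.remote_write(remote, local_send, tf.float32)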
-# ============================================================================ import tensorflow import npu_bridge import os diff --git a/tf_adapter/python/npu_bridge/image/image_ops.py b/tf_adapter/python/npu_bridge/image/image_ops.py index 8f3d8f372a4df35fad68a0185c770cd862a23b96..ecb7c6ccd786699fca46f046ee109d6dfbf3780f 100644 --- a/tf_adapter/python/npu_bridge/image/image_ops.py +++ b/tf_adapter/python/npu_bridge/image/image_ops.py @@ -1,17 +1,3 @@ -# Copyright 2019-2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ from tensorflow.contrib.util import loader from tensorflow.python.framework import load_library from tensorflow.python.platform import resource_loader diff --git a/tf_adapter/python/npu_bridge/npu_cpu/npu_cpu_ops.py b/tf_adapter/python/npu_bridge/npu_cpu/npu_cpu_ops.py index ae7c199d550d671584837bc154c108f0834065a4..391241c46a0cb836a98ee1367e7044e5db070c3d 100644 --- a/tf_adapter/python/npu_bridge/npu_cpu/npu_cpu_ops.py +++ b/tf_adapter/python/npu_bridge/npu_cpu/npu_cpu_ops.py @@ -1,17 +1,3 @@ -# Copyright 2019-2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ from tensorflow.contrib.util import loader from tensorflow.python.framework import load_library from tensorflow.python.framework import ops diff --git a/tf_adapter/python/setup.py b/tf_adapter/python/setup.py index 82fb00bd498ac6463683d71d05bd7ba2a8a3f88f..6c76feae7c09421843fd1edbe2cc0974fe2d0c49 100644 --- a/tf_adapter/python/setup.py +++ b/tf_adapter/python/setup.py @@ -12,20 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# -# Copyright 2019-2020 Huawei Technologies Co., Ltd. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
"""npu bridge for tensorflow v1.15.0. diff --git a/tf_adapter/swig/ge_plugin.i b/tf_adapter/swig/ge_plugin.i index 02931535c784bf10a2c3178dcb1c489a93f775c1..20cdee606840614fc5f46c45abcd3659d75709a8 100644 --- a/tf_adapter/swig/ge_plugin.i +++ b/tf_adapter/swig/ge_plugin.i @@ -1,15 +1,51 @@ %module tf_adapter %include "std_string.i" %include "std_map.i" +%include "typemaps.i" +%include "std_vector.i" + +%begin %{ +#define SWIG_PYTHON_CAST_MODE +%} + +typedef int int32_t; +%apply int { int32_t }; + +#if defined(SMALL_LONG) + +typedef long long int64_t; +typedef unsigned long long uint64_t; +%apply long long { int64_t }; +%apply unsigned long long { uint64_t }; + +#else + +typedef long int int64_t; +typedef unsigned long int uint64_t; +%apply long int { int64_t }; +%apply unsigned long int { uint64_t }; + +#endif namespace std{ %template(map_string_string) map; } +%apply uint64_t &OUTPUT { uint64_t &base_addr, uint64_t &var_size, uint64_t &dev_addr, uint64_t &memory_size }; + %{ #include "tf_adapter/util/npu_plugin.h" + +extern int32_t RdmaInitAndRegister(const std::vector &var_info, size_t size); + +extern int32_t GetVarAddrAndSize(const std::string &var_name, uint64_t &base_addr, uint64_t &var_size); + +extern int32_t MallocSharedMem(const ge::TensorInfo &tensor_info, uint64_t &dev_addr, uint64_t &memory_size); %} +%template(var_info_vec) std::vector; +%template(int64_vec) std::vector; + extern const char* const AUTO_TUNE_MODE; extern const char* const OP_DEBUG_LEVEL; extern const char* const OPTION_EXEC_ENABLE_SCOPE_FUSION_PASSES; @@ -17,7 +53,55 @@ extern const char* const OPTION_EXEC_PROFILING_MODE; extern const char* const OPTION_EXEC_PROFILING_OPTIONS; extern const char* const OPTION_GRAPH_RUN_MODE; extern const char* const OPTION_EXEC_HCCL_FLAG; +extern const char* const OPTION_EXEC_PROFILING_FPPONIT_OPTIONS; +extern const char* const OPTION_EXEC_PROFILING_BPPONIT_OPTIONS; extern void PluginInit(std::map& init_options); extern void PluginFinalize(); + +namespace ge{ + struct HostVarInfo { + uint64_t base_addr; + uint64_t var_size; + }; + enum DataType { + DT_FLOAT = 0, // float type + DT_FLOAT16 = 1, // fp16 type + DT_INT8 = 2, // int8 type + DT_INT16 = 6, // int16 type + DT_UINT16 = 7, // uint16 type + DT_UINT8 = 4, // uint8 type + DT_INT32 = 3, // int32 type + DT_INT64 = 9, // int64 type + DT_UINT32 = 8, // unsigned int32 + DT_UINT64 = 10, // unsigned int64 + DT_BOOL = 12, // bool type + DT_DOUBLE = 11, // double type + DT_STRING = 13, // string type + DT_DUAL_SUB_INT8 = 14, // dual output int8 type + DT_DUAL_SUB_UINT8 = 15, // dual output uint8 type + DT_COMPLEX64 = 16, // complex64 type + DT_COMPLEX128 = 17, // complex128 type + DT_QINT8 = 18, // qint8 type + DT_QINT16 = 19, // qint16 type + DT_QINT32 = 20, // qint32 type + DT_QUINT8 = 21, // quint8 type + DT_QUINT16 = 22, // quint16 type + DT_RESOURCE = 23, // resource type + DT_STRING_REF = 24, // string ref type + DT_DUAL = 25, // dual output type + DT_UNDEFINED // Used to indicate a DataType field has not been set. 
+ }; + struct TensorInfo { + std::string var_name; + std::vector dims; + DataType data_type; + }; +} + +extern int32_t RdmaInitAndRegister(const std::vector &var_info, size_t size); + +extern int32_t GetVarAddrAndSize(const std::string &var_name, uint64_t &base_addr, uint64_t &var_size); + +extern int32_t MallocSharedMem(const ge::TensorInfo &tensor_info, uint64_t &dev_addr, uint64_t &memory_size); \ No newline at end of file diff --git a/tf_adapter/util/ge_plugin.cc b/tf_adapter/util/ge_plugin.cc index 1f09948d253f5fe21548f8fb2a7f9956865b76a0..ad598b0d6b2c2cba380e4764e93ea3df21a4f85f 100644 --- a/tf_adapter/util/ge_plugin.cc +++ b/tf_adapter/util/ge_plugin.cc @@ -27,18 +27,18 @@ limitations under the License. #include "framework/common/ge_inner_error_codes.h" #include "framework/common/types.h" -#include "framework/omg/parser/model_parser.h" #include "framework/omg/parser/parser_api.h" -#include "framework/omg/parser/parser_factory.h" +#include "framework/omg/omg_inner_types.h" #include "ge/ge_api.h" #include "ge/ge_api_types.h" #include "tdt/tdt_host_interface.h" -#include "tdt/tsd_client.h" #include "tensorflow/core/util/env_var.h" #include "tf_adapter/common/common.h" #include "tf_adapter/util/npu_attrs.h" #include "tf_adapter/util/npu_plugin.h" #include +#include "nlohmann/json.hpp" +using json = nlohmann::json; using namespace tensorflow; using namespace tdt; @@ -71,15 +71,33 @@ void GePlugin::Init(std::map &init_options, bool is_gl return; } + const char *tf_config = std::getenv("TF_CONFIG"); + int exec_hccl_flag = 1; + if (tf_config != nullptr) { + json config_info; + try { + config_info = json::parse(tf_config); + } catch (json::exception &e) { + LOG(WARNING) << "[GePlugin] Failed to convert TF_CONFIG info from string to json ,reason: " << e.what(); + } + if (config_info.is_object()) { + if (config_info["task"]["type"] == "ps") { + LOG(INFO) << "The ps process does not need to be initialized"; + return; + } + if (config_info["task"]["type"] == "evaluator") { + exec_hccl_flag = 0; + } + } + } + init_options[OPTION_EXEC_HCCL_FLAG] = std::to_string(exec_hccl_flag); + LOG(INFO) << "[GePlugin] graph run mode : " << init_options[ge::OPTION_GRAPH_RUN_MODE]; - // prepare options for ge Initialize - const int64 kMaxDeviceID = 7; - (void) ReadInt64FromEnvVar("DEVICE_ID", 0, &device_id_); - if (device_id_ < 0 || device_id_ > kMaxDeviceID) { - LOG(WARNING) << "[GePlugin] device_id should in [0, 7]. 
use default device id : 0."; - } + Status s = GetEnvDeviceID(device_id_); + if (!s.ok()) { LOG(FATAL) << s.error_message(); } init_options[ge::OPTION_EXEC_DEVICE_ID] = std::to_string(device_id_); + LOG(INFO) << "[GePlugin] device id : " << init_options[ge::OPTION_EXEC_DEVICE_ID]; const char *env_job_id = std::getenv("JOB_ID"); if (env_job_id != nullptr) { @@ -97,34 +115,35 @@ void GePlugin::Init(std::map &init_options, bool is_gl bool is_use_hcom = false; bool deploy_mode = false; - char *env_rank_id = std::getenv("RANK_ID"); - char *env_pod_name = std::getenv("POD_NAME"); char *env_rank_table_file = std::getenv("RANK_TABLE_FILE"); if ((env_rank_table_file != nullptr) && (rankSizeNum > 0)) { LOG(INFO) << "[GePlugin] env RANK_TABLE_FILE:" << env_rank_table_file; is_use_hcom = true; init_options[ge::OPTION_EXEC_RANK_TABLE_FILE] = env_rank_table_file; + char *env_pod_name = std::getenv("POD_NAME"); if (env_pod_name != nullptr) { deploy_mode = true; init_options[ge::OPTION_EXEC_POD_NAME] = env_pod_name; - } else if (env_rank_id != nullptr) { - LOG(INFO) << "[GePlugin] env RANK_ID:" << env_rank_id; - deploy_mode = false; - init_options[ge::OPTION_EXEC_RANK_ID] = env_rank_id; } else { - LOG(ERROR) << "[GePlugin] Can't find rank_id or pod_name in env."; + char *env_rank_id = std::getenv("RANK_ID"); + if (env_rank_id != nullptr) { + LOG(INFO) << "[GePlugin] env RANK_ID:" << env_rank_id; + deploy_mode = false; + init_options[ge::OPTION_EXEC_RANK_ID] = env_rank_id; + } else { + LOG(ERROR) << "[GePlugin] Can't find rank_id or pod_name in env."; + } } } init_options[ge::OPTION_EXEC_IS_USEHCOM] = std::to_string(is_use_hcom); init_options[ge::OPTION_EXEC_DEPLOY_MODE] = std::to_string(deploy_mode); - // tailing optimization - LOG(INFO) << "[GePlugin] is_tailing_optimization : " << init_options["ge.exec.isTailingOptimization"]; - // profiling configuration LOG(INFO) << "[GePlugin] profiling_mode : " << init_options[ge::OPTION_EXEC_PROFILING_MODE] - << ", profiling_options:" << init_options[ge::OPTION_EXEC_PROFILING_OPTIONS]; + << ", profiling_options:" << init_options[ge::OPTION_EXEC_PROFILING_OPTIONS] + << ", fp_point: " << init_options[ge::OPTION_EXEC_PROFILING_FPPONIT_OPTIONS] + << ", bp_point: " << init_options[ge::OPTION_EXEC_PROFILING_BPPONIT_OPTIONS]; // mix precision configuration LOG(INFO) << "[GePlugin] precision_mode : " << init_options[ge::PRECISION_MODE]; @@ -138,6 +157,18 @@ void GePlugin::Init(std::map &init_options, bool is_gl // scope fusion configuration LOG(INFO) << "[GePlugin] enable_scope_fusion_passes : " << init_options[ge::OPTION_EXEC_ENABLE_SCOPE_FUSION_PASSES]; + // exception dump configuration + LOG(INFO) << "[GePlugin] enable_exception_dump : " << init_options["ge.exec.enable_exception_dump"]; + + LOG(INFO) << "[GePlugin] job_id : " << init_options[ge::OPTION_EXEC_JOB_ID]; + + // mstune mode and work path + if (init_options["ge.buildMode"] == "4") { + init_options["ge.buildMode"] = "tuning"; + } + LOG(INFO) << "[GePlugin] mstune mode : " << init_options["ge.buildMode"] + << ", work path : " << init_options["ge.tuningPath"]; + // Open TsdClient first, then call GEInitialize LOG(INFO) << "[GePlugin] Open TsdClient and Init tdt host."; int32_t ret = tdt::TdtHostInit(static_cast(device_id_)); @@ -145,20 +176,14 @@ void GePlugin::Init(std::map &init_options, bool is_gl std::this_thread::sleep_for(std::chrono::milliseconds(kFatalSleepTime)); LOG(FATAL) << "[GePlugin] Tdt host init failed, tdt error code : " << ret; } - TDT_StatusT tdt_status = TsdOpen(static_cast(device_id_), 
static_cast(rankSizeNum)); - if (tdt_status != TDT_OK) { - std::this_thread::sleep_for(std::chrono::milliseconds(kFatalSleepTime)); - LOG(FATAL) << "[GePlugin] Open TsdClient failed, tdt error code : " << tdt_status - << ", error message : " << TDT_GET_ERROR_STR(tdt_status); - } - LOG(INFO) << "[GePlugin] Open TsdClient success and tdt host init success."; - + // ge Initialize ge::Status status = ge::GEInitialize(init_options); if (status != ge::SUCCESS) { std::this_thread::sleep_for(std::chrono::milliseconds(kFatalSleepTime)); LOG(FATAL) << "[GePlugin] Initialize ge failed, ret : " << ToString(status); } + domi::GetContext().train_flag = true; LOG(INFO) << "[GePlugin] Initialize ge success."; // parser Initialize @@ -189,13 +214,6 @@ void GePlugin::Finalize() { LOG(INFO) << "[GePlugin] Close TsdClient and destroy tdt."; int32_t ret = tdt::TdtHostDestroy(); - if (ret != 0) { LOG(ERROR) << "[GePlugin] Close tdt failed, tdt_ret : " << ret; } - TDT_StatusT tdt_status = TsdClose(device_id_); - if (tdt_status != TDT_OK) { - LOG(ERROR) << "[GePlugin] Close TsdClient failed, tdt_ret : " << tdt_status; - } else { - LOG(INFO) << "[GePlugin] Close TsdClient success."; - } isInit_ = false; } @@ -206,10 +224,46 @@ bool GePlugin::IsGlobal() { void PluginInit(std::map &init_options) { GePlugin::GetInstance()->Init(init_options, true); - LOG(INFO) << "npu plugin init success"; + LOG(INFO) << "[GePlugin] npu plugin init success"; } void PluginFinalize() { GePlugin::GetInstance()->Finalize(); - LOG(INFO) << "npu plugin finalize success"; + LOG(INFO) << "[GePlugin] npu plugin finalize success"; +} + +int32_t RdmaInitAndRegister(const std::vector &var_info, size_t size) { + ge::Status ret = ge::InitRdmaPool(size); + if (ret != ge::SUCCESS) { + LOG(ERROR) << "[GePlugin] init rdma pool failed, ret : " << ToString(ret); + return -1; + } + LOG(INFO) << "[GePlugin] init rdma pool success."; + ret = ge::RdmaRemoteRegister(var_info); + if (ret != ge::SUCCESS) { + LOG(ERROR) << "[GePlugin] rdma remote register failed, ret : " << ToString(ret); + return -1; + } + LOG(INFO) << "[GePlugin] rdma remote register success."; + return 0; +} + +int32_t GetVarAddrAndSize(const string &var_name, uint64_t &base_addr, uint64_t &var_size) { + ge::Status ret = ge::GetVarBaseAddrAndSize(var_name, base_addr, var_size); + if (ret != ge::SUCCESS) { + LOG(ERROR) << "[GePlugin] get " << var_name << " base addr and size failed, ret : " << ToString(ret); + return -1; + } + LOG(INFO) << "[GePlugin] get " << var_name << " base addr and size success."; + return 0; } + +int32_t MallocSharedMem(const ge::TensorInfo &tensor_info, uint64_t &dev_addr, uint64_t &memory_size) { + ge::Status ret = ge::MallocSharedMemory(tensor_info, dev_addr, memory_size); + if (ret != ge::SUCCESS) { + LOG(ERROR) << "[GePlugin] malloc shared memory failed, ret : " << ToString(ret); + return -1; + } + LOG(INFO) << "[GePlugin] malloc shared memory success."; + return 0; +} \ No newline at end of file diff --git a/tf_adapter/util/ge_plugin.h b/tf_adapter/util/ge_plugin.h index 5f7b3483f9196276e8be5acd6815dd1db0209a59..e83d73cb7ec049973d215007770443cc067fb28d 100644 --- a/tf_adapter/util/ge_plugin.h +++ b/tf_adapter/util/ge_plugin.h @@ -51,7 +51,7 @@ class GePlugin { ~GePlugin(); - int64 device_id_; + uint32_t device_id_; bool isInit_; bool isGlobal_; std::map init_options_; diff --git a/tf_adapter/util/generate_report.cc b/tf_adapter/util/generate_report.cc index 9ee6d53450faa1c3bf8296a4a0e3b06d76b75915..30b19cfc25dd04c3d971933001aab77e58c61c70 100644 --- 
a/tf_adapter/util/generate_report.cc +++ b/tf_adapter/util/generate_report.cc @@ -1,5 +1,17 @@ -/* Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at @@ -10,16 +22,20 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the License.*/ +limitations under the License. +==============================================================================*/ +#include "tf_adapter/util/generate_report.h" + +#include +#include #include "nlohmann/json.hpp" #include "tensorflow/core/platform/env.h" -#include "tf_adapter/util/generate_report.h" namespace tensorflow { using Json = nlohmann::json; const static uint32_t kInterval = 2; -const static std::string kUnsupportedInfoPath = "checkresult.tf.json"; +const static std::string kUnsupportedInfoPath = "check_result.tf.json"; // json file keys const static std::string kKeyName = "name"; @@ -31,9 +47,15 @@ const static std::string kKeyIsSupport = "is_support"; const static std::string kKeyMessage = "message"; GenerateReport::GenerateReport() { - char *need_save = std::getenv("ENABLE_NETWORK_ANALYSIS"); - if (need_save != nullptr && strcmp("1", need_save) == 0) { - save_report_ = true; + char current_path[PATH_MAX]; + if (getcwd(current_path, PATH_MAX) != nullptr){ + string path = current_path; + path = path + "/" + kUnsupportedInfoPath; + if (remove(path.c_str()) == -1){ + LOG(WARNING) << "[GenerateReport] Remove check report failed. path:" << path; + } else { + LOG(INFO) << "[GenerateReport] Remove check report success. 
path:" << path; + } } } @@ -45,11 +67,8 @@ GenerateReport *GenerateReport::GetInstance() { Status GenerateReport::AddUnSupportedInfo(Node *node, Details &infos) { return GenerateReport::AddUnSupportedInfo(node->name(), node->type_string(), infos); } -Status GenerateReport::AddUnSupportedInfo(const std::string &name, const std::string &type, Details &infos) { - if (!save_report_) { - return Status::OK(); - } +Status GenerateReport::AddUnSupportedInfo(const std::string &name, const std::string &type, Details &infos) { if (check_info_map_.find(name) != check_info_map_.end()) { return Status::OK(); } else { @@ -63,9 +82,6 @@ Status GenerateReport::AddUnSupportedInfo(const std::string &name, const std::st } Status GenerateReport::DeleteUnSupportedInfo(Node *node) { - if (!save_report_) { - return Status::OK(); - } auto info_iter = check_info_map_.find(node->name()); if (info_iter == check_info_map_.end()) { return Status::OK(); @@ -76,20 +92,21 @@ Status GenerateReport::DeleteUnSupportedInfo(Node *node) { } Status GenerateReport::SaveUnsupportedInfo() { - if (!save_report_) { + if (check_info_map_.empty()){ + LOG(INFO) << "[GenerateReport] All nodes are supported, no need to save report."; return Status::OK(); } Json graph_info; - for (auto info : check_info_map_) { - Json reason = {{kKeyCode, info.second.info_details.code}, {kKeyMessage, info.second.info_details.message}}; - Json op = {{kKeyName, info.second.name}, - {kKeyType, info.second.type}, - {kKeyIsSupport, info.second.is_support}, - {kKeyReason, reason}}; - graph_info[kKeyOp].push_back(op); - } std::string info_str; try { + for (auto info : check_info_map_) { + Json reason = {{kKeyCode, info.second.info_details.code}, {kKeyMessage, info.second.info_details.message}}; + Json op = {{kKeyName, info.second.name}, + {kKeyType, info.second.type}, + {kKeyIsSupport, info.second.is_support}, + {kKeyReason, reason}}; + graph_info[kKeyOp].push_back(op); + } info_str = graph_info.dump(kInterval, ' ', false, Json::error_handler_t::ignore); } catch (std::exception &e) { return errors::Internal("Failed to convert json to string ,reason:", e.what()); @@ -98,4 +115,6 @@ Status GenerateReport::SaveUnsupportedInfo() { } return tensorflow::WriteStringToFile(Env::Default(), kUnsupportedInfoPath, info_str); } + +GenerateReport::~GenerateReport(){}; } // namespace tensorflow diff --git a/tf_adapter/util/generate_report.h b/tf_adapter/util/generate_report.h index e4f35ad4d4923ef6009eb22fd0bba0ed0a48a0bc..705483ad892d87e6797486ad0a2e4637db8d2327 100644 --- a/tf_adapter/util/generate_report.h +++ b/tf_adapter/util/generate_report.h @@ -32,7 +32,7 @@ limitations under the License. // Op will be written to json if it can not sink to device during one excute. 
namespace tensorflow { class GenerateReport { - public: +public: struct Details { int code; std::string message; @@ -40,12 +40,18 @@ class GenerateReport { enum ReasonCode { TypeNoDefine = 1, TypeGray = 2, ScenarioProblems = 3, NotSupport = 4 }; static GenerateReport *GetInstance(); + Status AddUnSupportedInfo(const std::string &name, const std::string &type, Details &infos); + Status AddUnSupportedInfo(Node *node, Details &infos); + Status DeleteUnSupportedInfo(Node *node); + Status SaveUnsupportedInfo(); - private: + ~GenerateReport(); + +private: GenerateReport(); struct UnSupportedInfo { std::string name; @@ -54,7 +60,6 @@ class GenerateReport { Details info_details; }; std::map check_info_map_; - bool save_report_ = false; }; } // namespace tensorflow diff --git a/tf_adapter/util/infershape_util.cc b/tf_adapter/util/infershape_util.cc index 6aa07d1d981b9e00c437ee36ae12a86c5386de9d..090433ad61c125ac40973560e1631950cc5f8f36 100644 --- a/tf_adapter/util/infershape_util.cc +++ b/tf_adapter/util/infershape_util.cc @@ -79,18 +79,26 @@ Status InferShapeUtil::setArgShapeFromTensorShape(std::vector vecTensor, return Status::OK(); } -Status InferShapeUtil::getSubGraphFromFunctionDef(const FunctionDef &func_def, Graph *graph) { +Status InferShapeUtil::GetSubGraphFromFunctionDef(const FunctionLibraryDefinition &flib_def, + const FunctionDef &func_def, Graph *graph) { LOG(INFO) << "The signature name of FunctionDef is " << func_def.signature().name() << "."; InstantiationResult result; AttrSlice attrs(&func_def.attr()); TF_RETURN_IF_ERROR(InstantiateFunction( - func_def, attrs, [](const string &op, const OpDef **sig) { return OpRegistry::Global()->LookUpOpDef(op, sig); }, - &result)); + func_def, attrs, [&flib_def](const string &op, const OpDef **sig) { + Status s = OpRegistry::Global()->LookUpOpDef(op, sig); + if (!s.ok()) { + return flib_def.LookUpOpDef(op, sig); + } + return s; + }, &result)); + LOG(INFO) << "InstantiateFunction " << func_def.signature().name() << " success."; GraphConstructorOptions opts; opts.allow_internal_ops = true; opts.expect_device_spec = false; TF_RETURN_IF_ERROR(ConvertNodeDefsToGraph(opts, result.nodes, graph)); + LOG(INFO) << "ConvertNodeDefsToGraph " << func_def.signature().name() << " success."; return Status::OK(); } @@ -343,7 +351,7 @@ Status InferShapeUtil::InferShape(const std::vector &vecTensor, const Fu return errors::Internal("Input tensor num ", iTensorNums, " is less than arg num ", iInputArgNums, "."); } - TF_RETURN_IF_ERROR(getSubGraphFromFunctionDef(*func_def, graph)); + TF_RETURN_IF_ERROR(GetSubGraphFromFunctionDef(*flib_def, *func_def, graph)); // Control flow loops in the graph; we have to break them. 
std::vector NextIterationEdges; diff --git a/tf_adapter/util/infershape_util.h b/tf_adapter/util/infershape_util.h index b47ad53c6054d4a4650e9c6581c8e5230bb4036e..5e89bda0ff15410016f5e112a4a6fc777560671a 100644 --- a/tf_adapter/util/infershape_util.h +++ b/tf_adapter/util/infershape_util.h @@ -50,7 +50,8 @@ class InferShapeUtil { const FunctionLibraryDefinition *flib_def, const FunctionDef *func_def, Graph *graph); - static Status getSubGraphFromFunctionDef(const FunctionDef &func_def, Graph *graph); + static Status GetSubGraphFromFunctionDef(const FunctionLibraryDefinition &flib_def, + const FunctionDef &func_def, Graph *graph); static int64 GetCurrentTimestap(); static bool IsInitializedGraph(Node *node); diff --git a/tf_adapter/util/npu_attrs.cc b/tf_adapter/util/npu_attrs.cc index af853c43ab68b3b3b0a28ffb7f04f961acd21dd3..bec36218f3ce38da380f5727ba18da541bf64169 100644 --- a/tf_adapter/util/npu_attrs.cc +++ b/tf_adapter/util/npu_attrs.cc @@ -25,13 +25,46 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tdt/index_transform.h" #include "tf_adapter/util/npu_attrs.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/util/env_var.h" #include "securec.h" +#include "mmpa/mmpa_api.h" #include #include namespace tensorflow { - +Status GetEnvDeviceID(uint32_t &device_id) { + int64 phy_device_id = -1; + int64 logic_device_id = -1; + const char* tmp_ascend_device_id = std::getenv("ASCEND_DEVICE_ID"); + string env_ascend_device_id(tmp_ascend_device_id == nullptr ? "" : tmp_ascend_device_id); + const char* tmp_device_id = std::getenv("DEVICE_ID"); + string env_device_id(tmp_device_id == nullptr ? 
"" : tmp_device_id); + if (env_ascend_device_id.empty() && env_device_id.empty()) { + LOG(WARNING) << "[GePlugin] DEVICE_ID and ASCEND_DEVICE_ID is none, use default device id : 0"; + } else if (!env_ascend_device_id.empty()) { + if (!strings::safe_strto64(env_ascend_device_id, &logic_device_id)) { + return errors::InvalidArgument("ASCEND_DEVICE_ID is valid, not digit."); + } + if (logic_device_id < 0) { + return errors::InvalidArgument("ASCEND_DEVICE_ID should be >= 0."); + } + device_id = static_cast(logic_device_id); + } else { + if (!strings::safe_strto64(env_device_id, &phy_device_id)) { + return errors::InvalidArgument("DEVICE_ID is valid, not digit."); + } + if (phy_device_id < 0) { + return errors::InvalidArgument("DEVICE_ID should be >= 0."); + } + if (IndexTransform(static_cast(phy_device_id), device_id) != 0) { + return errors::InvalidArgument("get logic device id by DEVICE_ID failed."); + } + } + return Status::OK(); +} inline void split(const std::string &s, std::vector &result, const char *delchar = " ") { if (s.empty()) { return; } result.clear(); @@ -44,7 +77,9 @@ inline void split(const std::string &s, std::vector &result, const } char *p_tmp = nullptr; char *p = strtok_s(buffer, delchar, &p_tmp); - do { result.emplace_back(p); } while ((p = strtok_s(nullptr, delchar, &p_tmp))); + if (p != nullptr) { + do { result.emplace_back(p); } while ((p = strtok_s(nullptr, delchar, &p_tmp))); + } delete[] buffer; } @@ -112,6 +147,41 @@ inline Status checkDumpDebugMode(const string &dump_debug_mode) { } } +inline Status CheckPath(const string &input, string &output) { + if (mmIsDir(input.c_str()) != EN_OK) { + return errors::InvalidArgument("the path ", input.c_str(), " is not directory."); + } + char trusted_path[MMPA_MAX_PATH] = { "\0" }; + if (mmRealPath(input.c_str(), trusted_path, MMPA_MAX_PATH) != EN_OK) { + return errors::InvalidArgument("the path ", input.c_str(), " is invalid."); + } + if (mmAccess2(trusted_path, R_OK | W_OK) != EN_OK) { + return errors::InvalidArgument("the path ", input.c_str(), " does't have read, write permissions."); + } + output = trusted_path; + return Status::OK(); +} + +inline Status CheckOpImplMode(const string &op_select_implmode) { + std::set op_impl_mode_list = {"high_precision", "high_performance"}; + + if (op_impl_mode_list.find(op_select_implmode) != op_impl_mode_list.end()) { + return Status::OK(); + } else { + return errors::InvalidArgument("op select impl mode should be one of the list:[high_precision, high_performance]"); + } +} + +inline Status CheckMstuneMode(const string &mstune_mode) { + std::set mstune_mode_list = {"1", "2", "3", "4"}; + + if (mstune_mode_list.find(mstune_mode) != mstune_mode_list.end()) { + return Status::OK(); + } else { + return errors::InvalidArgument("mstune mode should be one of the list:['1', '2', '3', '4']"); + } +} + std::map NpuAttrs::GetSessOptions(OpKernelConstruction *ctx) { std::map sess_options; std::string variable_format_optimize = std::to_string(true); @@ -126,6 +196,11 @@ std::map NpuAttrs::GetSessOptions(OpKernelConstruction std::string dump_debug_mode = "all"; std::string stream_max_parallel_num; string npuOptimizer; + std::string is_tailing_optimization = std::to_string(false); + std::string op_select_implmode; + std::string optypelist_for_implmode; + string input_shape; + string dynamic_dims; if (ctx != nullptr && ctx->GetAttr("_NpuOptimizer", &npuOptimizer) == Status::OK()) { ctx->GetAttr("_variable_format_optimize", &variable_format_optimize); @@ -154,6 +229,11 @@ std::map 
NpuAttrs::GetSessOptions(OpKernelConstruction } } ctx->GetAttr("_stream_max_parallel_num", &stream_max_parallel_num); + ctx->GetAttr("_is_tailing_optimization", &is_tailing_optimization); + ctx->GetAttr("_op_select_implmode", &op_select_implmode); + ctx->GetAttr("_optypelist_for_implmode", &optypelist_for_implmode); + ctx->GetAttr("_input_shape", &input_shape); + ctx->GetAttr("_dynamic_dims", &dynamic_dims); } // session options @@ -168,26 +248,33 @@ std::map NpuAttrs::GetSessOptions(OpKernelConstruction sess_options[ge::OPTION_EXEC_DUMP_MODE] = dump_mode; sess_options[ge::OPTION_EXEC_ENABLE_DUMP_DEBUG] = enable_dump_debug; sess_options[ge::OPTION_EXEC_DUMP_DEBUG_MODE] = dump_debug_mode; + sess_options["ge.exec.isTailingOptimization"] = is_tailing_optimization; + sess_options[ge::OP_SELECT_IMPL_MODE] = op_select_implmode; + sess_options[ge::OPTYPELIST_FOR_IMPLMODE] = optypelist_for_implmode; + sess_options["ge.inputShape"] = input_shape; + sess_options["ge.dynamicDims"] = dynamic_dims; return sess_options; } std::map NpuAttrs::GetDefaultInitOptions() { std::map init_options; - init_options["ge.exec.isTailingOptimization"] = std::to_string(false); - init_options["ge.exec.precision_mode"] = ""; + init_options["ge.exec.precision_mode"] = "allow_fp32_to_fp16"; init_options[ge::OPTION_EXEC_PROFILING_MODE] = std::to_string(false); init_options[ge::OPTION_EXEC_PROFILING_OPTIONS] = "training_trace"; init_options[ge::AUTO_TUNE_MODE] = ""; init_options[ge::OPTION_GRAPH_RUN_MODE] = "1"; init_options[ge::OP_DEBUG_LEVEL] = "0"; init_options[ge::OPTION_EXEC_ENABLE_SCOPE_FUSION_PASSES] = ""; + init_options[ge::OPTION_EXEC_PROFILING_FPPONIT_OPTIONS] = ""; + init_options[ge::OPTION_EXEC_PROFILING_BPPONIT_OPTIONS] = ""; + init_options["ge.buildMode"] = ""; + init_options["ge.tuningPath"] = ""; return init_options; } std::map NpuAttrs::GetInitOptions(OpKernelConstruction *ctx) { std::map init_options; - std::string is_tailing_optimization = std::to_string(false); std::string precision_mode; std::string profiling_mode = std::to_string(false); std::string profiling_options = "training_trace"; @@ -195,10 +282,14 @@ std::map NpuAttrs::GetInitOptions(OpKernelConstruction std::string graph_run_mode = "1"; std::string op_debug_level = "0"; std::string enable_scope_fusion_passes; + std::string enable_exception_dump; string npuOptimizer; + string bp_point; + string fp_point; + string mstune_mode; + string work_path; if (ctx != nullptr && ctx->GetAttr("_NpuOptimizer", &npuOptimizer) == Status::OK()) { - ctx->GetAttr("_is_tailing_optimization", &is_tailing_optimization); ctx->GetAttr("_precision_mode", &precision_mode); ctx->GetAttr("_profiling_mode", &profiling_mode); ctx->GetAttr("_profiling_options", &profiling_options); @@ -206,9 +297,13 @@ std::map NpuAttrs::GetInitOptions(OpKernelConstruction ctx->GetAttr("_graph_run_mode", &graph_run_mode); ctx->GetAttr("_op_debug_level", &op_debug_level); ctx->GetAttr("_enable_scope_fusion_passes", &enable_scope_fusion_passes); + ctx->GetAttr("_bp_point", &bp_point); + ctx->GetAttr("_fp_point", &fp_point); + ctx->GetAttr("_enable_exception_dump", &enable_exception_dump); + ctx->GetAttr("_mstune_mode", &mstune_mode); + ctx->GetAttr("_work_path", &work_path); } - init_options["ge.exec.isTailingOptimization"] = is_tailing_optimization; init_options["ge.exec.precision_mode"] = precision_mode; init_options[ge::OPTION_EXEC_PROFILING_MODE] = profiling_mode; if (profiling_mode != std::to_string(false) && !checkProfilingOptions(profiling_options)) { @@ -219,6 +314,11 @@ std::map 
NpuAttrs::GetInitOptions(OpKernelConstruction init_options[ge::OPTION_GRAPH_RUN_MODE] = graph_run_mode; init_options[ge::OP_DEBUG_LEVEL] = op_debug_level; init_options[ge::OPTION_EXEC_ENABLE_SCOPE_FUSION_PASSES] = enable_scope_fusion_passes; + init_options[ge::OPTION_EXEC_PROFILING_BPPONIT_OPTIONS] = bp_point; + init_options[ge::OPTION_EXEC_PROFILING_FPPONIT_OPTIONS] = fp_point; + init_options["ge.exec.enable_exception_dump"] = enable_exception_dump; + init_options["ge.buildMode"] = mstune_mode; + init_options["ge.tuningPath"] = work_path; return init_options; } @@ -380,7 +480,16 @@ std::map NpuAttrs::GetAllAttrOptions(AttrSlice attrs) std::string graph_run_mode = "1"; std::string op_debug_level = "0"; std::string enable_scope_fusion_passes; + std::string enable_exception_dump; string npuOptimizer; + string bp_point; + string fp_point; + std::string op_select_implmode; + std::string optypelist_for_implmode; + string input_shape; + string dynamic_dims; + string mstune_mode; + string work_path; if (attrs.Find("_NpuOptimizer") != nullptr) { do_npu_optimizer = std::to_string(true); @@ -454,6 +563,21 @@ std::map NpuAttrs::GetAllAttrOptions(AttrSlice attrs) if (attrs.Find("_enable_scope_fusion_passes") != nullptr) { enable_scope_fusion_passes = attrs.Find("_enable_scope_fusion_passes")->s(); } + if (attrs.Find("_fp_point") != nullptr) { fp_point = attrs.Find("_fp_point")->s(); } + if (attrs.Find("_bp_point") != nullptr) { bp_point = attrs.Find("_bp_point")->s(); } + if (attrs.Find("_enable_exception_dump") != nullptr) { + enable_exception_dump = attrs.Find("_enable_exception_dump")->s(); + } + if (attrs.Find("_op_select_implmode") != nullptr) { + op_select_implmode = attrs.Find("_op_select_implmode")->s(); + } + if (attrs.Find("_optypelist_for_implmode") != nullptr) { + optypelist_for_implmode = attrs.Find("_optypelist_for_implmode")->s(); + } + if (attrs.Find("_input_shape") != nullptr) { input_shape = attrs.Find("_input_shape")->s(); } + if (attrs.Find("_dynamic_dims") != nullptr) { dynamic_dims = attrs.Find("_dynamic_dims")->s(); } + if (attrs.Find("_mstune_mode") != nullptr) { mstune_mode = attrs.Find("_mstune_mode")->s(); } + if (attrs.Find("_work_path") != nullptr) { work_path = attrs.Find("_work_path")->s(); } } all_options["variable_format_optimize"] = variable_format_optimize; @@ -480,6 +604,7 @@ std::map NpuAttrs::GetAllAttrOptions(AttrSlice attrs) all_options["graph_run_mode"] = graph_run_mode; all_options["op_debug_level"] = op_debug_level; all_options["enable_scope_fusion_passes"] = enable_scope_fusion_passes; + all_options["enable_exception_dump"] = enable_exception_dump; all_options["do_npu_optimizer"] = do_npu_optimizer; all_options["enable_data_pre_proc"] = enable_dp; @@ -489,6 +614,14 @@ std::map NpuAttrs::GetAllAttrOptions(AttrSlice attrs) all_options["lower_functional_ops"] = lower_functional_ops; all_options["job"] = job; all_options["task_index"] = task_index; + all_options["fp_point"] = fp_point; + all_options["bp_point"] = bp_point; + all_options["op_select_implmode"] = op_select_implmode; + all_options["optypelist_for_implmode"] = optypelist_for_implmode; + all_options["input_shape"] = input_shape; + all_options["dynamic_dims"] = dynamic_dims; + all_options["mstune_mode"] = mstune_mode; + all_options["work_path"] = work_path; return all_options; } @@ -539,6 +672,15 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options bool lower_functional_ops = false; string job = "default"; int task_index = 0; + string bp_point; + string fp_point; + int 
enable_exception_dump = 0; + string op_select_implmode; + string optypelist_for_implmode; + string input_shape; + string dynamic_dims; + string mstune_mode; + string work_path; const RewriterConfig &rewrite_options = options.session_options->config.graph_options().rewrite_options(); for (const auto &custom_optimizer : rewrite_options.custom_optimizers()) { @@ -555,7 +697,13 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options if (params.count("enable_dump")) { enable_dump = params.at("enable_dump").b(); } if (params.count("enable_dump_debug")) { enable_dump_debug = params.at("enable_dump_debug").b(); } if (enable_dump || enable_dump_debug) { - if (params.count("dump_path")) { dump_path = params.at("dump_path").s(); } + if (params.count("dump_path")) { + string tmp_path = params.at("dump_path").s(); + Status s = CheckPath(tmp_path, dump_path); + if (!s.ok()) { LOG(FATAL) << s.error_message(); } + } else { + LOG(FATAL) << "if use dump function, dump_path must be set."; + } } if (enable_dump) { if (params.count("dump_step")) { @@ -583,17 +731,44 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options if (params.count("is_tailing_optimization")) { is_tailing_optimization = params.at("is_tailing_optimization").b(); } - if (params.count("precision_mode")) { precision_mode = params.at("precision_mode").s(); } if (params.count("profiling_mode")) { profiling_mode = params.at("profiling_mode").b(); } if (params.count("profiling_options") && profiling_mode) { profiling_options = params.at("profiling_options").s(); } + if (params.count("fp_point")) { fp_point = params.at("fp_point").s(); } + if (params.count("bp_point")) { bp_point = params.at("bp_point").s(); } if (params.count("auto_tune_mode")) { auto_tune_mode = params.at("auto_tune_mode").s(); } - if (params.count("graph_run_mode")) { graph_run_mode = params.at("graph_run_mode").i(); } + if (params.count("graph_run_mode")) { + graph_run_mode = params.at("graph_run_mode").i(); + if (graph_run_mode > 1) { LOG(FATAL) << "graph_run_mode value must be 0 or 1"; } + } if (params.count("op_debug_level")) { op_debug_level = params.at("op_debug_level").i(); } if (params.count("enable_scope_fusion_passes")) { enable_scope_fusion_passes = params.at("enable_scope_fusion_passes").s(); } + int64 rank_size = 1; + (void) ReadInt64FromEnvVar("RANK_SIZE", 1, &rank_size); + if (rank_size > 1 && params.count("mstune_mode")) { + mstune_mode = params.at("mstune_mode").s(); + Status s = CheckMstuneMode(mstune_mode); + if (!s.ok()) { LOG(FATAL) << s.error_message(); } + if (params.count("work_path")) { + string tmp_path = params.at("work_path").s(); + s = CheckPath(tmp_path, work_path); + if (!s.ok()) { LOG(FATAL) << s.error_message(); } + } else { + LOG(FATAL) << "work_path must be set when use mstune_mode."; + } + } + if (params.count("precision_mode")) { + precision_mode = params.at("precision_mode").s(); + } else { + if (graph_run_mode) { + precision_mode = "allow_fp32_to_fp16"; + } else { + precision_mode = "force_fp16"; + } + } do_npu_optimizer = true; if (params.count("enable_data_pre_proc")) { enable_dp = params.at("enable_data_pre_proc").b(); } @@ -609,6 +784,28 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options } if (params.count("task_index")) { task_index = params.at("task_index").i(); } } + if (params.count("enable_exception_dump")) { enable_exception_dump = params.at("enable_exception_dump").i(); } + if (!params.count("op_select_implmode") && 
!params.count("optypelist_for_implmode")) { + op_select_implmode = "high_performance"; + } else if (params.count("op_select_implmode") && !params.count("optypelist_for_implmode")) { + op_select_implmode = params.at("op_select_implmode").s(); + Status s = CheckOpImplMode(op_select_implmode); + if (!s.ok()) { LOG(FATAL) << s.error_message(); } + } else if (params.count("optypelist_for_implmode") && !params.count("op_select_implmode")) { + LOG(FATAL) << "when use optypelist_for_implmode, op_select_implmode must be set."; + } else { + op_select_implmode = params.at("op_select_implmode").s(); + Status s = CheckOpImplMode(op_select_implmode); + if (!s.ok()) { LOG(FATAL) << s.error_message(); } + optypelist_for_implmode = params.at("optypelist_for_implmode").s(); + } + if (params.count("input_shape") && params.count("dynamic_dims")) { + input_shape = params.at("input_shape").s(); + dynamic_dims = params.at("dynamic_dims").s(); + } else if ((params.count("input_shape") && !params.count("dynamic_dims")) || + (!params.count("input_shape") && params.count("dynamic_dims"))) { + LOG(FATAL) << "input_shape and dynamic_dims should be paired."; + } } } @@ -625,18 +822,34 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options sess_options["dump_mode"] = dump_mode; sess_options["enable_dump_debug"] = std::to_string(enable_dump_debug); sess_options["dump_debug_mode"] = dump_debug_mode; + sess_options["is_tailing_optimization"] = std::to_string(is_tailing_optimization); + sess_options["op_select_implmode"] = op_select_implmode; + sess_options["optypelist_for_implmode"] = optypelist_for_implmode; + sess_options["input_shape"] = input_shape; + sess_options["dynamic_dims"] = dynamic_dims; - init_options["is_tailing_optimization"] = std::to_string(is_tailing_optimization); init_options["precision_mode"] = precision_mode; init_options["profiling_mode"] = std::to_string(profiling_mode); if (profiling_mode && !checkProfilingOptions(profiling_options)) { LOG(FATAL) << "profiling options must be in 'training_trace', 'task_trace' or 'op_trace'"; } + if (profiling_mode && (profiling_options.find("task_trace") != string::npos || + profiling_options.find("training_trace") != string::npos)) { + if (bp_point == "" || fp_point == "") { + LOG(WARNING) << "profiling training_trace option should use with bp_point and fp_point"; + } else { + init_options["bp_point"] = bp_point; + init_options["fp_point"] = fp_point; + } + } init_options["profiling_options"] = profiling_options; init_options["auto_tune_mode"] = auto_tune_mode; init_options["graph_run_mode"] = std::to_string(graph_run_mode); init_options["op_debug_level"] = std::to_string(op_debug_level); init_options["enable_scope_fusion_passes"] = enable_scope_fusion_passes; + init_options["enable_exception_dump"] = std::to_string(enable_exception_dump); + init_options["mstune_mode"] = mstune_mode; + init_options["work_path"] = work_path; pass_options["do_npu_optimizer"] = std::to_string(do_npu_optimizer); pass_options["enable_data_pre_proc"] = std::to_string(enable_dp); diff --git a/tf_adapter/util/npu_attrs.h b/tf_adapter/util/npu_attrs.h index 1f6526ad1d48573dd6f516ace7f196fe86fea024..9e37c0beb60a423b915da6aa3f4bca8a3f6428af 100644 --- a/tf_adapter/util/npu_attrs.h +++ b/tf_adapter/util/npu_attrs.h @@ -39,6 +39,7 @@ limitations under the License. 
// single load all npu mode namespace tensorflow { +Status GetEnvDeviceID(uint32_t &device_id); class NpuAttrs { public: // This method returns instance Pointers diff --git a/tf_adapter/util/npu_ops_identifier.cc b/tf_adapter/util/npu_ops_identifier.cc index 6072ff7abeede8ed4799cf6d8345bd22991f5ec2..76b3e1f20dd4432822e8081dd4b2b0030028d01e 100644 --- a/tf_adapter/util/npu_ops_identifier.cc +++ b/tf_adapter/util/npu_ops_identifier.cc @@ -104,14 +104,7 @@ bool NpuOpsIdentifier::IsNpuSupported(const std::string &op_name, const std::str tensorflow::GenerateReport::GetInstance()->AddUnSupportedInfo(node_name, op_name, infos); return false; } - if (is_mix_ && ops_info_[op_name][kGray].is_boolean()) { - tensorflow::GenerateReport::Details infos; - static const std::string message = "This op will not be excuted on npu in mix_compile_mode."; - infos.code = tensorflow::GenerateReport::TypeGray; - infos.message = message; - tensorflow::GenerateReport::GetInstance()->AddUnSupportedInfo(node_name, op_name, infos); - return !ops_info_[op_name][kGray]; - } + if (is_mix_ && ops_info_[op_name][kGray].is_boolean()) { return !ops_info_[op_name][kGray]; } return true; } // Determine if the node is performance-sensitive on NPU, this should diff --git a/tf_adapter/util/npu_plugin.h b/tf_adapter/util/npu_plugin.h index fe483d7a9f0e0f199c64b6877866c64d125ce140..327304ce053d711446c6d2083f5dc5b7166c579a 100644 --- a/tf_adapter/util/npu_plugin.h +++ b/tf_adapter/util/npu_plugin.h @@ -30,6 +30,7 @@ limitations under the License. #include "ge/ge_api_types.h" #include "ge_plugin.h" +#include "framework/memory/memory_api.h" #include #include @@ -40,9 +41,17 @@ const char *const OPTION_EXEC_PROFILING_MODE = ge::OPTION_EXEC_PROFILING_MODE; const char *const OPTION_EXEC_PROFILING_OPTIONS = ge::OPTION_EXEC_PROFILING_OPTIONS; const char *const OPTION_GRAPH_RUN_MODE = ge::OPTION_GRAPH_RUN_MODE; const char* const OPTION_EXEC_HCCL_FLAG = ge::OPTION_EXEC_HCCL_FLAG; +const char* const OPTION_EXEC_PROFILING_FPPONIT_OPTIONS = ge::OPTION_EXEC_PROFILING_FPPONIT_OPTIONS; +const char* const OPTION_EXEC_PROFILING_BPPONIT_OPTIONS = ge::OPTION_EXEC_PROFILING_BPPONIT_OPTIONS; void PluginInit(std::map &init_options); void PluginFinalize(); +int32_t RdmaInitAndRegister(const std::vector &var_info, size_t size); + +int32_t GetVarAddrAndSize(const std::string &var_name, uint64_t &base_addr, uint64_t &var_size); + +int32_t MallocSharedMem(const ge::TensorInfo &tensor_info, uint64_t &dev_addr, uint64_t &memory_size); + #endif // TENSORFLOW_NPU_PLUGIN_H_ \ No newline at end of file diff --git a/tf_adapter/util/session_manager.cc b/tf_adapter/util/session_manager.cc index 8faffdf598363c2568daf31655cebe7a285624a3..b8b98d99faed8a0d1b344f06ad786de9042bb2ab 100644 --- a/tf_adapter/util/session_manager.cc +++ b/tf_adapter/util/session_manager.cc @@ -67,6 +67,7 @@ void SessionManager::DestroyGeSession(const std::string &tf_session) { if (it != ge_sessions_.end()) { if (it->second != nullptr) { LOG(INFO) << "find ge session connect with tf session " << tf_session; + ge_graphs_.erase(it->second); delete it->second; it->second = nullptr; } @@ -118,6 +119,13 @@ void SessionManager::PrintGeSessionOptions(std::map &s sess_options.erase(ge::VARIABLE_MEMORY_MAX_SIZE); } + // tailing optimization + LOG(INFO) << "[GEOP] is_tailing_optimization : " << sess_options["ge.exec.isTailingOptimization"]; + + LOG(INFO) << "[GEOP] op_select_implmode : " << sess_options[ge::OP_SELECT_IMPL_MODE]; + + LOG(INFO) << "[GEOP] optypelist_for_implmode : " << 
sess_options[ge::OPTYPELIST_FOR_IMPLMODE]; + // reuse memory env const char *disable_reuse_memory = std::getenv("DISABLE_REUSE_MEMORY"); if (disable_reuse_memory == nullptr) { @@ -132,6 +140,34 @@ void SessionManager::PrintGeSessionOptions(std::map &s << ", dump_path :" << sess_options[ge::OPTION_EXEC_DUMP_PATH] << ", dump_step :" << (dump_step.empty() ? "NA" : dump_step) << ", dump_mode :" << sess_options[ge::OPTION_EXEC_DUMP_MODE] - << ", enable_dump_enable :" << sess_options[ge::OPTION_EXEC_ENABLE_DUMP_DEBUG] + << ", enable_dump_debug :" << sess_options[ge::OPTION_EXEC_ENABLE_DUMP_DEBUG] << ", dump_debug_mode :" << sess_options[ge::OPTION_EXEC_DUMP_DEBUG_MODE]; + + // dynamic input config + LOG(INFO) << "[GEOP] input_shape :" << sess_options["ge.inputShape"] + << ", dynamic_dims :" << sess_options["ge.dynamicDims"]; +} + +bool SessionManager::CacheGeGraphs(ge::Session *ge_session, ge::Graph &ge_graph) { + if (ge_session == nullptr) { + LOG(ERROR) << "ge session is null ptr."; + return false; + } + ge_graphs_[ge_session].push_back(ge_graph); + return true; +} + +bool SessionManager::GetGeGraphs(ge::Session *ge_session, std::vector &ge_graphs) { + if (ge_session == nullptr) { + LOG(ERROR) << "ge session is null ptr."; + return false; + } + auto it = ge_graphs_.find(ge_session); + if (it != ge_graphs_.end()) { + ge_graphs = it->second; + LOG(INFO) << " get ge session nontraining graphs success."; + return true; + } + LOG(ERROR) << "ge session get nontraining graphs failed."; + return false; } \ No newline at end of file diff --git a/tf_adapter/util/session_manager.h b/tf_adapter/util/session_manager.h index 6fbdd18668c8680ea9d3f6f0631a083d08481610..9fccd6650e11819c287a36389be034e6912141ba 100644 --- a/tf_adapter/util/session_manager.h +++ b/tf_adapter/util/session_manager.h @@ -52,6 +52,12 @@ class SessionManager { // Whether a ge session exist. bool IsGeSessionExist(); + // get ge session nontraining graphs + bool GetGeGraphs(ge::Session *ge_session, std::vector &ge_graphs); + + // cache ge session nontraining graphs + bool CacheGeGraphs(ge::Session *ge_session, ge::Graph &ge_graph); + private: // Create a ge session to run the compute graph divided by tf session. bool CreateGeSession(const std::string &session, ge::Session *&ge_session, @@ -60,5 +66,7 @@ class SessionManager { void PrintGeSessionOptions(std::map &sess_options); // Mapping relationship between tf session and ge session. std::unordered_map ge_sessions_; + // cache ge nontraining graphs + std::unordered_map> ge_graphs_; }; #endif
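The hccl_ops.py hunk earlier in this patch adds Python wrappers for the new remote read/write ops. The following is a minimal usage sketch, not part of the patch: it assumes the npu_bridge package is installed on an NPU cluster with HCCL initialized, and that the generated hcom_remote_read/hcom_remote_write ops behave as the wrappers' doc comments describe. All remote IDs, addresses, and lengths below are placeholder values for illustration only.

# Hypothetical usage of the remote_read/remote_write wrappers added in
# tf_adapter/python/npu_bridge/hccl/hccl_ops.py (TF 1.15 style graph/session).
import tensorflow as tf
from npu_bridge.hccl import hccl_ops

# Each row describes one remote buffer: [remoteId, remoteAddr, dataLength], all uint64.
remote_info = tf.constant([[0, 0x1000, 64],
                           [1, 0x2000, 64]], dtype=tf.uint64)

# Pull the remote buffers into local memory; per the wrapper's doc comment the
# result shape is (index_num, dataLength / sizeof(data_type)).
pulled = hccl_ops.remote_read(remote_info, tf.float32)

# Push a locally produced tensor back to the same remote buffers
# (2 rows x 16 float32 = 64 bytes per row, matching dataLength above).
local_data = tf.zeros([2, 16], dtype=tf.float32)
write_op = hccl_ops.remote_write(remote_info, local_data, tf.float32)

with tf.Session() as sess:
    sess.run([pulled, write_op])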