From f8f35673af34c25564755387d41da6934c35bc69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E5=BC=BA?= Date: Wed, 15 Mar 2023 09:39:41 +0000 Subject: [PATCH 01/22] =?UTF-8?q?!2106=20OBP=20=E5=95=86=E5=88=86=E5=88=A0?= =?UTF-8?q?=E9=99=A4embedding=20Merge=20pull=20request=20!2106=20from=20?= =?UTF-8?q?=E5=88=98=E5=BC=BA/lq=5F315?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tf_adapter/interface_spec/api_npu_config.pyh | 2 +- tf_adapter/kernels/geop_npu.cc | 20 +- tf_adapter/kernels/geop_npu.h | 4 - .../optimizers/om_partition_subgraphs_pass.cc | 19 - .../python/npu_bridge/embedding/__init__.py | 23 -- .../embedding/embedding_optimizer.py | 158 -------- .../embedding/embedding_resource.py | 39 -- .../npu_bridge/embedding/embedding_service.py | 363 ------------------ .../embedding/embedding_table_map_policy.py | 119 ------ .../python/npu_bridge/embedding/tf_path.py | 64 --- .../npu_bridge/estimator/npu/npu_config.py | 3 - .../npu_bridge/estimator/npu/npu_estimator.py | 2 - .../python/npu_bridge/npu_cpu/npu_cpu_ops.py | 99 ----- tf_adapter/tests/st/kernels/pbtxt/geop.pbtxt | 18 - .../testcase/get_attr_optimize_pass_test.cc | 3 - tf_adapter/tests/ut/kernels/pbtxt/geop.pbtxt | 24 -- .../testcase/get_attr_optimize_pass_test.cc | 3 - tf_adapter/util/npu_attrs.cc | 15 - 18 files changed, 2 insertions(+), 976 deletions(-) delete mode 100644 tf_adapter/python/npu_bridge/embedding/__init__.py delete mode 100644 tf_adapter/python/npu_bridge/embedding/embedding_optimizer.py delete mode 100644 tf_adapter/python/npu_bridge/embedding/embedding_resource.py delete mode 100644 tf_adapter/python/npu_bridge/embedding/embedding_service.py delete mode 100644 tf_adapter/python/npu_bridge/embedding/embedding_table_map_policy.py delete mode 100644 tf_adapter/python/npu_bridge/embedding/tf_path.py diff --git a/tf_adapter/interface_spec/api_npu_config.pyh b/tf_adapter/interface_spec/api_npu_config.pyh index c89bc3c6b..718db2ac6 100644 --- a/tf_adapter/interface_spec/api_npu_config.pyh +++ b/tf_adapter/interface_spec/api_npu_config.pyh @@ -19,7 +19,7 @@ class NPURunConfig(run_config_lib.RunConfig): soc_config=None, hccl_timeout=None, op_wait_timeout=None, op_execute_timeout=None, HCCL_algorithm=None, customize_dtypes=None, op_debug_config=None, memory_config=None, experimental_config=None, topo_sorting_mode=None, aoe_config_file=None, insert_op_file=None, stream_sync_timeout=-1, - event_sync_timeout=-1, external_weight=False, es_cluster_config=None, deterministic=0, + event_sync_timeout=-1, external_weight=False, deterministic=0, frozen_variable=False, variable_placement="Device"): class ProfilingConfig(): diff --git a/tf_adapter/kernels/geop_npu.cc b/tf_adapter/kernels/geop_npu.cc index e2fdb4301..fc8538f00 100644 --- a/tf_adapter/kernels/geop_npu.cc +++ b/tf_adapter/kernels/geop_npu.cc @@ -326,10 +326,6 @@ void GeOp::Initialize(OpKernelConstruction *ctx) { } ctx->GetAttr("_recompute_mode", &recompute_mode_); - ctx->GetAttr("_deploy_inject_config", &deploy_inject_config_); - ctx->GetAttr("_execute_times", &execute_times_); - ctx->GetAttr("_max_num", &max_num_); - ctx->GetAttr("_embedding_dim", &embedding_dim_); ctx->GetAttr("_dynamic_input", &dynamic_input_); if (!dynamic_input_.empty() && dynamic_input_ == "1") { jit_compile_ = true; @@ -349,9 +345,7 @@ void GeOp::Initialize(OpKernelConstruction *ctx) { << ", getnext_inputs_shape_range: " << getnext_inputs_shape_range_ << ", data_inputs_shape_range: " << data_inputs_shape_range_ << ", is_train_graph: " << 
is_train_graph_ << ", is_dynamic_getnext: " << is_dynamic_getnext_ << ", placeholder_index: " << placeholder_index_ - << ", is_var_init_graph: " << is_var_init_graph_ << ", deploy_inject_config: " << deploy_inject_config_ - << ", execute_times: " << execute_times_ << ", max_num: " << max_num_ - << ", embedding_dim: " << embedding_dim_; + << ", is_var_init_graph: " << is_var_init_graph_; // global environment Initialize, invoke once for each process std::string sess_config = ""; @@ -865,18 +859,6 @@ void GeOp::ComputeAsync(OpKernelContext *ctx, DoneCallback done) { if (!recompute_mode_.empty()) { graph_options_["ge.recompute"] = recompute_mode_; } - if (!deploy_inject_config_.empty()) { - graph_options_["ge.exec.clusterSpec"] = deploy_inject_config_; - } - if (!execute_times_.empty()) { - graph_options_["ge.execute_times"] = execute_times_; - } - if (!max_num_.empty()) { - graph_options_["ge.max_num"] = max_num_; - } - if (!embedding_dim_.empty()) { - graph_options_["ge.embedding_dim"] = embedding_dim_; - } SetDynamicInput(); graph_options_["ge.exec.isVarInitGraph"] = is_var_init_graph_; graph_options_["ge.jit_compile"] = jit_compile_ ? "1" : "0"; diff --git a/tf_adapter/kernels/geop_npu.h b/tf_adapter/kernels/geop_npu.h index 93a468146..f2b970c1b 100644 --- a/tf_adapter/kernels/geop_npu.h +++ b/tf_adapter/kernels/geop_npu.h @@ -190,10 +190,6 @@ private: std::atomic_flag tuned_flag_; std::vector> remove_index_; std::string is_var_init_graph_; - std::string deploy_inject_config_; - std::string execute_times_; - std::string max_num_; - std::string embedding_dim_; std::string recompute_mode_; std::vector> input_shapes_vec_; bool jit_compile_; diff --git a/tf_adapter/optimizers/om_partition_subgraphs_pass.cc b/tf_adapter/optimizers/om_partition_subgraphs_pass.cc index 13588e8c9..bb54e52d8 100644 --- a/tf_adapter/optimizers/om_partition_subgraphs_pass.cc +++ b/tf_adapter/optimizers/om_partition_subgraphs_pass.cc @@ -2000,10 +2000,6 @@ void OMPartitionSubgraphsPass::GetGraphConfig(const Node &node, bool enable_dp, const std::string kDynamicInputsShapeRange = "_graph_dynamic_inputs_shape_range"; const std::string kIsTrainGraph = "_is_train_graph"; const std::string kRecomputeMode = "_recompute_mode"; - const std::string kDeployInjectConfig = "_deploy_inject_config"; - const std::string kExecuteTimes = "_execute_times"; - const std::string kMaxNum = "_max_num"; - const std::string kEmbeddingDim = "_embedding_dim"; if (node_attrs.find(kDynamicInput) != node_attrs.end()) { bool dynamic_input = node_attrs.at(kDynamicInput).b(); graph_options["dynamic_input"] = std::to_string(static_cast(dynamic_input)); @@ -2024,21 +2020,6 @@ void OMPartitionSubgraphsPass::GetGraphConfig(const Node &node, bool enable_dp, std::string recompute_mode = node_attrs.at(kRecomputeMode).s(); graph_options["recompute_mode"] = recompute_mode; } - if (node_attrs.find(kDeployInjectConfig) != node_attrs.end()) { - graph_options["deploy_inject_config"] = node_attrs.at(kDeployInjectConfig).s(); - } - if (node_attrs.find(kExecuteTimes) != node_attrs.end()) { - const auto execute_times = node_attrs.at(kExecuteTimes).i(); - graph_options["execute_times"] = std::to_string(static_cast(execute_times)); - } - if (node_attrs.find(kMaxNum) != node_attrs.end()) { - const auto max_num = node_attrs.at(kMaxNum).i(); - graph_options["max_num"] = std::to_string(static_cast(max_num)); - } - if (node_attrs.find(kEmbeddingDim) != node_attrs.end()) { - const auto embedding_dim = node_attrs.at(kEmbeddingDim).i(); - graph_options["embedding_dim"] = 
std::to_string(static_cast(embedding_dim)); - } } Status OMPartitionSubgraphsPass::ProcessGetNext(Node &node, const std::string enable_dp, diff --git a/tf_adapter/python/npu_bridge/embedding/__init__.py b/tf_adapter/python/npu_bridge/embedding/__init__.py deleted file mode 100644 index f4cdb646b..000000000 --- a/tf_adapter/python/npu_bridge/embedding/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - - -from npu_bridge.embedding.embedding_optimizer import AdamOptimizer as EmbeddingAdamOptimizer -from npu_bridge.embedding.embedding_optimizer import AdagradOptimizer as EmbeddingAdagradOptimizer -from npu_bridge.embedding.embedding_service import ESWorker as EmbeddingService -from npu_bridge.embedding.tf_path import path_on_tf -path_on_tf() \ No newline at end of file diff --git a/tf_adapter/python/npu_bridge/embedding/embedding_optimizer.py b/tf_adapter/python/npu_bridge/embedding/embedding_optimizer.py deleted file mode 100644 index 0d596fed7..000000000 --- a/tf_adapter/python/npu_bridge/embedding/embedding_optimizer.py +++ /dev/null @@ -1,158 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -from tensorflow.python.framework import ops -from tensorflow.python.eager import context -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import control_flow_ops -from tensorflow.python.training import adam -from tensorflow.python.training import adagrad -from tensorflow.python.training import training_ops -from tensorflow.python.training import training_util -from npu_bridge.embedding.embedding_resource import NpuEmbeddingResource -from npu_bridge.npu_cpu.npu_cpu_ops import gen_npu_cpu_ops - -_GLOBAL_STEP_VALUE = 1 - - -class AdamOptimizer(adam.AdamOptimizer): - @property - def embedding_dims(self): - return self._embedding_dims - - @embedding_dims.setter - def embedding_dims(self, val): - self._embedding_dims = val - - def _prepare(self): - lr = self._call_if_callable(self._lr) - epsilon = self._call_if_callable(self._epsilon) - self._beta1_t_list = [] - self._beta2_t_list = [] - self._lr_t = ops.convert_to_tensor(lr, name="learning_rate") - self._epsilon_t = ops.convert_to_tensor(epsilon, name="epsilon") - - def _resource_apply_sparse(self, grad, var, indices): - if isinstance(var, NpuEmbeddingResource): - beta1 = self._call_if_callable(self._beta1) - beta2 = self._call_if_callable(self._beta2) - self._beta1_t = ops.convert_to_tensor(beta1, name="beta1" + str(self.table_idx)) - self._beta2_t = ops.convert_to_tensor(beta2, name="beta2" + str(self.table_idx)) - self._beta1_t_list.append(self._beta1_t) - self._beta2_t_list.append(self._beta2_t) - beta1_power, beta2_power = self._get_beta_accumulators() - self.table_idx += 1 - return gen_npu_cpu_ops.embedding_apply_adam(var.handle, beta1_power, beta2_power, - math_ops.cast(self._lr_t, grad.dtype), - math_ops.cast(self._beta1_t, grad.dtype), - math_ops.cast(self._beta2_t, grad.dtype), - math_ops.cast(self._epsilon_t, grad.dtype), - grad, - indices, - ops.convert_to_tensor(_GLOBAL_STEP_VALUE), - self._embedding_dims) - else: - return self._apply_sparse_shared(grad, var, indices, self._resource_scatter_add) - - def _create_slots(self, var_list): - self.table_num = 0 - self.table_idx = 0 - first_var = min(var_list, key=lambda x: x.name) - for idx in range(len(var_list)): - self._create_non_slot_variable( - initial_value=self._beta1, name="beta1_power" + str(idx), colocate_with=first_var) - self._create_non_slot_variable( - initial_value=self._beta2, name="beta2_power" + str(idx), colocate_with=first_var) - self.table_num += 1 - - for v in var_list: - if not isinstance(v, NpuEmbeddingResource): - self._zeros_slot(v, "m", self._name) - self._zeros_slot(v, "v", self._name) - - def _get_beta_accumulators(self): - with ops.init_scope(): - if context.executing_eagerly(): - graph = None - else: - graph = ops.get_default_graph() - return (self._get_non_slot_variable("beta1_power" + str(self.table_idx), graph=graph), - self._get_non_slot_variable("beta2_power" + str(self.table_idx), graph=graph)) - - def _finish(self, update_ops, name_scope): - # Update the power accumulators. 
- self.table_num = 0 - self.table_idx = 0 - finish_output = [] - with ops.control_dependencies(update_ops): - beta1_power_list = [] - beta2_power_list = [] - for k in update_ops: - beta1_power, beta2_power = self._get_beta_accumulators() - beta1_power_list.append(beta1_power) - beta2_power_list.append(beta2_power) - self.table_idx += 1 - for idx in range(len(update_ops)): - beta1_power = beta1_power_list[idx] - beta2_power = beta2_power_list[idx] - with ops.colocate_with(beta1_power): - update_beta1 = beta1_power.assign( - beta1_power * self._beta1_t_list[idx], use_locking=self._use_locking) - update_beta2 = beta2_power.assign( - beta2_power * self._beta2_t_list[idx], use_locking=self._use_locking) - new_update_op = [] - new_update_op.append(update_ops[idx]) - finish_output.append(control_flow_ops.group( - *new_update_op + [update_beta1, update_beta2], name=name_scope + str(idx))) - return finish_output - - -class AdagradOptimizer(adagrad.AdagradOptimizer): - @property - def embedding_dims(self): - return self._embedding_dims - - @embedding_dims.setter - def embedding_dims(self, val): - self._embedding_dims = val - - def _resource_apply_sparse(self, grad, var, indices): - if isinstance(var, NpuEmbeddingResource): - return gen_npu_cpu_ops.embedding_apply_ada_grad(var.handle, - math_ops.cast(self._learning_rate_tensor, grad.dtype), - grad, - indices, - ops.convert_to_tensor(_GLOBAL_STEP_VALUE), - self._embedding_dims) - else: - return self.training_ops.resource_sparse_apply_adagrad(var.handle, grad.handle, - math_ops.cast(self._learning_rate_tensor, - grad.dtype), - grad, indices, - use_locking=self._use_locking) - - def _create_slots(self, var_list): - for v in var_list: - if not isinstance(v, NpuEmbeddingResource): - dtype = v.dtype.base_dtype - if v.get_shape().is_fully_defined(): - init = init_ops.constant_initializer(self._initial_accumulator_value, - dtype=dtype) - else: - init = self._init_constant_op(v, dtype) - self._get_or_make_slot_with_initializer(v, init, v.get_shape(), dtype, - "accumulator", self._name) diff --git a/tf_adapter/python/npu_bridge/embedding/embedding_resource.py b/tf_adapter/python/npu_bridge/embedding/embedding_resource.py deleted file mode 100644 index fdfc288bc..000000000 --- a/tf_adapter/python/npu_bridge/embedding/embedding_resource.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -from tensorflow.python.framework import ops -from npu_bridge.npu_cpu.npu_cpu_ops import gen_npu_cpu_ops - - -class NpuEmbeddingResource: - - def __init__(self, table_id): - self.name = table_id - self._tensor = gen_npu_cpu_ops.table_to_resource(ops.convert_to_tensor(table_id)) - - @property - def handle(self): - return self._tensor - - @property - def graph(self): - return self._tensor.graph - - @property - def op(self): - return self._tensor.op - diff --git a/tf_adapter/python/npu_bridge/embedding/embedding_service.py b/tf_adapter/python/npu_bridge/embedding/embedding_service.py deleted file mode 100644 index a661e11c9..000000000 --- a/tf_adapter/python/npu_bridge/embedding/embedding_service.py +++ /dev/null @@ -1,363 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -import json -import contextlib -import os -import math -import tensorflow as tf -from tensorflow.python.framework import ops -from tensorflow.python.framework import random_seed -from tensorflow.core.framework import attr_value_pb2 -from npu_bridge.npu_cpu.npu_cpu_ops import gen_npu_cpu_ops -from npu_bridge.embedding.embedding_resource import NpuEmbeddingResource -from npu_bridge.embedding import embedding_optimizer -from npu_bridge.embedding.embedding_table_map_policy import NoneTableMapPolicy, AutoMergeTableMapPolicy - -_INT32_MAX_VALUE = 2147483647 - - -@contextlib.contextmanager -def specified_ps_engine_scope(): - """ - Enable the non npu compilation of operators within the scope. - """ - attrs = { - "_process_node_engine_id": attr_value_pb2.AttrValue(s=tf.compat.as_bytes("PS")) - } - with ops.get_default_graph()._attr_scope(attrs): - yield - - -class ESWorker: - """ Embedding service class. 
""" - - def __init__(self, config_from_param=None): - env_dist = os.environ - cluster_config_from_env = env_dist.get("ESCLUSTER_CONFIG_PATH") - if cluster_config_from_env is None: - if config_from_param is None: - raise ValueError("EsClusterConfig and env variable are both null.") - es_cluster_config = config_from_param - else: - es_cluster_config = cluster_config_from_env - with open(es_cluster_config, encoding='utf-8') as a: - es_cluster_config_json = json.load(a) - self._es_cluster_conf = json.dumps(es_cluster_config_json) - self._ps_num = int(es_cluster_config_json["psNum"]) - self._embedding_dim = -1 - self._max_num = -1 - self._ps_ids = [] - self._ps_ids_list = es_cluster_config_json["psCluster"] - self._init_embedding_hash_maps = {} - self._init_partition_maps = {} - self._table_to_embedding_dim = {} - for each_ps in self._ps_ids_list: - self._ps_ids.append(each_ps["id"]) - self._train_mode = True - self._train_level = False - self._optimizer = None - self.slot_vars_num = None - self._initializer = None - self._init_flag = False - self._table_has_init = [] - self.user_defined_table_infos = [] - self.table_map_policy = None - self.table_create_infos = [] - self.total_variable_table = [] - self.total_embedding_count = 0 - config = tf.ConfigProto() - custom_op = config.graph_options.rewrite_options.custom_optimizers.add() - custom_op.name = "NpuOptimizer" - custom_op.parameter_map["es_cluster_config"].s = tf.compat.as_bytes(self._es_cluster_conf) - self.es_all_config = config - - # 提供embedding init功能 - # @param vocabulary_size int 类型 - # @param file_path string 类型 - # @param file_name string 类型 - # @param table_id int32 类型 - # @param max_batch_size int32 类型 - # @param optimizer 类型 - # @param initializer string 类型 - # @param embedding_dim int32 类型 - # @param only_var bool 类型 - # @param mode string 类型 - # @param partition_num int 类型 - def embedding_init(self, vocabulary_size, file_path, file_name, table_id, max_batch_size, optimizer=None, - initializer=None, embedding_dim=-1, only_var=False, mode="bin", partition_num=65537): - """ Operator for init embedding table. 
""" - if vocabulary_size is None or table_id is None or max_batch_size is None or embedding_dim is None: - raise ValueError("vocabulary_size or table_id or max_batch_size or embedding_dim is None.") - if (not isinstance(vocabulary_size, int)) or (not isinstance(table_id, int)) or \ - (not isinstance(max_batch_size, int)) or (not isinstance(embedding_dim, int)): - raise ValueError("vocabulary_size, table_id, max_batch_size and embedding_dim must be int.") - if vocabulary_size < 0 or table_id < 0: - raise ValueError("vocabulary_size and table_id can not be smaller than zero.") - if vocabulary_size >= _INT32_MAX_VALUE or table_id >= _INT32_MAX_VALUE: - raise ValueError("vocabulary_size or table_id exceed int32 max value.") - if embedding_dim <= 0 or partition_num <= 0 or max_batch_size <= 0: - raise ValueError("embedding_dim, partition_num and max_batch_size must be greater than zero.") - if table_id in self._table_has_init: - raise ValueError("this table has already initialized.") - self._embedding_dim = embedding_dim - self._max_num = max_batch_size - self._table_to_embedding_dim[table_id] = embedding_dim - self._initializer = initializer - self._table_has_init.append(table_id) - bucket_size = math.ceil(vocabulary_size / self._ps_num) - if optimizer is None: - if file_path is None or file_name is None or (not tf.gfile.Exists(os.path.join(file_path, file_name))): - raise ValueError("embedding table file not exist.") - self._train_mode = False - self.slot_vars_num = 0 - else: - if (not isinstance(optimizer, embedding_optimizer.AdamOptimizer) and - not isinstance(optimizer, embedding_optimizer.AdagradOptimizer)): - raise ValueError( - "optimizer should be embedding_optimizer.AdamOptimizer or embedding_optimizer.AdagradOptimizer") - if (initializer is not None) and (initializer is not 'random_uniform') and \ - (initializer is not 'truncated_normal'): - raise ValueError("initializer must be random_uniform or truncated_normal.") - self._optimizer = optimizer - self._optimizer._embedding_dims = embedding_dim - # adam include m and v, 2 slots; adagrad include accumulator, 1 slot - self.slot_vars_num = 2 if isinstance(self._optimizer, embedding_optimizer.AdamOptimizer) else 1 - if (file_path is None) or (file_name is None) or (not tf.gfile.Exists(os.path.join(file_path, file_name))): - if initializer is None: - raise ValueError("In new embedding training, initializer can not be None.") - self._train_level = True - with specified_ps_engine_scope(): - self._init_partition_maps[table_id] = \ - gen_npu_cpu_ops.init_partition_map(ps_num=ops.convert_to_tensor(self._ps_num), - ps_ids=ops.convert_to_tensor(self._ps_ids), - partition_num=partition_num) - self._init_partition_maps.get(table_id)._set_attr("_execute_times", attr_value_pb2.AttrValue(i=1)) - self._init_partition_maps.get(table_id)._set_attr("_embedding_dim", - attr_value_pb2.AttrValue(i=self._embedding_dim)) - self._init_partition_maps.get(table_id)._set_attr("_max_num", attr_value_pb2.AttrValue(i=self._max_num)) - self._init_partition_maps.get(table_id)._set_attr("_deploy_inject_config", - attr_value_pb2.AttrValue( - s=tf.compat.as_bytes(self._es_cluster_conf))) - return self._init_hashmap_and_table_import(bucket_size, file_path, file_name, table_id, - initializer, embedding_dim, only_var, mode) - - # 提供embedding lookup功能 - # @param table_id int32 类型 - # @param input_ids int64 类型 - # @return values float32 类型 - def embedding_lookup(self, table_id, input_ids): - """ Operator for look up in embedding table. 
""" - if (table_id is None) or (input_ids is None): - raise ValueError("table_id or input_ids must be specified.") - if not isinstance(table_id, int): - raise ValueError("type of table_id must be int.") - if input_ids.dtype != tf.int64: - raise ValueError("dtype of input_ids must be tf.int64.") - if table_id < 0: - raise ValueError("table_id can not be smaller than zero.") - if not self._init_flag: - raise ValueError("embedding must init first!") - if table_id not in self._table_has_init: - raise ValueError("this table has not yet initialized.") - if self._train_mode: - seed1, seed2 = random_seed.get_seed(None) - result = gen_npu_cpu_ops.embedding_table_find_and_init(table_id=ops.convert_to_tensor(table_id), - keys=input_ids, - embedding_dim= - self._table_to_embedding_dim.get(table_id), - random_alg=self._initializer, - seed=seed1, seed2=seed2, - value_total_len= - self._table_to_embedding_dim.get(table_id) * - (self.slot_vars_num + 1) - ) - else: - result = gen_npu_cpu_ops.embedding_table_find(table_id=ops.convert_to_tensor(table_id), - keys=input_ids, - embedding_dim=self._table_to_embedding_dim.get(table_id)) - result.op._set_attr("_embedding_dim", attr_value_pb2.AttrValue(i=self._embedding_dim)) - result.op._set_attr("_max_num", attr_value_pb2.AttrValue(i=self._max_num)) - result.op._set_attr("_deploy_inject_config", - attr_value_pb2.AttrValue(s=tf.compat.as_bytes(self._es_cluster_conf))) - return result - - # 提供embedding update功能 - # @param loss 类型 - # @param params float32 类型 - # @param table_ids int32 类型 - # @param input_ids_list int64 类型 - def embedding_update(self, loss, params, table_ids, input_ids_list): - """ Operator for update in embedding table. """ - if (loss is None) or (params is None) or (table_ids is None) or (input_ids_list is None): - raise ValueError("loss or params or table_ids or input_ids_list is None.") - if (isinstance(loss, str)) or (isinstance(params, str)) or isinstance(table_ids, str) or \ - isinstance(input_ids_list, str): - raise ValueError("loss, params, table_ids and input_ids_list can not be str.") - if not self._init_flag: - raise ValueError("embedding must init first!") - if (not isinstance(params, (list, tuple)) and not isinstance(table_ids, (list, tuple)) - and not isinstance(input_ids_list, (list, tuple))): - params = [params] - table_ids = [table_ids] - input_ids_list = [input_ids_list] - for table_id in table_ids: - if table_id not in self._table_has_init: - raise ValueError("this table has not yet initialized.") - if (len(params) != len(table_ids)) or (len(params) != len(input_ids_list)) \ - or (len(table_ids) != len(input_ids_list)): - raise ValueError("The length of params, table_ids, input_ids_list should be equal.") - embedding_grads = tf.gradients(loss, params) - params_grads = [] - for i in range(len(embedding_grads)): - params_grads.append(tf.IndexedSlices(embedding_grads[i], input_ids_list[i], dense_shape=params[i].shape)) - with specified_ps_engine_scope(): - var_refs = [NpuEmbeddingResource(table_id) for table_id in table_ids] - update_op = self._optimizer.apply_gradients(list(zip(params_grads, var_refs))) - return update_op - - # 提供训练好的embedding values save功能 - # @param file_path string 类型 - # @param file_name string 类型 - # @param table_id int32 类型 - # @param mode string 类型 - def embedding_save(self, file_path, file_name, table_id, mode="bin"): - """ Operator for save values in embedding table. 
""" - if file_path is None or file_name is None or table_id is None: - raise ValueError("table_id, embedding table file_name and file_path can not be None.") - if table_id not in self._table_has_init: - raise ValueError("this table has not yet initialized.") - if not os.path.exists(file_path): - os.mkdir(file_path) - with specified_ps_engine_scope(): - embedding_dim = self._table_to_embedding_dim.get(table_id) - return gen_npu_cpu_ops.embedding_table_export(file_path, file_name, ops.convert_to_tensor(-1), table_id, - embedding_dim, embedding_dim, True, mode) - - # 提供训练好的embedding values + 调优参数 save功能 - # @param file_path string 类型 - # @param file_name string 类型 - # @param table_id int32 类型 - # @param mode string 类型 - def embedding_ckpt_save(self, file_path, file_name, table_id, mode="bin"): - """ Operator for save values and optimizer params in embedding table. """ - if file_path is None or file_name is None or table_id is None: - raise ValueError("table_id, embedding table file_name and file_path can not be None.") - if table_id not in self._table_has_init: - raise ValueError("this table has not yet initialized.") - if not os.path.exists(file_path): - os.mkdir(file_path) - with specified_ps_engine_scope(): - embedding_dim = self._table_to_embedding_dim.get(table_id) - return gen_npu_cpu_ops.embedding_table_export(file_path, file_name, ops.convert_to_tensor(-1), table_id, - embedding_dim, embedding_dim * (self.slot_vars_num + 1), - False, mode) - - def data_parallel_embedding(self, max_vocabulary_size, embedding_dim, multihot_lens, allow_merge=True): - if not isinstance(multihot_lens, list): - raise ValueError("multihot_lens must be list.") - new_table_info = dict( - max_vocabulary_size=max_vocabulary_size, - embedding_dim=embedding_dim, - multihot_lens=multihot_lens, - allow_merge=allow_merge - ) - self.user_defined_table_infos.append(new_table_info) - - def init_table(self, table_map_policy=AutoMergeTableMapPolicy()): - self.table_map_policy = table_map_policy - self.table_create_infos = self.table_map_policy.map_table_infos(self.user_defined_table_infos) - for table_info_ in self.table_create_infos: - self.total_variable_table.append(tf.Variable( - tf.random_normal([table_info_['max_vocabulary_size'], table_info_['embedding_dim']], mean=0.0, - stddev=1.0, dtype=tf.float32, seed=1234) - )) - self.total_embedding_count += 1 - - def embeddings_look_up(self, tf_indices): - if self.total_embedding_count != len(self.table_create_infos): - raise ValueError("Must init_table() first!") - (in_slot_size_group, slot_to_table, table_to_input_group, \ - table_to_slot, table_to_output_slots) = \ - (self.table_map_policy.in_slot_size_group, self.table_map_policy.slot_to_table, \ - self.table_map_policy.table_to_input_groups, self.table_map_policy.table_to_slot, \ - self.table_map_policy.table_to_output_slots) - - indices_split = tf.split(tf_indices, in_slot_size_group, axis=1) - for tid in range(self.total_embedding_count): - table_to_input_group[tid] = [] - for sid, indices in enumerate(indices_split): - tid = slot_to_table[sid] - table_to_input_group[tid].append(indices) - - output_slots = [None for _ in in_slot_size_group] - for tid, table_input_group in enumerate(table_to_input_group): - table_input_after_mapping = \ - gen_npu_cpu_ops.embedding_feature_mapping(feature_id=tf.concat(table_input_group, axis=1)) - table_to_input_group[tid] = table_input_after_mapping - table_embedding = tf.nn.embedding_lookup(self.total_variable_table[tid], table_input_after_mapping) - out_embedding_splited = 
tf.split(table_embedding, table_to_output_slots[tid], axis=1) - for out_emb, sid in zip(out_embedding_splited, table_to_slot[tid]): - output_slots[sid] = out_emb - return output_slots - - def _init_hashmap_and_table_import(self, bucket_size, file_path, file_name, table_id, - initializer, embedding_dim, only_var, mode): - with tf.control_dependencies([self._init_partition_maps.get(table_id)]): - if self._train_mode: - if self._train_level: - seed1, seed2 = random_seed.get_seed(None) - self._init_embedding_hash_maps[table_id] = \ - gen_npu_cpu_ops.init_embedding_hashmap(table_id=ops.convert_to_tensor(table_id), - bucket_size=bucket_size, - value_total_len=embedding_dim * (self.slot_vars_num + 1), - embedding_dim=embedding_dim, - random_alg=initializer, seed=seed1, seed2=seed2) - else: - self._init_embedding_hash_maps[table_id] = \ - gen_npu_cpu_ops.init_embedding_hashmap(table_id=ops.convert_to_tensor(table_id), - bucket_size=bucket_size, - value_total_len=embedding_dim * (self.slot_vars_num + 1), - embedding_dim=embedding_dim, - random_alg=None, seed=None, seed2=None) - else: - self._init_embedding_hash_maps[table_id] = \ - gen_npu_cpu_ops.init_embedding_hashmap(table_id=ops.convert_to_tensor(table_id), - bucket_size=bucket_size, - value_total_len=embedding_dim, - embedding_dim=embedding_dim, - random_alg=None, seed=None, seed2=None) - self._init_flag = True - return self._init_or_restore(file_path, file_name, table_id, embedding_dim, only_var, mode) - - def _init_or_restore(self, file_path, file_name, table_id, embedding_dim, only_var, mode): - if self._train_mode and self._train_level: - return tf.group( - [tf.initializers.variables(self._optimizer.variables()), self._init_embedding_hash_maps.get(table_id)]) - # restore embedding table - with tf.control_dependencies([self._init_embedding_hash_maps.get(table_id)]): - embedding_table_import = gen_npu_cpu_ops.embedding_table_import( - file_path=ops.convert_to_tensor(file_path), - file_name=ops.convert_to_tensor(file_name), - # ps_id will be changed in executor, so can not be optimized in graph - ps_id=ops.convert_to_tensor(-1), - table_id=ops.convert_to_tensor(table_id), - embedding_dim=embedding_dim, - value_total_len=embedding_dim * (self.slot_vars_num + 1), - only_var_flag=only_var, - file_type=mode) - return tf.group([embedding_table_import]) diff --git a/tf_adapter/python/npu_bridge/embedding/embedding_table_map_policy.py b/tf_adapter/python/npu_bridge/embedding/embedding_table_map_policy.py deleted file mode 100644 index c9cf6be1c..000000000 --- a/tf_adapter/python/npu_bridge/embedding/embedding_table_map_policy.py +++ /dev/null @@ -1,119 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -from functools import reduce - - -class BaseTableMapPolicy(): - def __init__(self, assign_groups=None): - self.table_create_infos = [] - if assign_groups is None: - self.assign_groups = [] - else: - self.assign_groups = assign_groups - self.in_slot_size_group = [] - self.slot_to_table = [] - self.table_to_output_slots = [] - self.table_to_input_groups = [] - self.table_to_slot = [] - - @staticmethod - def _is_equal_table_info(info1, info2): - if info1['embedding_dim'] != info2['embedding_dim']: # dim of table is the same or not - print('embedding dim different!, value is %d and %d' % (info1['embedding_dim'], info2['embedding_dim'])) - return False - return True - - def map_table_infos(self, user_defined_table_infos): - raise NotImplementedError() - - def _register_new_table_info(self, new_table_info): - self.table_create_infos.append(new_table_info) - self.table_to_output_slots.append([]) - self.table_to_input_groups.append([]) - self.table_to_slot.append([]) - - def _merge_new_table_info(self, new_table_info, assign_tabld_id): - main_table_info = self.table_create_infos[assign_tabld_id] - main_table_info['multihot_lens'] += new_table_info['multihot_lens'] - main_table_info['max_vocabulary_size'] += new_table_info['max_vocabulary_size'] - - def _register_table_info(self, new_table_info, assign_tid=-1): - multihot_lens = new_table_info['multihot_lens'] - in_slot_size = sum(multihot_lens) - out_slot_size = len(multihot_lens) - - tid = assign_tid - if tid == -1: - tid = len(self.table_create_infos) - self._register_new_table_info(new_table_info) - else: - self._merge_new_table_info(new_table_info, tid) - - self.table_to_slot[tid].append(len(self.in_slot_size_group)) - self.table_to_output_slots[tid].append(in_slot_size) - self.in_slot_size_group.append(in_slot_size) - self.slot_to_table.append(tid) - - def _map_table_infos(self, user_defined_table_infos, assign_groups): - self.table_create_infos = [] - assign_groups_flat = reduce(lambda a, b: a+b, assign_groups, []) - sid_to_gid = reduce(lambda a, b: {**a, **b}, - [{sid: gid for sid in group} - for gid, group in enumerate(assign_groups)], {}) - gid_to_tid = dict() - for sid, table_info in enumerate(user_defined_table_infos): - if sid in assign_groups_flat: - gid = sid_to_gid.get(sid) - if gid in gid_to_tid: - self._register_table_info(table_info, assign_tid=gid_to_tid.get(gid)) - else: - tid = len(self.table_create_infos) - self._register_table_info(table_info, assign_tid=-1) - gid_to_tid[gid] = tid - else: - self._register_table_info(table_info, assign_tid=-1) - return self.table_create_infos - - -# no slot merge -class NoneTableMapPolicy(BaseTableMapPolicy): - def map_table_infos(self, user_defined_table_infos): - return self._map_table_infos(user_defined_table_infos, self.assign_groups) - - -# merge slot by user's assign_groups -class AutoMergeTableMapPolicy(BaseTableMapPolicy): - def map_table_infos(self, user_defined_table_infos): - assign_groups_flat = reduce(lambda a, b: a+b, self.assign_groups, []) - new_assign_groups = [] - for sid, table_info in enumerate(user_defined_table_infos): - if sid in assign_groups_flat: - continue - gid = -1 - if user_defined_table_infos[sid]['allow_merge']: - for ngid, group in enumerate(new_assign_groups): - if self._is_equal_table_info(user_defined_table_infos[group[0]], table_info) \ - and user_defined_table_infos[group[0]]['allow_merge']: - gid = ngid - break - if gid == -1: - gid = len(new_assign_groups) - 
new_assign_groups.append([]) - new_assign_groups[gid].append(sid) - new_assign_groups = self.assign_groups + new_assign_groups - return self._map_table_infos(user_defined_table_infos, new_assign_groups) diff --git a/tf_adapter/python/npu_bridge/embedding/tf_path.py b/tf_adapter/python/npu_bridge/embedding/tf_path.py deleted file mode 100644 index a5717e652..000000000 --- a/tf_adapter/python/npu_bridge/embedding/tf_path.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from tensorflow.python.eager import context -from tensorflow.python.framework import ops -from tensorflow.python.ops import resource_variable_ops -from tensorflow.python.ops import variables -from tensorflow.python.training import optimizer as embeddingOptimizer -from npu_bridge.embedding.embedding_resource import NpuEmbeddingResource - - -class _NpuEmbeddingResourceProcessor(embeddingOptimizer._OptimizableVariable): - """Processor for dense NpuEmbeddingResourceProcessor.""" - - def __init__(self, v): - self._v = v - - def target(self): - return self._v - - def update_op(self, optimizer, g): - return optimizer._resource_apply_sparse(g.values, self._v, g.indices) - - -def _get_processor(v): - """The processor of v.""" - if context.executing_eagerly(): - if isinstance(v, ops.Tensor): - return embeddingOptimizer._TensorProcessor(v) - else: - return embeddingOptimizer._DenseResourceVariableProcessor(v) - if isinstance(v, NpuEmbeddingResource): - return _NpuEmbeddingResourceProcessor(v) - if resource_variable_ops.is_resource_variable(v) and not v._in_graph_mode: # pylint: disable=protected-access - # True if and only if `v` was initialized eagerly. - return embeddingOptimizer._DenseResourceVariableProcessor(v) - if v.op.type == "VarHandleOp": - return embeddingOptimizer._DenseResourceVariableProcessor(v) - if isinstance(v, variables.Variable): - return embeddingOptimizer._RefVariableProcessor(v) - if isinstance(v, ops.Tensor): - return embeddingOptimizer._TensorProcessor(v) - - raise NotImplementedError("Trying to optimize unsupported type ", v) - - -def path_on_tf(): - embeddingOptimizer._get_processor = _get_processor - - diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py index 6004fecb7..7209e0953 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py @@ -105,7 +105,6 @@ class NPURunConfig(run_config_lib.RunConfig): stream_sync_timeout=-1, event_sync_timeout=-1, external_weight=False, - es_cluster_config=None, deterministic=0, frozen_variable=False, variable_placement="Device" @@ -166,7 +165,6 @@ class NPURunConfig(run_config_lib.RunConfig): experimental_config: The experimental configuration. 
topo_sorting_mode: Provides an interface for users to customize topology sorting. external_weight: Whether convert const to fileconstant and save weight to file. - es_cluster_config: esClusterConfig from user input in embedding service. frozen_variable: Whether folding constant variables variable_placement: Process variable on host or device """ @@ -256,7 +254,6 @@ class NPURunConfig(run_config_lib.RunConfig): self.stream_sync_timeout = stream_sync_timeout self.event_sync_timeout = event_sync_timeout self._external_weight = external_weight - self.es_cluster_config = es_cluster_config super(NPURunConfig, self).__init__( model_dir=model_dir, diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py index c3199d8fb..9db0f67af 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py @@ -768,8 +768,6 @@ class NPUEstimator(estimator_lib.Estimator): custom_op.parameter_map["topo_sorting_mode"].i = config.topo_sorting_mode if config.insert_op_file is not None: custom_op.parameter_map["insert_op_file"].s = config.insert_op_file - if config.es_cluster_config is not None: - custom_op.parameter_map["es_cluster_config"].s = tf.compat.as_bytes(config.es_cluster_config) custom_op.parameter_map["stream_sync_timeout"].i = config.stream_sync_timeout custom_op.parameter_map["event_sync_timeout"].i = config.event_sync_timeout custom_op.parameter_map["external_weight"].b = config._external_weight diff --git a/tf_adapter/python/npu_bridge/npu_cpu/npu_cpu_ops.py b/tf_adapter/python/npu_bridge/npu_cpu/npu_cpu_ops.py index cf43c5536..1e91cbb8e 100644 --- a/tf_adapter/python/npu_bridge/npu_cpu/npu_cpu_ops.py +++ b/tf_adapter/python/npu_bridge/npu_cpu/npu_cpu_ops.py @@ -283,102 +283,3 @@ def non_zero_with_value_shape(value, index, count): index=index, count=count) return result - - -class ESWorker: - """ Embedding service class. """ - def __init__(self, es_cluster_config): - with open(es_cluster_config, encoding='utf-8') as a: - es_cluster_config_json = json.load(a) - self._es_cluster_conf = json.dumps(es_cluster_config_json) - self._ps_num = int(es_cluster_config_json["psNum"]) - self._embedding_dim = -1 - self._max_num = -1 - self._ps_ids = [] - self._ps_ids_list = es_cluster_config_json["psCluster"] - for each_ps in self._ps_ids_list: - self._ps_ids.append(each_ps["id"]) - - config = tf.ConfigProto() - custom_op = config.graph_options.rewrite_options.custom_optimizers.add() - custom_op.name = "NpuOptimizer" - custom_op.parameter_map["es_cluster_config"].s = tf.compat.as_bytes(self._es_cluster_conf) - self.es_all_config = config - - ## 提供embedding init功能 - # @param bucket_size int 类型 - # @param file_path string 类型 - # @param file_name string 类型 - # @param table_id uint32 类型 - # @param embedding_dim uint32 类型 - # @param max_batch_size uint32 类型 - def embedding_init(self, bucket_size, file_path, file_name, table_id, embedding_dim, max_batch_size): - """ Operator for init embedding table. 
""" - self._embedding_dim = embedding_dim - self._max_num = max_batch_size - ps_num = tf.constant(self._ps_num, dtype=tf.uint32, name='ps_num') - ps_ids = tf.constant(self._ps_ids, dtype=tf.uint32, name='ps_ids') - ps_engine_values = "PS" - ps_engine = attr_value_pb2.AttrValue(s=compat.as_bytes(ps_engine_values)) - ps_num.op._set_attr("_process_node_engine_id", ps_engine) - ps_ids.op._set_attr("_process_node_engine_id", ps_engine) - init_partition_map = gen_npu_cpu_ops.init_partition_map(ps_num=ps_num, - ps_ids=ps_ids) - table_id = tf.constant(table_id, dtype=tf.uint32, name='table_id') - table_id.op._set_attr("_process_node_engine_id", ps_engine) - with tf.control_dependencies([init_partition_map]): - init_embedding_hash_map = gen_npu_cpu_ops.init_embedding_hashmap(table_id=table_id, bucket_size=bucket_size) - file_name = tf.constant(file_name, dtype=tf.string, name='file_name') - file_path = tf.constant(file_path, dtype=tf.string, name='file_path') - embedding_dim = tf.constant(embedding_dim, dtype=tf.uint32, name="embedding_dim") - ps_id = -1 - ps_id = tf.constant(ps_id, dtype=tf.uint32, name='ps_id') - ps_id.op._set_attr("_process_node_engine_id", ps_engine) - file_name.op._set_attr("_process_node_engine_id", ps_engine) - file_path.op._set_attr("_process_node_engine_id", ps_engine) - embedding_dim.op._set_attr("_process_node_engine_id", ps_engine) - with tf.control_dependencies([init_embedding_hash_map]): - embedding_table_import = gen_npu_cpu_ops.embedding_table_import(file_path=file_path, - file_name=file_name, - ps_id=ps_id, - table_id=table_id, - embedding_dim=embedding_dim) - init_partition_map._set_attr("_process_node_engine_id", ps_engine) - init_embedding_hash_map._set_attr("_process_node_engine_id", ps_engine) - embedding_table_import._set_attr("_process_node_engine_id", ps_engine) - execute_times_value = 1 - execute_times = attr_value_pb2.AttrValue(i=execute_times_value) - embedding_table_import._set_attr("_execute_times", execute_times) - embedding_dim_value = 1 - embedding_dim = attr_value_pb2.AttrValue(i=embedding_dim_value) - embedding_table_import._set_attr("_embedding_dim", embedding_dim) - max_num_value = self._max_num - max_num = attr_value_pb2.AttrValue(i=max_num_value) - embedding_table_import._set_attr("_max_num", max_num) - deploy_inject_config_value = self._es_cluster_conf - deploy_inject_config = attr_value_pb2.AttrValue(s=compat.as_bytes(deploy_inject_config_value)) - embedding_table_import._set_attr("_deploy_inject_config", deploy_inject_config) - result = embedding_table_import - return result - - # 提供embedding lookup功能 - # @param table_id uint32 类型 - # @param input_ids uint64 类型 - # @return values float32 类型 - def embedding_look_up(self, table_id, input_ids): - """ Operator for look up in embedding table. 
""" - table_id = tf.constant(table_id, dtype=tf.uint32, name="table_id") - result = gen_npu_cpu_ops.embedding_table_find(table_id=table_id, - keys=input_ids, - embedding_dim=self._embedding_dim) - max_num_value = self._max_num - max_num = attr_value_pb2.AttrValue(i=max_num_value) - result.op._set_attr("_max_num", max_num) - if self._embedding_dim == -1: - self._embedding_dim = 4 - embedding_dim_value = attr_value_pb2.AttrValue(i=self._embedding_dim) - result.op._set_attr("_embedding_dim", embedding_dim_value) - deploy_inject_config_value = self._es_cluster_conf - deploy_inject_config = attr_value_pb2.AttrValue(s=compat.as_bytes(deploy_inject_config_value)) - result.op._set_attr("_deploy_inject_config", deploy_inject_config) - return result \ No newline at end of file diff --git a/tf_adapter/tests/st/kernels/pbtxt/geop.pbtxt b/tf_adapter/tests/st/kernels/pbtxt/geop.pbtxt index 5fd416577..58575959f 100644 --- a/tf_adapter/tests/st/kernels/pbtxt/geop.pbtxt +++ b/tf_adapter/tests/st/kernels/pbtxt/geop.pbtxt @@ -113,24 +113,6 @@ node { s: "dynamic_execute" } } - attr { - key: "_execute_times" - value { - s: "2" - } - } - attr { - key: "_max_num" - value { - s: "1" - } - } - attr { - key: "_embedding_dim" - value { - s: "1" - } - } attr { key: "_dynamic_input" value { diff --git a/tf_adapter/tests/st/optimizers/testcase/get_attr_optimize_pass_test.cc b/tf_adapter/tests/st/optimizers/testcase/get_attr_optimize_pass_test.cc index a6fee094f..0649fc74a 100644 --- a/tf_adapter/tests/st/optimizers/testcase/get_attr_optimize_pass_test.cc +++ b/tf_adapter/tests/st/optimizers/testcase/get_attr_optimize_pass_test.cc @@ -210,9 +210,6 @@ TEST_F(GetAttrOptimizationPassTest, SetAttrTest) { AttrValue insert_op_file = AttrValue(); insert_op_file.set_s("aipp.cfg"); (*custom_config->mutable_parameter_map())["insert_op_file"] = insert_op_file; - AttrValue es_cluster_config = AttrValue(); - es_cluster_config.set_s("es"); - (*custom_config->mutable_parameter_map())["es_cluster_config"] = es_cluster_config; AttrValue external_weight = AttrValue(); external_weight.set_b(true); (*custom_config->mutable_parameter_map())["external_weight"] = external_weight; diff --git a/tf_adapter/tests/ut/kernels/pbtxt/geop.pbtxt b/tf_adapter/tests/ut/kernels/pbtxt/geop.pbtxt index 0409c9ba4..58575959f 100644 --- a/tf_adapter/tests/ut/kernels/pbtxt/geop.pbtxt +++ b/tf_adapter/tests/ut/kernels/pbtxt/geop.pbtxt @@ -113,30 +113,6 @@ node { s: "dynamic_execute" } } - attr { - key: "_deploy_inject_config" - value { - s: "deploy_inject_config" - } - } - attr { - key: "_execute_times" - value { - s: "2" - } - } - attr { - key: "_max_num" - value { - s: "1" - } - } - attr { - key: "_embedding_dim" - value { - s: "1" - } - } attr { key: "_dynamic_input" value { diff --git a/tf_adapter/tests/ut/optimizers/testcase/get_attr_optimize_pass_test.cc b/tf_adapter/tests/ut/optimizers/testcase/get_attr_optimize_pass_test.cc index 151838b58..0649fc74a 100644 --- a/tf_adapter/tests/ut/optimizers/testcase/get_attr_optimize_pass_test.cc +++ b/tf_adapter/tests/ut/optimizers/testcase/get_attr_optimize_pass_test.cc @@ -210,9 +210,6 @@ TEST_F(GetAttrOptimizationPassTest, SetAttrTest) { AttrValue insert_op_file = AttrValue(); insert_op_file.set_s("aipp.cfg"); (*custom_config->mutable_parameter_map())["insert_op_file"] = insert_op_file; - AttrValue es_cluster_config = AttrValue(); - es_cluster_config.set_s("esclusterconfig.json"); - (*custom_config->mutable_parameter_map())["es_cluster_config"] = es_cluster_config; AttrValue external_weight = AttrValue(); 
external_weight.set_b(true); (*custom_config->mutable_parameter_map())["external_weight"] = external_weight; diff --git a/tf_adapter/util/npu_attrs.cc b/tf_adapter/util/npu_attrs.cc index 45de04adf..a071d8091 100644 --- a/tf_adapter/util/npu_attrs.cc +++ b/tf_adapter/util/npu_attrs.cc @@ -576,7 +576,6 @@ std::map NpuAttrs::GetInitOptions(const OpKernelConstr std::string aoe_config_file; std::string stream_sync_timeout = "-1"; std::string event_sync_timeout = "-1"; - std::string es_cluster_config; if (ctx != nullptr && ctx->GetAttr("_NpuOptimizer", &npuOptimizer) == Status::OK()) { (void) ctx->GetAttr("_precision_mode", &precision_mode); @@ -615,7 +614,6 @@ std::map NpuAttrs::GetInitOptions(const OpKernelConstr (void) ctx->GetAttr("_aoe_config_file", &aoe_config_file); (void) ctx->GetAttr("_stream_sync_timeout", &stream_sync_timeout); (void) ctx->GetAttr("_event_sync_timeout", &event_sync_timeout); - (void) ctx->GetAttr("_es_cluster_config", &es_cluster_config); } if (precision_mode.empty()) { @@ -666,7 +664,6 @@ std::map NpuAttrs::GetInitOptions(const OpKernelConstr init_options_["ge.aoe_config_file"] = aoe_config_file; init_options_["stream_sync_timeout"] = stream_sync_timeout; init_options_["event_sync_timeout"] = event_sync_timeout; - init_options_["ge.esClusterConfig"] = es_cluster_config; return init_options_; } @@ -1067,7 +1064,6 @@ std::map NpuAttrs::GetAllAttrOptions(const AttrSlice & std::string stream_sync_timeout = "-1"; std::string event_sync_timeout = "-1"; std::string external_weight = "0"; - std::string es_cluster_config; std::string graph_parallel_option_path; std::string enable_graph_parallel; @@ -1147,7 +1143,6 @@ std::map NpuAttrs::GetAllAttrOptions(const AttrSlice & auto model_deploy_devicelist_value = attrs.Find("_model_deploy_devicelist"); auto topo_sorting_mode_value = attrs.Find("_topo_sorting_mode"); auto insert_op_file_value = attrs.Find("_insert_op_file"); - auto es_cluster_config_value = attrs.Find("_es_cluster_config"); auto resource_config_path_value = attrs.Find("_resource_config_path"); auto aoe_config_file_value = attrs.Find("_aoe_config_file"); auto stream_sync_timeout_value = attrs.Find("_stream_sync_timeout"); @@ -1428,9 +1423,6 @@ std::map NpuAttrs::GetAllAttrOptions(const AttrSlice & if (external_weight_value != nullptr) { external_weight = external_weight_value->s(); } - if (es_cluster_config_value != nullptr) { - es_cluster_config = es_cluster_config_value->s(); - } } all_options["variable_format_optimize"] = variable_format_optimize; @@ -1518,8 +1510,6 @@ std::map NpuAttrs::GetAllAttrOptions(const AttrSlice & all_options["ge.topoSortingMode"] = topo_sorting_mode; all_options["insert_op_file"] = insert_op_file; all_options["ge.insertOpFile"] = insert_op_file; - all_options["es_cluster_config"] = es_cluster_config; - all_options["ge.esClusterConfig"] = es_cluster_config; all_options["resource_config_path"] = resource_config_path; all_options["ge.aoe_config_file"] = aoe_config_file; all_options["aoe_config_file"] = aoe_config_file; @@ -2063,11 +2053,6 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options if (params.count("external_weight") > 0) { external_weight = params.at("external_weight").b(); } - if (params.count("es_cluster_config") > 0) { - std::string es_cluster_config = params.at("es_cluster_config").s(); - init_options_["es_cluster_config"] = es_cluster_config; - init_options_["ge.esClusterConfig"] = es_cluster_config; - } if (params.count("frozen_variable") > 0) { frozen_variable = 
params.at("frozen_variable").b(); } -- Gitee From eb4c01a3f7e893cbc855dde69c8ef037e5dfc096 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=99=86=E8=A1=A1?= Date: Thu, 16 Mar 2023 09:19:52 +0000 Subject: [PATCH 02/22] =?UTF-8?q?!2109=20=E4=BF=AE=E5=A4=8D=E6=B7=B7?= =?UTF-8?q?=E5=90=88=E8=AE=A1=E7=AE=97=E7=9A=84bug+=E5=88=A0=E9=99=A4estim?= =?UTF-8?q?ator=E6=A8=A1=E5=BC=8F=E4=B8=8B=E5=BA=9F=E5=BC=83=E5=8F=82?= =?UTF-8?q?=E6=95=B0=E9=BB=98=E8=AE=A4=E5=80=BC=20Merge=20pull=20request?= =?UTF-8?q?=20!2109=20from=20=E9=99=86=E8=A1=A1/cherry-pick-1678956907?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tf_adapter/interface_spec/api_npu_config.pyh | 2 +- tf_adapter/optimizers/om_partition_subgraphs_pass.cc | 4 +++- tf_adapter/python/npu_bridge/estimator/npu/npu_config.py | 2 +- tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py | 3 ++- tf_adapter/util/util.cc | 1 - 5 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tf_adapter/interface_spec/api_npu_config.pyh b/tf_adapter/interface_spec/api_npu_config.pyh index 718db2ac6..815cd35a9 100644 --- a/tf_adapter/interface_spec/api_npu_config.pyh +++ b/tf_adapter/interface_spec/api_npu_config.pyh @@ -12,7 +12,7 @@ class NPURunConfig(run_config_lib.RunConfig): enable_exception_dump=0, op_select_implmode=None, optypelist_for_implmode=None, dynamic_input_config=None, aoe_mode=None, work_path=None, buffer_optimize="l2_optimize", enable_small_channel=0, fusion_switch_file=None, enable_compress_weight=False, compress_weight_conf=None, - op_compiler_cache_mode=None, op_compiler_cache_dir=None, debug_dir=None, hcom_multi_mode=False, dynamic_input=False, + op_compiler_cache_mode=None, op_compiler_cache_dir=None, debug_dir=None, hcom_multi_mode=False, dynamic_input=None, dynamic_graph_execute_mode="dynamic_execute", dynamic_inputs_shape_range=None, train_distribute=None, eval_distribute=None, local_rank_id=None, local_device_list=None, session_device_id=None, distribute_config=None, modify_mixlist=None, op_precision_mode=None, device_type="default_device_type", diff --git a/tf_adapter/optimizers/om_partition_subgraphs_pass.cc b/tf_adapter/optimizers/om_partition_subgraphs_pass.cc index bb54e52d8..66fd524ad 100644 --- a/tf_adapter/optimizers/om_partition_subgraphs_pass.cc +++ b/tf_adapter/optimizers/om_partition_subgraphs_pass.cc @@ -2244,7 +2244,9 @@ Status OMPartitionSubgraphsPass::ProcessGraph(std::unique_ptr *graph, Fun return Status::OK(); } if (mix_compile_mode) { - TF_RETURN_IF_ERROR(CopyVarsBetweenGeOp(graph_in)); + if (pass_options["variable_location"] != "Host") { + TF_RETURN_IF_ERROR(CopyVarsBetweenGeOp(graph_in)); + } TF_RETURN_IF_ERROR(CopyConstBetweenGeOp(graph_in)); } diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py index 7209e0953..50536b343 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py @@ -78,7 +78,7 @@ class NPURunConfig(run_config_lib.RunConfig): op_compiler_cache_dir=None, debug_dir=None, hcom_multi_mode=False, - dynamic_input=False, + dynamic_input=None, dynamic_graph_execute_mode="dynamic_execute", dynamic_inputs_shape_range=None, train_distribute=None, diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py index 9db0f67af..eff80548b 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py +++ 
b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py @@ -740,7 +740,8 @@ class NPUEstimator(estimator_lib.Estimator): if config._debug_dir is not None: custom_op.parameter_map["debug_dir"].s = tf.compat.as_bytes(config._debug_dir) custom_op.parameter_map["hcom_multi_mode"].b = config._hcom_multi_mode - custom_op.parameter_map["dynamic_input"].b = config._dynamic_input + if config._dynamic_input is not None: + custom_op.parameter_map["dynamic_input"].b = config._dynamic_input custom_op.parameter_map["dynamic_graph_execute_mode"].s = tf.compat.as_bytes(config._dynamic_graph_execute_mode) if config._dynamic_inputs_shape_range is not None: custom_op.parameter_map["dynamic_inputs_shape_range"].s = tf.compat.as_bytes( diff --git a/tf_adapter/util/util.cc b/tf_adapter/util/util.cc index f7a1e8c81..8cb0b1fc3 100644 --- a/tf_adapter/util/util.cc +++ b/tf_adapter/util/util.cc @@ -115,7 +115,6 @@ bool IsVariableOrResourceVariable(const Node * const node) { bool IsVariableExecuteOnHost(const Node * const node, const std::string &variable_location) { if (variable_location == "Host" && IsVariableOrResourceVariable(node)) { - ADP_LOG(INFO) << "Node : " << node->name() << " op name : " << node->type_string() << "is execute on host"; return true; } return false; -- Gitee From e21c15f373b1bfb8b55d2cd53bdc6b5034070778 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E5=BC=BA?= Date: Thu, 16 Mar 2023 09:28:55 +0000 Subject: [PATCH 03/22] =?UTF-8?q?!2110=20OBP=20=E5=95=86=E5=88=86=E4=BF=AE?= =?UTF-8?q?=E5=A4=8D=20=E5=88=A0=E9=99=A4embedding=E5=BC=95=E5=85=A5?= =?UTF-8?q?=E7=9A=84=E9=97=AE=E9=A2=98=20Merge=20pull=20request=20!2110=20?= =?UTF-8?q?from=20=E5=88=98=E5=BC=BA/lq=5F316=5Flast?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tf_adapter/python/npu_bridge/npu_init.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tf_adapter/python/npu_bridge/npu_init.py b/tf_adapter/python/npu_bridge/npu_init.py index 7fce9a40e..02f31f04c 100644 --- a/tf_adapter/python/npu_bridge/npu_init.py +++ b/tf_adapter/python/npu_bridge/npu_init.py @@ -65,7 +65,6 @@ from hccl.manage.api import get_world_rank_from_group_rank from hccl.manage.api import get_group_rank_from_world_rank from hccl.split.api import set_split_strategy_by_idx from hccl.split.api import set_split_strategy_by_size -from npu_bridge import embedding as npu_embedding import tensorflow as tf -- Gitee From 52c48c2a38c5443b64419ffd083226fcdf50a05c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AD=8F=E4=B8=B9=E4=B8=B9?= Date: Thu, 16 Mar 2023 12:48:37 +0000 Subject: [PATCH 04/22] =?UTF-8?q?!2112=20=E6=B8=85=E7=90=86=E5=91=8A?= =?UTF-8?q?=E8=AD=A6=20Merge=20pull=20request=20!2112=20from=20=E9=AD=8F?= =?UTF-8?q?=E4=B8=B9=E4=B8=B9/flc29?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tf_adapter/kernels/aicpu/host_queue_dataset_op.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tf_adapter/kernels/aicpu/host_queue_dataset_op.cc b/tf_adapter/kernels/aicpu/host_queue_dataset_op.cc index b46c6e0e8..f661ffdc5 100644 --- a/tf_adapter/kernels/aicpu/host_queue_dataset_op.cc +++ b/tf_adapter/kernels/aicpu/host_queue_dataset_op.cc @@ -597,8 +597,8 @@ class HostQueueDatasetOp : public DatasetOpKernel { ADP_LOG(INFO) << "Slave SendDataThread exit."; } - void RecordMbufQueueBytes(const bool is_hold_type, const uint64_t args_total_bytes) { - if (!is_hold_type) { return; } + void RecordMbufQueueBytes(const bool is_hold, const uint64_t args_total_bytes) 
{ + if (!is_hold) { return; } mbuf_queue_rear_ = (mbuf_queue_rear_ + 1) % kStringTypeDepth; mbuf_queue_bytes_[mbuf_queue_rear_] = args_total_bytes; } @@ -629,7 +629,7 @@ class HostQueueDatasetOp : public DatasetOpKernel { Status status = Status::OK(); bool is_need_resend = false; - while(!finish_send_) { + while (!finish_send_) { if (IsHoldDataTrans()) { auto start = std::chrono::high_resolution_clock::now(); auto end = start + std::chrono::microseconds(kSleepDuration); -- Gitee From 3464689d58eab9a8f2b04862a8c9ef0585061a93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=99=86=E8=A1=A1?= Date: Tue, 21 Mar 2023 07:22:49 +0000 Subject: [PATCH 05/22] =?UTF-8?q?!2135=20frozen=20variable=E5=A2=9E?= =?UTF-8?q?=E5=8A=A0=E6=A0=A1=E9=AA=8C=20Merge=20pull=20request=20!2135=20?= =?UTF-8?q?from=20=E9=99=86=E8=A1=A1/cherry-pick-1679301364?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../testcase/frozen_variable_pass_test.cc | 13 +++-- .../tests/st/util/testcase/npu_attrs_test.cc | 8 +++- .../testcase/frozen_variable_pass_test.cc | 47 ++++++++++++------- .../tests/ut/util/testcase/npu_attrs_test.cc | 14 ++++-- tf_adapter/util/npu_attrs.cc | 19 ++++++-- 5 files changed, 71 insertions(+), 30 deletions(-) diff --git a/tf_adapter/tests/st/optimizers/testcase/frozen_variable_pass_test.cc b/tf_adapter/tests/st/optimizers/testcase/frozen_variable_pass_test.cc index 1176d7738..b16612497 100644 --- a/tf_adapter/tests/st/optimizers/testcase/frozen_variable_pass_test.cc +++ b/tf_adapter/tests/st/optimizers/testcase/frozen_variable_pass_test.cc @@ -84,7 +84,7 @@ public: return strings::StrCat(absl::StrJoin(edges, ";")); } - string DoRunFrozenVariablePassTest(bool need_frozen) { + string DoRunFrozenVariablePassTest(bool need_frozen, const string &placement) { string before = CanonicalGraphString(graph_.get()); LOG(INFO) << "Before replace variable pass: " << before; @@ -95,7 +95,10 @@ public: custom_config->set_name("NpuOptimizer"); AttrValue is_need_frozen = AttrValue(); is_need_frozen.set_b(need_frozen); + AttrValue variable_placement = AttrValue(); + variable_placement.set_s(placement); (*custom_config->mutable_parameter_map())["frozen_variable"] = is_need_frozen; + (*custom_config->mutable_parameter_map())["variable_placement"] = variable_placement; options.session_options = &session_options; options.graph = ug; FunctionLibraryDefinition flib_def((*ug)->flib_def()); @@ -127,21 +130,21 @@ TEST_F(FrozenVariablePassTest, frozen_variable_true) { string org_graph_def_path = "tf_adapter/tests/st/optimizers/pbtxt/om_test_build_geop.pbtxt"; InitGraph(org_graph_def_path); std::string target_graph = "Const->Add;Add->_Retval;PartitionedCall->Add:1"; - EXPECT_EQ(DoRunFrozenVariablePassTest(true), target_graph); + EXPECT_EQ(DoRunFrozenVariablePassTest(true, "Host"), target_graph); } TEST_F(FrozenVariablePassTest, frozen_variable_false) { string org_graph_def_path = "tf_adapter/tests/st/optimizers/pbtxt/om_test_build_geop.pbtxt"; InitGraph(org_graph_def_path); std::string target_graph = "VariableV2->Identity;Const->Add;Identity->Add:1;Add->_Retval"; - EXPECT_EQ(DoRunFrozenVariablePassTest(false), target_graph); + EXPECT_EQ(DoRunFrozenVariablePassTest(false, "Host"), target_graph); } TEST_F(FrozenVariablePassTest, frozen_varhandleop_true) { string org_graph_def_path = "tf_adapter/tests/st/optimizers/pbtxt/varhandleop_test.pbtxt"; InitGraph(org_graph_def_path); std::string target_graph = "Const->Add;Add->_Retval;PartitionedCall->Add:1"; - 
EXPECT_EQ(DoRunFrozenVariablePassTest(true), target_graph); + EXPECT_EQ(DoRunFrozenVariablePassTest(true, "Host"), target_graph); } TEST_F(FrozenVariablePassTest, frozen_no_variable_true) { @@ -151,7 +154,7 @@ TEST_F(FrozenVariablePassTest, frozen_no_variable_true) { "Less->LoopCond;Merge->Switch;LoopCond->Switch:1;Switch->Exit;Exit->_Retval;" "Switch:1->Identity;Identity:control->Const:control;Const->Add;Identity->Add:1;" "Add->NextIteration;NextIteration->Merge:1"; - EXPECT_EQ(DoRunFrozenVariablePassTest(true), target_graph); + EXPECT_EQ(DoRunFrozenVariablePassTest(true, "Host"), target_graph); } } // end namespace } \ No newline at end of file diff --git a/tf_adapter/tests/st/util/testcase/npu_attrs_test.cc b/tf_adapter/tests/st/util/testcase/npu_attrs_test.cc index 1b9b91e13..affdb61c4 100644 --- a/tf_adapter/tests/st/util/testcase/npu_attrs_test.cc +++ b/tf_adapter/tests/st/util/testcase/npu_attrs_test.cc @@ -11,6 +11,7 @@ namespace tensorflow { Status CheckOpImplMode(const string &op_select_implmode); +Status CheckVariablePlacement(const std::string &variable_placement); namespace { class NpuAttrTest : public testing::Test { protected: @@ -64,7 +65,7 @@ TEST_F(NpuAttrTest, SplitTest) { Split(s, res, ","); EXPECT_EQ(res[2], "c"); } -TEST_F(NpuAttrTest, SetNpuOptimizerAttr) { +TEST_F(NpuAttrTest, CheckOpImplMode) { Status s = CheckOpImplMode("xxx"); EXPECT_EQ(s.ok(), false); } @@ -100,6 +101,11 @@ TEST_F(NpuAttrTest, CheckPrecisionMode ) { EXPECT_EQ(s.ok(), false); } +TEST_F(NpuAttrTest, CheckVariablePlacement) { + Status s = CheckVariablePlacement("sss"); + EXPECT_EQ(s.ok(), false); +} + TEST_F(NpuAttrTest, GetDumpPath) { setenv("DUMP_GRAPH_PATH", "./", 1); string path = GetDumpPath(); diff --git a/tf_adapter/tests/ut/optimizers/testcase/frozen_variable_pass_test.cc b/tf_adapter/tests/ut/optimizers/testcase/frozen_variable_pass_test.cc index 1176d7738..8dc75d988 100644 --- a/tf_adapter/tests/ut/optimizers/testcase/frozen_variable_pass_test.cc +++ b/tf_adapter/tests/ut/optimizers/testcase/frozen_variable_pass_test.cc @@ -16,20 +16,28 @@ namespace tensorflow { class FrozenVariablePass : public GraphOptimizationPass { public: FrozenVariablePass() = default; + ~FrozenVariablePass() override = default; + Status Run(const GraphOptimizationPassOptions &options) override; + private: std::map GetGraphConfigs(const Graph &graph); + Status DoConstantFolding(const GraphOptimizationPassOptions &options, const uint64_t index); - bool IsAllOutputsIdentity(const Node * const node); - bool IsAllOutputsReadOp(const Node * const node); - bool IsNeedBuildPartitionedCall(const Node * const node); + + bool IsAllOutputsIdentity(const Node *const node); + + bool IsAllOutputsReadOp(const Node *const node); + + bool IsNeedBuildPartitionedCall(const Node *const node); }; namespace { class FrozenVariablePassTest : public testing::Test { -public: + public: FrozenVariablePassTest() : graph_(absl::make_unique(OpRegistry::Global())) {} + static void InitGraph(const string &graph_def_path, Graph *graph) { GraphDef graph_def; ReadTextProto(Env::Default(), graph_def_path, &graph_def); @@ -38,7 +46,7 @@ public: } void InitGraph(const string &graph_def_path) { - char trusted_path[MMPA_MAX_PATH] = { "\0" }; + char trusted_path[MMPA_MAX_PATH] = {"\0"}; if (mmRealPath(graph_def_path.c_str(), trusted_path, MMPA_MAX_PATH) != EN_OK) { LOG(ERROR) << "Get real path failed."; return; @@ -50,7 +58,7 @@ public: static bool IncludeNode(const Node *n) { return n->IsOp(); } - static string EdgeId(const Node* n, int index) { + static 
string EdgeId(const Node *n, int index) { if (index == 0) { return n->type_string(); } else if (index == Graph::kControlSlot) { @@ -60,8 +68,8 @@ public: } } - string CanonicalGraphString(Graph* g) { - for (Node* n : g->nodes()) { + string CanonicalGraphString(Graph *g) { + for (Node *n : g->nodes()) { if (IncludeNode(n)) { if (n->type_string() == "Add" && n->assigned_device_name().empty()) { n->set_assigned_device_name("/job:localhost/replica:0/task:0/device:CPU:0"); @@ -74,7 +82,7 @@ public: } std::vector edges; - for (const Edge* e : g->edges()) { + for (const Edge *e : g->edges()) { if (IncludeNode(e->src()) && IncludeNode(e->dst())) { edges.push_back(strings::StrCat(EdgeId(e->src(), e->src_output()), "->", EdgeId(e->dst(), e->dst_input()))); @@ -84,28 +92,32 @@ public: return strings::StrCat(absl::StrJoin(edges, ";")); } - string DoRunFrozenVariablePassTest(bool need_frozen) { + string DoRunFrozenVariablePassTest(bool need_frozen, const string &placement) { string before = CanonicalGraphString(graph_.get()); LOG(INFO) << "Before replace variable pass: " << before; std::unique_ptr *ug = &graph_; GraphOptimizationPassOptions options; SessionOptions session_options; - auto *custom_config = session_options.config.mutable_graph_options()->mutable_rewrite_options()->add_custom_optimizers(); + auto *custom_config = + session_options.config.mutable_graph_options()->mutable_rewrite_options()->add_custom_optimizers(); custom_config->set_name("NpuOptimizer"); AttrValue is_need_frozen = AttrValue(); is_need_frozen.set_b(need_frozen); + AttrValue variable_placement = AttrValue(); + variable_placement.set_s(placement); (*custom_config->mutable_parameter_map())["frozen_variable"] = is_need_frozen; + (*custom_config->mutable_parameter_map())["variable_placement"] = variable_placement; options.session_options = &session_options; options.graph = ug; FunctionLibraryDefinition flib_def((*ug)->flib_def()); options.flib_def = &flib_def; DeviceSet device_set; - DeviceFactory* cpu_factory = DeviceFactory::GetFactory("CPU"); + DeviceFactory *cpu_factory = DeviceFactory::GetFactory("CPU"); std::vector> devices; cpu_factory->CreateDevices( - session_options, "/job:localhost/replica:0/task:0", &devices); + session_options, "/job:localhost/replica:0/task:0", &devices); device_set.AddDevice(devices.begin()->get()); options.device_set = &device_set; FrozenVariablePass().Run(options); @@ -120,6 +132,7 @@ public: string original_; protected: virtual void SetUp() { *const_cast(&kDumpGraph) = true; } + virtual void TearDown() {} }; @@ -127,21 +140,21 @@ TEST_F(FrozenVariablePassTest, frozen_variable_true) { string org_graph_def_path = "tf_adapter/tests/st/optimizers/pbtxt/om_test_build_geop.pbtxt"; InitGraph(org_graph_def_path); std::string target_graph = "Const->Add;Add->_Retval;PartitionedCall->Add:1"; - EXPECT_EQ(DoRunFrozenVariablePassTest(true), target_graph); + EXPECT_EQ(DoRunFrozenVariablePassTest(true, "Host"), target_graph); } TEST_F(FrozenVariablePassTest, frozen_variable_false) { string org_graph_def_path = "tf_adapter/tests/st/optimizers/pbtxt/om_test_build_geop.pbtxt"; InitGraph(org_graph_def_path); std::string target_graph = "VariableV2->Identity;Const->Add;Identity->Add:1;Add->_Retval"; - EXPECT_EQ(DoRunFrozenVariablePassTest(false), target_graph); + EXPECT_EQ(DoRunFrozenVariablePassTest(false, "Host"), target_graph); } TEST_F(FrozenVariablePassTest, frozen_varhandleop_true) { string org_graph_def_path = "tf_adapter/tests/st/optimizers/pbtxt/varhandleop_test.pbtxt"; InitGraph(org_graph_def_path); 
std::string target_graph = "Const->Add;Add->_Retval;PartitionedCall->Add:1"; - EXPECT_EQ(DoRunFrozenVariablePassTest(true), target_graph); + EXPECT_EQ(DoRunFrozenVariablePassTest(true, "Host"), target_graph); } TEST_F(FrozenVariablePassTest, frozen_no_variable_true) { @@ -151,7 +164,7 @@ TEST_F(FrozenVariablePassTest, frozen_no_variable_true) { "Less->LoopCond;Merge->Switch;LoopCond->Switch:1;Switch->Exit;Exit->_Retval;" "Switch:1->Identity;Identity:control->Const:control;Const->Add;Identity->Add:1;" "Add->NextIteration;NextIteration->Merge:1"; - EXPECT_EQ(DoRunFrozenVariablePassTest(true), target_graph); + EXPECT_EQ(DoRunFrozenVariablePassTest(true, "Host"), target_graph); } } // end namespace } \ No newline at end of file diff --git a/tf_adapter/tests/ut/util/testcase/npu_attrs_test.cc b/tf_adapter/tests/ut/util/testcase/npu_attrs_test.cc index dc796242b..c3d2c71ab 100644 --- a/tf_adapter/tests/ut/util/testcase/npu_attrs_test.cc +++ b/tf_adapter/tests/ut/util/testcase/npu_attrs_test.cc @@ -11,6 +11,7 @@ namespace tensorflow { Status CheckOpImplMode(const string &op_select_implmode); +Status CheckVariablePlacement(const std::string &variable_placement); namespace { class NpuAttrTest : public testing::Test { protected: @@ -41,10 +42,10 @@ TEST_F(NpuAttrTest, GetEnvDeviceIdNotIntFailTest) { EXPECT_EQ(s.ok(), false); } TEST_F(NpuAttrTest, GetEnvAscendDeviceIdNotIntFailTest) { -uint32_t device_id = 0; -setenv("ASCEND_DEVICE_ID", "1.1", true); -Status s = GetEnvDeviceID(device_id); -EXPECT_EQ(s.ok(), false); + uint32_t device_id = 0; + setenv("ASCEND_DEVICE_ID", "1.1", true); + Status s = GetEnvDeviceID(device_id); + EXPECT_EQ(s.ok(), false); } TEST_F(NpuAttrTest, GetEnvDeviceIdEmptyTest) { uint32_t device_id = 0; @@ -101,6 +102,11 @@ TEST_F(NpuAttrTest, CheckPrecisionMode ) { EXPECT_EQ(s.ok(), false); } +TEST_F(NpuAttrTest, CheckVariablePlacement) { + Status s = CheckVariablePlacement("sss"); + EXPECT_EQ(s.ok(), false); +} + TEST_F(NpuAttrTest, GetDumpPath) { setenv("DUMP_GRAPH_PATH", "./", 1); string path = GetDumpPath(); diff --git a/tf_adapter/util/npu_attrs.cc b/tf_adapter/util/npu_attrs.cc index a071d8091..689a406b8 100644 --- a/tf_adapter/util/npu_attrs.cc +++ b/tf_adapter/util/npu_attrs.cc @@ -247,7 +247,6 @@ inline Status CheckPath(const std::string &input, std::string &output) { Status CheckOpImplMode(const std::string &op_select_implmode) { std::set op_impl_mode_list = {"high_precision", "high_performance", "high_precision_for_all", "high_performance_for_all"}; - if (op_impl_mode_list.find(op_select_implmode) != op_impl_mode_list.end()) { return Status::OK(); } else { @@ -256,6 +255,15 @@ Status CheckOpImplMode(const std::string &op_select_implmode) { } } +Status CheckVariablePlacement(const std::string &variable_placement) { + std::set variable_placement_list = {"Host", "Device"}; + if (variable_placement_list.find(variable_placement) != variable_placement_list.end()) { + return Status::OK(); + } else { + return errors::InvalidArgument("variable placement should be one of the list:[Host, Device]"); + } +} + inline Status CheckAoeMode(const std::string &aoe_mode) { std::set aoe_mode_list = {"1", "2", "4", "mdat"}; @@ -766,6 +774,11 @@ std::map NpuAttrs::GetPassOptions(const GraphOptimizat } if (params.count("variable_placement") > 0) { variable_location = params.at("variable_placement").s(); + Status s = CheckVariablePlacement(variable_location); + if (!s.ok()) { + ADP_LOG(ERROR) << s.error_message(); + LOG(FATAL) << s.error_message(); + } } } } @@ -821,8 +834,8 @@ std::map 
NpuAttrs::GetPassOptions(const OpKernelConstr std::string in_out_pair_flag = "1"; std::string in_out_pair; std::string npuOptimizer; - std::string frozen_variable = "0"; std::string variable_location = "Device"; + std::string frozen_variable = "0"; if (ctx != nullptr && ctx->GetAttr("_NpuOptimizer", &npuOptimizer) == Status::OK()) { do_npu_optimizer = "1"; @@ -2195,7 +2208,7 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options pass_options["local_device_list"] = local_device_list; pass_options["in_out_pair_flag"] = std::to_string(static_cast(in_out_pair_flag)); pass_options["in_out_pair"] = in_out_pair; - pass_options["frozen_variable"] = frozen_variable; + pass_options["frozen_variable"] = std::to_string(static_cast(frozen_variable)); pass_options["variable_location"] = variable_location; if (!node) { -- Gitee From 975606f6971f1a01a78acd24aa52a270ecb5414b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=A2=81=E6=98=8A?= Date: Tue, 21 Mar 2023 08:54:43 +0000 Subject: [PATCH 06/22] =?UTF-8?q?!2120=20misra=20=20zhaolupeng=20c29=20Mer?= =?UTF-8?q?ge=20pull=20request=20!2120=20from=20=E6=A2=81=E6=98=8A/cherry-?= =?UTF-8?q?pick-1679022854?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../core/optimizers/meta/npu_hcom_tailing_optimizer.cpp | 2 +- .../meta/npu_weight_update_grouping_optimizer.cpp | 4 ++-- .../npu_device/core/optimizers/runtime/node_placer.cpp | 6 +++--- .../core/optimizers/runtime/npu_build_npu_op_optimizer.cpp | 4 ++-- .../runtime/npu_trans_resource_input_to_node_optimizer.cpp | 3 ++- 5 files changed, 10 insertions(+), 9 deletions(-) diff --git a/tf_adapter_2.x/npu_device/core/optimizers/meta/npu_hcom_tailing_optimizer.cpp b/tf_adapter_2.x/npu_device/core/optimizers/meta/npu_hcom_tailing_optimizer.cpp index afee34778..50634d802 100644 --- a/tf_adapter_2.x/npu_device/core/optimizers/meta/npu_hcom_tailing_optimizer.cpp +++ b/tf_adapter_2.x/npu_device/core/optimizers/meta/npu_hcom_tailing_optimizer.cpp @@ -56,7 +56,7 @@ tensorflow::Status TailingOptimizeInner(tensorflow::FunctionLibraryDefinition *l } } } - if (node->type_string() == kNpuAllocFloatStatusOp && node->attrs().Find(kNpuLossScaleAttr) != nullptr) { + if ((node->type_string() == kNpuAllocFloatStatusOp) && (node->attrs().Find(kNpuLossScaleAttr) != nullptr)) { std::unordered_set edges_to_remove; tensorflow::Node *last_allreduce = nullptr; for (auto in_edge : node->in_edges()) { diff --git a/tf_adapter_2.x/npu_device/core/optimizers/meta/npu_weight_update_grouping_optimizer.cpp b/tf_adapter_2.x/npu_device/core/optimizers/meta/npu_weight_update_grouping_optimizer.cpp index d36a3ca41..a10281e09 100644 --- a/tf_adapter_2.x/npu_device/core/optimizers/meta/npu_weight_update_grouping_optimizer.cpp +++ b/tf_adapter_2.x/npu_device/core/optimizers/meta/npu_weight_update_grouping_optimizer.cpp @@ -57,7 +57,7 @@ tensorflow::Status WeightUpdateGroupingOptimizeInner(tensorflow::FunctionLibrary } } - if (node->type_string() == kHcomBroadcast && node->attrs().Find(kWeightUpdateGroupingAttr) != nullptr) { + if ((node->type_string() == kHcomBroadcast) && (node->attrs().Find(kWeightUpdateGroupingAttr) != nullptr)) { std::unordered_set edges_to_remove; tensorflow::Node *read_var_node = nullptr; for (auto in_edge : node->in_edges()) { @@ -114,7 +114,7 @@ tensorflow::Status WeightUpdateGroupingOptimizeInner(tensorflow::FunctionLibrary (void)graph->AddEdge(var_node, 0, new_read_var_node, 0); (void)graph->AddEdge(new_read_var_node, 0, node, 0); for (auto var_edge : 
var_node->out_edges()) { - if (var_edge->dst() != new_read_var_node && var_edge->dst() != assign_node) { + if ((var_edge->dst() != new_read_var_node) && (var_edge->dst() != assign_node)) { (void)graph->AddControlEdge(assign_node, var_edge->dst()); } } diff --git a/tf_adapter_2.x/npu_device/core/optimizers/runtime/node_placer.cpp b/tf_adapter_2.x/npu_device/core/optimizers/runtime/node_placer.cpp index 92d8c163b..5549135b7 100644 --- a/tf_adapter_2.x/npu_device/core/optimizers/runtime/node_placer.cpp +++ b/tf_adapter_2.x/npu_device/core/optimizers/runtime/node_placer.cpp @@ -316,7 +316,7 @@ tensorflow::Status NodePlacer::PlaceCpuNodeSubgraphs(size_t depth) const { bool NodePlacer::IsClusterMustPlaceOnNpu(const Cluster &cluster) { for (auto node : cluster.nodes) { auto iter = node_placement_.find(node); - if (iter != node_placement_.end() && iter->second == Placement::NPU) { + if ((iter != node_placement_.end()) && (iter->second == Placement::NPU)) { DLOG() << cluster.name << " must place on npu as has determined npu node " << node->name(); return true; } @@ -405,7 +405,7 @@ void NodePlacer::Concrete(tensorflow::Node *src, tensorflow::Node *dst) { DLOG() << "Concrete node " << src->name() << " with " << dst->name() << " to cluster " << target->name; auto iter = concrete_clusters_.find(src); - if (iter != concrete_clusters_.end() && iter->second == target) { + if ((iter != concrete_clusters_.end()) && (iter->second == target)) { DLOG() << "Node " << src->name() << " has already concrete with " << dst->name() << " in cluster " << target->name; return; } @@ -468,7 +468,7 @@ tensorflow::Status NodePlacer::BuildConcreteCluster() { std::queue> q; for (auto &node : cluster->nodes) { auto iter = concrete_clusters_.find(node); - if (iter != concrete_clusters_.end() && iter->second != cluster) { + if ((iter != concrete_clusters_.end()) && (iter->second != cluster)) { q.push(iter->second); } } diff --git a/tf_adapter_2.x/npu_device/core/optimizers/runtime/npu_build_npu_op_optimizer.cpp b/tf_adapter_2.x/npu_device/core/optimizers/runtime/npu_build_npu_op_optimizer.cpp index 5abc96bfe..dd26e7985 100644 --- a/tf_adapter_2.x/npu_device/core/optimizers/runtime/npu_build_npu_op_optimizer.cpp +++ b/tf_adapter_2.x/npu_device/core/optimizers/runtime/npu_build_npu_op_optimizer.cpp @@ -113,7 +113,7 @@ tensorflow::Status TryToBuildShapeForDynDims(const std::mapMutableGraph()), key) || key != nullptr) { + if (IsGraphNeedLoop(*(graph->MutableGraph()), key) || (key != nullptr)) { graph->SetLoopType(NpuConcreteGraph::LoopType::BUILTIN_LOOP); } graph->SetExecutionType(NpuConcreteGraph::ExecutionType::MIX); diff --git a/tf_adapter_2.x/npu_device/core/optimizers/runtime/npu_trans_resource_input_to_node_optimizer.cpp b/tf_adapter_2.x/npu_device/core/optimizers/runtime/npu_trans_resource_input_to_node_optimizer.cpp index 2bd9a99c4..6cb0a2bc5 100644 --- a/tf_adapter_2.x/npu_device/core/optimizers/runtime/npu_trans_resource_input_to_node_optimizer.cpp +++ b/tf_adapter_2.x/npu_device/core/optimizers/runtime/npu_trans_resource_input_to_node_optimizer.cpp @@ -180,7 +180,8 @@ tensorflow::Status TransHasSubgraphNode(TFE_Context *context, tensorflow::Graph functions.emplace_back(const_cast(node->attrs().Find("then_branch"))->mutable_func()); functions.emplace_back(const_cast(node->attrs().Find("else_branch"))->mutable_func()); } else if (node->IsCaseNode()) { - for (auto &f : *const_cast(node->attrs().Find("branches"))->mutable_list()->mutable_func()) { + for (auto &f : + 
*const_cast(node->attrs().Find("branches"))->mutable_list()->mutable_func()) { functions.emplace_back(&f); } } else { -- Gitee From 20922eec6d1d0f13195abbac7231bf891d614397 Mon Sep 17 00:00:00 2001 From: xujiuxu Date: Thu, 23 Mar 2023 06:19:10 +0000 Subject: [PATCH 07/22] =?UTF-8?q?!2148=20=E5=91=8A=E8=AD=A6=E6=B8=85?= =?UTF-8?q?=E7=90=86=20Merge=20pull=20request=20!2148=20from=20xujiuxu/che?= =?UTF-8?q?rry-pick-1679477162?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tf_adapter/optimizers/frozen_variable_pass.cc | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tf_adapter/optimizers/frozen_variable_pass.cc b/tf_adapter/optimizers/frozen_variable_pass.cc index c90832d11..02582727e 100644 --- a/tf_adapter/optimizers/frozen_variable_pass.cc +++ b/tf_adapter/optimizers/frozen_variable_pass.cc @@ -47,11 +47,11 @@ class FrozenVariablePass : public GraphOptimizationPass { ~FrozenVariablePass() override = default; Status Run(const GraphOptimizationPassOptions &options) override; private: - bool IsAllOutputsIdentity(const Node * const node); - bool IsAllOutputsReadOp(const Node * const node); + bool IsAllOutputsIdentity(const Node * const node) const; + bool IsAllOutputsReadOp(const Node * const node) const; bool IsNeedBuildPartitionedCall(const Node * const node); - std::map GetGraphConfigs(const Graph &graph); - void RemoveDeadNodes(Graph* g); + std::map GetGraphConfigs(const Graph &graph) const; + void RemoveDeadNodes(Graph* g) const; Status DoConstantFolding(const GraphOptimizationPassOptions &options, const uint64_t index); }; @@ -61,7 +61,7 @@ struct StableNodeCompartor { DataType EdgeDataType(const tensorflow::Edge &edge) { return edge.src()->output_type(edge.src_output()); } -bool FrozenVariablePass::IsAllOutputsIdentity(const Node * const node) { +bool FrozenVariablePass::IsAllOutputsIdentity(const Node * const node) const { for (auto out : node->out_nodes()) { if (!out->IsIdentity()) { return false; @@ -70,7 +70,7 @@ bool FrozenVariablePass::IsAllOutputsIdentity(const Node * const node) { return true; } -bool FrozenVariablePass::IsAllOutputsReadOp(const Node * const node) { +bool FrozenVariablePass::IsAllOutputsReadOp(const Node * const node) const { for (auto out : node->out_nodes()) { if (out->type_string() != "ReadVariableOp") { return false; @@ -84,7 +84,7 @@ bool FrozenVariablePass::IsNeedBuildPartitionedCall(const Node * const node) { (node->type_string() == "VarHandleOp" && IsAllOutputsReadOp(node)); } -std::map FrozenVariablePass::GetGraphConfigs(const Graph &graph) { +std::map FrozenVariablePass::GetGraphConfigs(const Graph &graph) const { for (Node *n : graph.nodes()) { if ((n != nullptr) && (n->attrs().Find("_NpuOptimizer") != nullptr)) { return NpuAttrs::GetAllAttrOptions(n->attrs()); @@ -93,7 +93,7 @@ std::map FrozenVariablePass::GetGraphConfigs(const Gra return {}; } -void FrozenVariablePass::RemoveDeadNodes(Graph* g) { +void FrozenVariablePass::RemoveDeadNodes(Graph* g) const { std::unordered_set nodes; for (auto n : g->nodes()) { ADP_LOG(DEBUG) << "Remove dead node, node type: " << n->type_string(); @@ -106,7 +106,7 @@ void FrozenVariablePass::RemoveDeadNodes(Graph* g) { Status FrozenVariablePass::DoConstantFolding(const GraphOptimizationPassOptions &options, const uint64_t index) { - ADP_LOG(INFO) << "Before do const folding" << options.session_options->config.DebugString(); + ADP_LOG(INFO) << "Before do const folding " << options.session_options->config.DebugString(); if 
(options.device_set == nullptr) { return errors::Internal("Failed to get device set to run constant folding"); } @@ -139,7 +139,7 @@ Status FrozenVariablePass::DoConstantFolding(const GraphOptimizationPassOptions GraphOptimizer optimizer(opts); optimizer.Optimize(flr, flr->env(), flr->device(), options.graph, graph_optimizer_options); (void)RemoveDeadNodes((options.graph)->get()); - ADP_LOG(INFO) << "After do const folding optimize"; + ADP_LOG(INFO) << "After do const folding optimize."; if (kDumpGraph) { const std::string pbtxt_path = GetDumpPath() + "TF_AfterFrozenVariable_" + std::to_string(index) + ".pbtxt"; tensorflow::GraphDef def; -- Gitee From a83df9cdfeee8370ef329a9839dea1ea40b8c9da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=93=E6=B6=9B?= Date: Thu, 23 Mar 2023 09:26:40 +0000 Subject: [PATCH 08/22] =?UTF-8?q?!2154=20tf2.x=E5=8E=BBranktable=20Merge?= =?UTF-8?q?=20pull=20request=20!2154=20from=20=E9=82=93=E6=B6=9B/cherry-pi?= =?UTF-8?q?ck-1679559289?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../inc/external/ge/ge_api_types.h | 7 +++ .../npu_device/core/npu_wrapper.cpp | 17 ++++-- .../npu_device/distribute/npu_callbacks.py | 4 +- .../python/npu_device/npu_device.py | 52 +++++++++++++++---- tf_adapter_2.x/tests/st/adapter2_st.py | 8 +-- .../tests/stub/include/stub/defines.h | 7 +++ 6 files changed, 77 insertions(+), 18 deletions(-) diff --git a/inc/graphengine/inc/external/ge/ge_api_types.h b/inc/graphengine/inc/external/ge/ge_api_types.h index 8efcef921..e2d720bef 100644 --- a/inc/graphengine/inc/external/ge/ge_api_types.h +++ b/inc/graphengine/inc/external/ge/ge_api_types.h @@ -43,6 +43,13 @@ const char *const OPTION_EXEC_DEPLOY_MODE = "ge.exec.deployMode"; const char *const OPTION_EXEC_RANK_TABLE_FILE = "ge.exec.rankTableFile"; const char *const GE_AICPU_FLAG = "ge.aicpuFlag"; const char *const OPTION_EXEC_EXTERN_PLUGIN_PATH = "ge.soLoadPath"; + +const std::string OPTION_EXEC_CM_CHIEF_IP = "ge.cmChiefIp"; +const std::string OPTION_EXEC_CM_CHIEF_PORT = "ge.cmChiefPort"; +const std::string OPTION_EXEC_CM_CHIEF_DEVICE = "ge.cmChiefWorkerDevice"; +const std::string OPTION_EXEC_CM_WORKER_IP = "ge.cmWorkerIp"; +const std::string OPTION_EXEC_CM_WORKER_SIZE = "ge.cmWorkerSize"; + // Dump flag and para const char *const OPTION_EXEC_ENABLE_DUMP = "ge.exec.enableDump"; const char *const OPTION_EXEC_DUMP_PATH = "ge.exec.dumpPath"; diff --git a/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp index 7ffadcdd7..675ee05e7 100644 --- a/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp +++ b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp @@ -117,7 +117,12 @@ const std::map kConfigurableOptions = { {"graph_parallel_option_path", "ge.graphParallelOptionPath"}, {"enable_graph_parallel", "ge.enableGraphParallel"}, {"atomic_clean_policy", "ge.exec.atomicCleanPolicy"}, - {"static_memory_policy", "ge.exec.staticMemoryPolicy"}}; + {"static_memory_policy", "ge.exec.staticMemoryPolicy"}, + {"_distribute.cm_chief_ip", ge::OPTION_EXEC_CM_CHIEF_IP}, + {"_distribute.cm_chief_port", ge::OPTION_EXEC_CM_CHIEF_PORT}, + {"_distribute.cm_chief_worker_device", ge::OPTION_EXEC_CM_CHIEF_DEVICE}, + {"_distribute.cm_worker_ip", ge::OPTION_EXEC_CM_WORKER_IP}, + {"_distribute.cm_worker_size", ge::OPTION_EXEC_CM_WORKER_SIZE}}; } // namespace #undef PYBIND11_CHECK_PYTHON_VERSION @@ -128,6 +133,13 @@ std::unordered_set npu_specify_ops_cache; constexpr uint32_t kDeviceSatModeLimit = 2U; } namespace npu { +bool 
CheckIsDistribute(std::map &global_options) { + return ((global_options.find(ge::OPTION_EXEC_RANK_TABLE_FILE) != global_options.end() && + global_options.find(ge::OPTION_EXEC_RANK_ID) != global_options.end()) || + (global_options.find(ge::OPTION_EXEC_CM_CHIEF_IP) != global_options.end() && + global_options.find(ge::OPTION_EXEC_CM_CHIEF_PORT) != global_options.end() && + global_options.find(ge::OPTION_EXEC_CM_CHIEF_DEVICE) != global_options.end())); +} void ParseGlobalOptions(int device_index, const std::map &user_options, std::map &global_options) { for (const auto &option : user_options) { @@ -138,8 +150,7 @@ void ParseGlobalOptions(int device_index, const std::map 1: + env_cm_chief_ip = os.getenv("CM_CHIEF_IP") env_rank_table = os.getenv("RANK_TABLE_FILE") - env_worker_id = os.getenv('RANK_ID') - if not env_rank_table: - raise RuntimeError('You must specify a rank table file by set env RANK_TABLE_FILE in distribution mode') - - if not env_worker_id: - raise RuntimeError('You must specify rank id by set env RANK_ID in distribution mode') - - global_kw_options['_distribute.rank_table'] = env_rank_table - global_kw_options['_distribute.rank_id'] = env_worker_id + if env_cm_chief_ip is not None and env_rank_table is not None: + raise RuntimeError('CM_CHIEF_IP and RANK_TABLE_FILE cannot be configured at the same time.') + elif env_cm_chief_ip is not None: + set_cm_chief_worksize_env(global_kw_options, env_cm_chief_ip, workers_num) + elif env_rank_table is not None: + set_rank_table_file_env(global_kw_options, env_rank_table) + else: + raise RuntimeError('CM_CHIEF_IP and RANK_TABLE_FILE are all not be configured.') device_options = {} error_message = _npu_device_backends.Open(context.context()._handle, NPU, device_id, global_kw_options, diff --git a/tf_adapter_2.x/tests/st/adapter2_st.py b/tf_adapter_2.x/tests/st/adapter2_st.py index 1b28a15cb..03663ac70 100644 --- a/tf_adapter_2.x/tests/st/adapter2_st.py +++ b/tf_adapter_2.x/tests/st/adapter2_st.py @@ -35,9 +35,11 @@ npu_device.global_options().experimental.multi_branches_config.dynamic_node_type npu_device.global_options().experimental.multi_branches_config.dynamic_dims = "1;2" npu_device.global_options().aoe_config.work_path = "./" npu_device.global_options().graph_run_mode = 0 -os.environ['RANK_TABLE_FILE'] = "rankTable" -os.environ['RANK_SIZE'] = "2" -os.environ['RANK_ID'] = "1" +os.environ['CM_CHIEF_IP'] = "1" +os.environ['CM_CHIEF_PORT'] = "3" +os.environ['CM_CHIEF_DEVICE'] = "4" +os.environ['CM_WORKER_SIZE'] = "2" +os.environ['CM_WORKER_IP'] = "123" npu = npu_device.open().as_default() npu.workers_num = 2 # mock run in 2P env diff --git a/tf_adapter_2.x/tests/stub/include/stub/defines.h b/tf_adapter_2.x/tests/stub/include/stub/defines.h index 5be2c1243..653de3c59 100644 --- a/tf_adapter_2.x/tests/stub/include/stub/defines.h +++ b/tf_adapter_2.x/tests/stub/include/stub/defines.h @@ -25,6 +25,13 @@ const char *const OPTION_EXEC_DEPLOY_MODE = "ge.exec.deployMode"; const char *const OPTION_EXEC_RANK_TABLE_FILE = "ge.exec.rankTableFile"; const char *const GE_AICPU_FLAG = "ge.aicpuFlag"; const char *const OPTION_EXEC_EXTERN_PLUGIN_PATH = "ge.soLoadPath"; + +const std::string OPTION_EXEC_CM_CHIEF_IP = "ge.cmChiefIp"; +const std::string OPTION_EXEC_CM_CHIEF_PORT = "ge.cmChiefPort"; +const std::string OPTION_EXEC_CM_CHIEF_DEVICE = "ge.cmChiefWorkerDevice"; +const std::string OPTION_EXEC_CM_WORKER_IP = "ge.cmWorkerIp"; +const std::string OPTION_EXEC_CM_WORKER_SIZE = "ge.cmWorkerSize"; + // Dump flag and para const char *const 
OPTION_EXEC_ENABLE_DUMP = "ge.exec.enableDump"; const char *const OPTION_EXEC_DUMP_PATH = "ge.exec.dumpPath"; -- Gitee From 1219621bb8ec26c53f9d505f65855d8b6a89acc2 Mon Sep 17 00:00:00 2001 From: caiguangxing Date: Fri, 24 Mar 2023 09:50:52 +0000 Subject: [PATCH 09/22] =?UTF-8?q?!2147=20=E5=91=8A=E8=AD=A6=E6=B8=85?= =?UTF-8?q?=E7=90=86=E5=90=88=E5=85=A5C29=20Merge=20pull=20request=20!2147?= =?UTF-8?q?=20from=20caiguangxing/cherry-pick-1679473773?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tf_adapter/kernels/geop_npu.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tf_adapter/kernels/geop_npu.cc b/tf_adapter/kernels/geop_npu.cc index fc8538f00..fd5d3a630 100644 --- a/tf_adapter/kernels/geop_npu.cc +++ b/tf_adapter/kernels/geop_npu.cc @@ -844,7 +844,7 @@ void GeOp::ComputeAsync(OpKernelContext *ctx, DoneCallback done) { // convert to ge::graph if (graph_options_.count("input_format") != 0) { - ADP_LOG(INFO) << "graph_options_[\"input_format\"]: " << graph_options_["input_format"]; + ADP_LOG(INFO) << "graph_options_[\"input_format\"] = " << graph_options_["input_format"]; } ge::Graph ge_graph = ge::GraphUtilsEx::CreateGraphFromComputeGraph(compute_graph); if (iteration_per_loop_ > 1) { @@ -955,8 +955,8 @@ void GeOp::ComputeAsync(OpKernelContext *ctx, DoneCallback done) { }; // call ge session runGraphAsync api - ADP_LOG(INFO) << "[GEOP] Call ge session RunGraphAsync, kernel_name:" << geop_name << " ,tf session: " << tf_session_ - << " ,graph id: " << cache_graph_id; + ADP_LOG(INFO) << "[GEOP] Call ge session RunGraphAsync, kernel_name: " << geop_name << ", tf session: " << tf_session_ + << ", graph id: " << cache_graph_id; ge::Status run_graph_status = ge_session_->RunGraphAsync(cache_graph_id, inputs, callback); if (run_graph_status != ge::SUCCESS) { std::this_thread::sleep_for(std::chrono::milliseconds(kFatalSleepTime)); @@ -974,7 +974,7 @@ void GeOp::ComputeAsync(OpKernelContext *ctx, DoneCallback done) { endTime = InferShapeUtil::GetCurrentTimestap(); ADP_LOG(INFO) << "[GEOP] End GeOp::ComputeAsync, kernel_name: " << geop_name << ", ret_status: " << ToString(run_graph_status) << ", tf session : " << tf_session_ - << " ,graph id: " << cache_graph_id << "[" << ((endTime - startTime) / kMicrosToMillis) << " ms]"; + << ", graph id: " << cache_graph_id << "[" << ((endTime - startTime) / kMicrosToMillis) << " ms]"; return; } @@ -1422,7 +1422,7 @@ void GeOp::BuildShapeNodeAndCacheArgNodes(Graph &graph) { if (node->name().find("IteratorGetNext_") != std::string::npos) { if (dynamic_node_type == "0") { dynamic_shape_nodes_.emplace_back(node); - ADP_LOG(INFO) << "push in dynamic shape nodes, node : " << node->name() << ", type : " << node->type_string(); + ADP_LOG(INFO) << "push in dynamic shape nodes, node : " << node->name() << ", type : " << node->type_string(); } } else { if (dynamic_node_type == "1") { -- Gitee From f46b950f9ea48dcfc9be785fab4b06f878209133 Mon Sep 17 00:00:00 2001 From: guopeian Date: Mon, 27 Mar 2023 02:56:15 +0000 Subject: [PATCH 10/22] =?UTF-8?q?!2159=20tensorflow2.x=E6=89=8B=E5=8A=A8?= =?UTF-8?q?=E9=94=80=E6=AF=81context=20Merge=20pull=20request=20!2159=20fr?= =?UTF-8?q?om=20guopeian/context=5Fc29?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tf_adapter_2.x/npu_device/core/npu_global.cpp | 29 ++++++++++++++----- tf_adapter_2.x/npu_device/core/npu_global.h | 6 ++-- .../npu_device/core/npu_wrapper.cpp | 9 ++---- 3 files changed, 27 
insertions(+), 17 deletions(-) diff --git a/tf_adapter_2.x/npu_device/core/npu_global.cpp b/tf_adapter_2.x/npu_device/core/npu_global.cpp index a798941d8..cd365edd7 100644 --- a/tf_adapter_2.x/npu_device/core/npu_global.cpp +++ b/tf_adapter_2.x/npu_device/core/npu_global.cpp @@ -39,25 +39,40 @@ std::unordered_set g_npu_specify_ops; tensorflow::mutex dev_memory_shared_lock; bool dev_memory_released = false; -void RtsCtx::SetGlobalCtx(aclrtContext global_ctx) { - static std::atomic_bool already_set{false}; - if (!already_set.exchange(true)) { - global_ctx_ = global_ctx; - global_ctx_set_ = true; +tensorflow::Status RtsCtx::CreateGlobalCtx(int32_t device_index) { + { + tensorflow::tf_shared_lock read_lock(global_ctx_mutex_); + if (global_ctx_ != nullptr) { + DLOG() << "Global context has been created."; + return tensorflow::Status::OK(); + } } + tensorflow::mutex_lock write_lock(global_ctx_mutex_); + NPU_REQUIRES_ACL_OK("Acl create rts ctx failed", aclrtCreateContext(&global_ctx_, device_index)); + return tensorflow::Status::OK(); } // 存在rtMalloc和rtFree在不同线程操作的情况,也存在同一线程会切换context的场景 // 这里保证全局唯一的ctx,且对device资源操作时都设置这个全局ctx tensorflow::Status RtsCtx::EnsureInitialized() { - if (global_ctx_set_) { + tensorflow::tf_shared_lock read_lock(global_ctx_mutex_); + if (global_ctx_ != nullptr) { NPU_REQUIRES_ACL_OK("Acl set current thread ctx failed", aclrtSetCurrentContext(global_ctx_)); } return tensorflow::Status::OK(); } +tensorflow::Status RtsCtx::DestroyGlobalCtx() { + tensorflow::mutex_lock write_lock(global_ctx_mutex_); + if (global_ctx_ != nullptr) { + NPU_REQUIRES_ACL_OK("Acl Destroy global ctx failed", aclrtDestroyContext(global_ctx_)); + } + global_ctx_ = nullptr; + return tensorflow::Status::OK(); +} + +tensorflow::mutex RtsCtx::global_ctx_mutex_; aclrtContext RtsCtx::global_ctx_{nullptr}; -std::atomic_bool RtsCtx::global_ctx_set_{false}; std::map NpuCtx::npu_ctx_; diff --git a/tf_adapter_2.x/npu_device/core/npu_global.h b/tf_adapter_2.x/npu_device/core/npu_global.h index 194badbc5..1b04e7666 100644 --- a/tf_adapter_2.x/npu_device/core/npu_global.h +++ b/tf_adapter_2.x/npu_device/core/npu_global.h @@ -41,12 +41,12 @@ extern bool dev_memory_released TF_GUARDED_BY(dev_memory_shared_lock); // Rts ctx管理器 class RtsCtx { public: - static void SetGlobalCtx(aclrtContext global_ctx); + static tensorflow::Status CreateGlobalCtx(int32_t device_index); static tensorflow::Status EnsureInitialized(); - + static tensorflow::Status DestroyGlobalCtx(); private: static aclrtContext global_ctx_; - static std::atomic_bool global_ctx_set_; + static tensorflow::mutex global_ctx_mutex_; }; class NpuCtx { diff --git a/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp index 675ee05e7..4a8dd0659 100644 --- a/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp +++ b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp @@ -205,16 +205,10 @@ PYBIND11_MODULE(_npu_device_backends, m) { return status.error_message(); } } - - aclrtContext global_rt_ctx = nullptr; - auto status = [&global_rt_ctx, device_index]() -> tensorflow::Status { - NPU_REQUIRES_ACL_OK("Acl create rts ctx failed", aclrtCreateContext(&global_rt_ctx, device_index)); - return tensorflow::Status::OK(); - }(); + auto status = npu::global::RtsCtx::CreateGlobalCtx(device_index); if (!status.ok()) { return status.error_message(); } - npu::global::RtsCtx::SetGlobalCtx(global_rt_ctx); status = npu::global::RtsCtx::EnsureInitialized(); if (!status.ok()) { return status.error_message(); @@ -262,6 +256,7 @@ 
PYBIND11_MODULE(_npu_device_backends, m) { } (void)npu::NpuAoe::GetInstance().AoeTuningFinalize(); + (void)npu::global::RtsCtx::DestroyGlobalCtx(); } pybind11::gil_scoped_acquire acquire; }); -- Gitee From 013ab4da3bc06c51e43b043cd16679c151eb6f7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=99=8F=E5=90=8D=E9=A6=99?= Date: Tue, 28 Mar 2023 06:51:46 +0000 Subject: [PATCH 11/22] =?UTF-8?q?!2169=20=E4=BC=98=E5=8C=96dtstring?= =?UTF-8?q?=E7=B1=BB=E5=9E=8B=E6=80=A7=E8=83=BD=20Merge=20pull=20request?= =?UTF-8?q?=20!2169=20from=20=E6=99=8F=E5=90=8D=E9=A6=99/r1.12.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tf_adapter/kernels/aicpu/host_queue_dataset_op.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tf_adapter/kernels/aicpu/host_queue_dataset_op.cc b/tf_adapter/kernels/aicpu/host_queue_dataset_op.cc index f661ffdc5..070404811 100644 --- a/tf_adapter/kernels/aicpu/host_queue_dataset_op.cc +++ b/tf_adapter/kernels/aicpu/host_queue_dataset_op.cc @@ -600,11 +600,12 @@ class HostQueueDatasetOp : public DatasetOpKernel { void RecordMbufQueueBytes(const bool is_hold, const uint64_t args_total_bytes) { if (!is_hold) { return; } mbuf_queue_rear_ = (mbuf_queue_rear_ + 1) % kStringTypeDepth; + mbuf_queue_total_bytes_ = mbuf_queue_total_bytes_ - mbuf_queue_bytes_[mbuf_queue_rear_] + args_total_bytes; mbuf_queue_bytes_[mbuf_queue_rear_] = args_total_bytes; } bool IsHoldDataTrans() { - if (!is_hold_type) { return false; } + if (mbuf_queue_total_bytes_ < static_cast(kMaxBytes)) { return false; } size_t mbuf_size; aclError status = acltdtQueryChannelSize(acl_handle_, &mbuf_size); if (status != ACL_SUCCESS) { @@ -1039,7 +1040,8 @@ class HostQueueDatasetOp : public DatasetOpKernel { double elapsed_time = 0; uint64_t total_bytes = 0; } data_thread_perf_stat_[static_cast(ThreadType::BUTT)]; - uint64_t mbuf_queue_bytes_[kStringTypeDepth]; + uint64_t mbuf_queue_bytes_[kStringTypeDepth] = { 0 }; + uint64_t mbuf_queue_total_bytes_ = 0; size_t mbuf_queue_rear_ = 0; }; const std::vector inputs_; -- Gitee From 462218842aa9a52683599f669521eec8b54e584f Mon Sep 17 00:00:00 2001 From: yaolun Date: Wed, 29 Mar 2023 00:55:34 +0000 Subject: [PATCH 12/22] =?UTF-8?q?!2172=20GetNext=E8=B6=85=E6=97=B6?= =?UTF-8?q?=E6=97=A5=E5=BF=97=E4=B8=8A=E6=8A=A5=20Merge=20pull=20request?= =?UTF-8?q?=20!2172=20from=20yaolun/florence=5Fc29?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../depends/ascendcl/src/ascendcl_stub.cc | 8 ++++ .../depends/ascendcl/src/ascendcl_stub.h | 3 ++ .../dataset/host_queue_dats_set_st.cc | 46 +++++++++++++++++++ .../dataset/host_queue_dats_set_ut.cc | 46 +++++++++++++++++++ tf_adapter/util/acl_channel.cc | 5 ++ 5 files changed, 108 insertions(+) diff --git a/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.cc b/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.cc index a53675814..8685e17d7 100644 --- a/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.cc +++ b/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.cc @@ -234,9 +234,17 @@ aclError acltdtAddDataItem(acltdtDataset *dataset, acltdtDataItem *dataItem) { return ACL_SUCCESS; } +bool gAclTdtSendTensorMock = false; +void setAclTdtSendTensorMockStub(const bool isDriverSuccess) { + gAclTdtSendTensorMock = isDriverSuccess; +} + aclError acltdtSendTensor(const acltdtChannelHandle *handle, const acltdtDataset *dataset, int32_t timeout) { + if (gAclTdtSendTensorMock) { + return ACL_ERROR_DRV_FAILURE; + } if (dataset == nullptr || 
handle == nullptr) { return ACL_ERROR_INVALID_PARAM; } diff --git a/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.h b/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.h index bb3f61969..a7df8034b 100644 --- a/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.h +++ b/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.h @@ -36,6 +36,9 @@ void SetTensorDescSize(uint32_t val); extern bool g_loadModelStatus; void SetAclLoadModelFlag(bool load_status); +extern bool gAclTdtSendTensorMock; +void setAclTdtSendTensorMockStub(const bool isSuccess); + struct acltdtDataItem { acltdtDataItem(acltdtTensorType tdtType, const int64_t *dims, size_t dimNum, const std::string &dimsStr, diff --git a/tf_adapter/tests/st/kernels/testcase/dataset/host_queue_dats_set_st.cc b/tf_adapter/tests/st/kernels/testcase/dataset/host_queue_dats_set_st.cc index c501769d1..a7f704120 100644 --- a/tf_adapter/tests/st/kernels/testcase/dataset/host_queue_dats_set_st.cc +++ b/tf_adapter/tests/st/kernels/testcase/dataset/host_queue_dats_set_st.cc @@ -673,6 +673,52 @@ TEST_F(HostQueueDatasetOpTest, isholddatatrans3) { &end_of_sequence)); } +TEST_F(HostQueueDatasetOpTest, senddata_driver_error) { + setAclTdtSendTensorMockStub(true); + NpuAttrs::SetNewDataTransferFlag(true); + int thread_num = 2, cpu_num = 2; + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + + const TestCase &test_case = NormalizeTestCase(); + Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({})); + std::vector inputs_for_tensor_slice_dataset = test_case.input_tensors; + TF_ASSERT_OK(CreateTensorSliceDatasetTensorForQueue(&inputs_for_tensor_slice_dataset, + &tensor_slice_dataset_tensor)); + + gtl::InlinedVector inputs_for_host_queue_dataset( + {TensorValue(&tensor_slice_dataset_tensor), + TensorValue(&tensor_slice_dataset_tensor)}); + + std::unique_ptr host_queue_dataset_kernel; + TF_ASSERT_OK(CreateHostQueueDatasetKernel(test_case.expected_output_dtypes, + test_case.expected_output_shapes, + &host_queue_dataset_kernel, "-1")); + std::unique_ptr host_queue_dataset_context; + TF_ASSERT_OK(CreateHostQueueDatasetContext(host_queue_dataset_kernel.get(), + &inputs_for_host_queue_dataset, + &host_queue_dataset_context)); + DatasetBase *host_queue_dataset; + TF_ASSERT_OK(CreateDataset(host_queue_dataset_kernel.get(), + host_queue_dataset_context.get(), + &host_queue_dataset)); + core::ScopedUnref scoped_unref(host_queue_dataset); + + SerializationContext context(SerializationContext::Params{}); + GraphDefBuilder b; + DatasetBase::DatasetGraphDefBuilder db(&b); + Node *output; + host_queue_dataset->AsGraphDefInternal(&context, &db, &output); + + std::unique_ptr iterator_context; + TF_ASSERT_OK(CreateIteratorContext(host_queue_dataset_context.get(), + &iterator_context)); + std::unique_ptr iterator; + TF_ASSERT_OK(host_queue_dataset->MakeIterator(iterator_context.get(), + "Iterator", &iterator)); + sleep(2); + setAclTdtSendTensorMockStub(false); +} } // namespace } // namespace data } // namespace tensorflow diff --git a/tf_adapter/tests/ut/kernels/testcase/dataset/host_queue_dats_set_ut.cc b/tf_adapter/tests/ut/kernels/testcase/dataset/host_queue_dats_set_ut.cc index 1c92176d8..74600dd63 100644 --- a/tf_adapter/tests/ut/kernels/testcase/dataset/host_queue_dats_set_ut.cc +++ b/tf_adapter/tests/ut/kernels/testcase/dataset/host_queue_dats_set_ut.cc @@ -816,6 +816,52 @@ TEST_F(HostQueueDatasetOpTest, isholddatatrans3) { &end_of_sequence)); } +TEST_F(HostQueueDatasetOpTest, senddata_driver_error) { + 
setAclTdtSendTensorMockStub(true); + NpuAttrs::SetNewDataTransferFlag(true); + int thread_num = 2, cpu_num = 2; + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + + const TestCase &test_case = NormalizeTestCase(); + Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({})); + std::vector inputs_for_tensor_slice_dataset = test_case.input_tensors; + TF_ASSERT_OK(CreateTensorSliceDatasetTensorForQueue(&inputs_for_tensor_slice_dataset, + &tensor_slice_dataset_tensor)); + + gtl::InlinedVector inputs_for_host_queue_dataset( + {TensorValue(&tensor_slice_dataset_tensor), + TensorValue(&tensor_slice_dataset_tensor)}); + + std::unique_ptr host_queue_dataset_kernel; + TF_ASSERT_OK(CreateHostQueueDatasetKernel(test_case.expected_output_dtypes, + test_case.expected_output_shapes, + &host_queue_dataset_kernel, "-1")); + std::unique_ptr host_queue_dataset_context; + TF_ASSERT_OK(CreateHostQueueDatasetContext(host_queue_dataset_kernel.get(), + &inputs_for_host_queue_dataset, + &host_queue_dataset_context)); + DatasetBase *host_queue_dataset; + TF_ASSERT_OK(CreateDataset(host_queue_dataset_kernel.get(), + host_queue_dataset_context.get(), + &host_queue_dataset)); + core::ScopedUnref scoped_unref(host_queue_dataset); + + SerializationContext context(SerializationContext::Params{}); + GraphDefBuilder b; + DatasetBase::DatasetGraphDefBuilder db(&b); + Node *output; + host_queue_dataset->AsGraphDefInternal(&context, &db, &output); + + std::unique_ptr iterator_context; + TF_ASSERT_OK(CreateIteratorContext(host_queue_dataset_context.get(), + &iterator_context)); + std::unique_ptr iterator; + TF_ASSERT_OK(host_queue_dataset->MakeIterator(iterator_context.get(), + "Iterator", &iterator)); + sleep(2); + setAclTdtSendTensorMockStub(false); +} } // namespace } // namespace data } // namespace tensorflow diff --git a/tf_adapter/util/acl_channel.cc b/tf_adapter/util/acl_channel.cc index 193d03517..dd42b9b1b 100644 --- a/tf_adapter/util/acl_channel.cc +++ b/tf_adapter/util/acl_channel.cc @@ -22,6 +22,7 @@ #include "tf_adapter/common/compat_tf1_tf2.h" #include "tf_adapter/util/npu_attrs.h" #include "tf_adapter/util/util.h" +#include "ge/ge_api.h" namespace tensorflow { Status MappingTfDtypeToAcl(const tensorflow::DataType tf_type, aclDataType &acl_type) { const static std::map type_mapping = { @@ -251,6 +252,10 @@ Status SendTensorsByAcl(const acltdtChannelHandle *acl_handle, acltdtTensorType return Status::OK(); } if (acl_status != ACL_ERROR_NONE) { + std::string error_message = ge::GEGetErrorMsg(); + LOG(ERROR) << "Failed to send data by acl, error code : "<< acl_status << std::endl + << "Error Message is " << std::endl + << error_message; return errors::Internal("Acl send data failed, acl status:", acl_status); } return Status::OK(); -- Gitee From cae7047df9667551fc684b4bba91959861385cce Mon Sep 17 00:00:00 2001 From: huanruizhi Date: Thu, 30 Mar 2023 06:08:14 +0000 Subject: [PATCH 13/22] =?UTF-8?q?!2176=20=E9=9D=99=E6=80=81=E6=88=90?= =?UTF-8?q?=E5=91=98=E5=8A=A0=E9=94=81=E5=90=8C=E6=AD=A5=E5=88=B0=E5=88=86?= =?UTF-8?q?=E6=94=AF=20Merge=20pull=20request=20!2176=20from=20huanruizhi/?= =?UTF-8?q?r1.12=5Fdev?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tf_adapter/util/npu_attrs.cc | 182 ++++++++++++++++++----------------- tf_adapter/util/npu_attrs.h | 2 + 2 files changed, 95 insertions(+), 89 deletions(-) diff --git a/tf_adapter/util/npu_attrs.cc b/tf_adapter/util/npu_attrs.cc index 689a406b8..caf83d9aa 100644 
--- a/tf_adapter/util/npu_attrs.cc +++ b/tf_adapter/util/npu_attrs.cc @@ -15,7 +15,6 @@ */ #include "tf_adapter/util/npu_attrs.h" -#include #include #include #include @@ -43,6 +42,7 @@ std::map NpuAttrs::turn_on_tdt_info_; std::map NpuAttrs::use_adp_info_; std::map NpuAttrs::dataset_execute_info_; std::map NpuAttrs::init_options_; +std::mutex NpuAttrs::mutex_; const static int32_t kRuntimeTypeHeterogeneous = 1; bool NpuAttrs::CheckIsNewDataTransfer() { @@ -624,6 +624,7 @@ std::map NpuAttrs::GetInitOptions(const OpKernelConstr (void) ctx->GetAttr("_event_sync_timeout", &event_sync_timeout); } + std::lock_guard lock(mutex_); if (precision_mode.empty()) { init_options_[ge::PRECISION_MODE] = "allow_fp32_to_fp16"; } else { @@ -677,6 +678,7 @@ std::map NpuAttrs::GetInitOptions(const OpKernelConstr } std::map NpuAttrs::GetInitOptions() { + std::lock_guard lock(mutex_); return init_options_; } @@ -1554,6 +1556,11 @@ std::map NpuAttrs::GetDefaultPassOptions() { } Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options, Node *node) { + if (!node) { + ADP_LOG(ERROR) << "node is null."; + LOG(ERROR) << "node is null."; + return errors::Internal("node is null."); + } std::map sess_options; bool hcom_parallel = true; std::string graph_memory_max_size; @@ -1639,9 +1646,9 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options bool external_weight = false; bool frozen_variable = false; std::string variable_location = "Device"; + int64_t op_debug_level = 0; - const RewriterConfig &rewrite_options = - options.session_options->config.graph_options().rewrite_options(); + const RewriterConfig &rewrite_options = options.session_options->config.graph_options().rewrite_options(); for (const auto &custom_optimizer : rewrite_options.custom_optimizers()) { if (custom_optimizer.name() == "NpuOptimizer") { const auto ¶ms = custom_optimizer.parameter_map(); @@ -1736,9 +1743,7 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options } } if (params.count("op_debug_level") > 0) { - int64_t op_debug_level = params.at("op_debug_level").i(); - init_options_["op_debug_level"] = std::to_string(op_debug_level); - init_options_[ge::OP_DEBUG_LEVEL] = std::to_string(op_debug_level); + op_debug_level = params.at("op_debug_level").i(); LOG_DEPRECATED_WITH_REPLACEMENT(op_debug_level, op_debug_config); } if (params.count("enable_scope_fusion_passes") > 0) { @@ -2120,78 +2125,87 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options sess_options["external_weight"] = std::to_string(static_cast(external_weight)); sess_options["ge.externalWeight"] = std::to_string(static_cast(external_weight)); - init_options_["precision_mode"] = precision_mode; - if (precision_mode.empty()) { - init_options_[ge::PRECISION_MODE] = "allow_fp32_to_fp16"; - } else { - init_options_[ge::PRECISION_MODE] = precision_mode; - } - init_options_["profiling_mode"] = std::to_string(static_cast(profiling_mode)); - init_options_[ge::OPTION_EXEC_PROFILING_MODE] = std::to_string(static_cast(profiling_mode)); - init_options_["profiling_options"] = profiling_options; - init_options_[ge::OPTION_EXEC_PROFILING_OPTIONS] = profiling_options; - init_options_["ge.autoTuneMode"] = auto_tune_mode; - init_options_["graph_run_mode"] = std::to_string(graph_run_mode); - init_options_[ge::OPTION_GRAPH_RUN_MODE] = std::to_string(graph_run_mode); - init_options_["enable_scope_fusion_passes"] = enable_scope_fusion_passes; - init_options_[ge::OPTION_EXEC_ENABLE_SCOPE_FUSION_PASSES] 
= enable_scope_fusion_passes; - init_options_["enable_exception_dump"] = std::to_string(enable_exception_dump); - init_options_["ge.exec.enable_exception_dump"] = std::to_string(enable_exception_dump); - init_options_["ge.deterministic"] = std::to_string(deterministic); - init_options_["aoe_mode"] = aoe_mode; - init_options_["ge.jobType"] = aoe_mode; - init_options_["work_path"] = work_path; - init_options_["ge.tuningPath"] = work_path; - init_options_["distribute_config"] = distribute_config; - init_options_["op_compiler_cache_mode"] = op_compiler_cache_mode; - init_options_["ge.op_compiler_cache_mode"] = op_compiler_cache_mode; - init_options_["op_compiler_cache_dir"] = op_compiler_cache_dir; - init_options_["ge.op_compiler_cache_dir"] = op_compiler_cache_dir; - init_options_["debug_dir"] = debug_dir; - init_options_["ge.debugDir"] = debug_dir; - init_options_["device_type"] = device_type; - init_options_["ge.deviceType"] = device_type; - init_options_["soc_config"] = soc_config; - if (!soc_config.empty()) { - init_options_["ge.socVersion"] = soc_config; + { + std::lock_guard lock(mutex_); + init_options_["precision_mode"] = precision_mode; + if (precision_mode.empty()) { + init_options_[ge::PRECISION_MODE] = "allow_fp32_to_fp16"; + } else { + init_options_[ge::PRECISION_MODE] = precision_mode; + } + init_options_["op_debug_level"] = std::to_string(op_debug_level); + init_options_[ge::OP_DEBUG_LEVEL] = std::to_string(op_debug_level); + init_options_["profiling_mode"] = std::to_string(static_cast(profiling_mode)); + init_options_[ge::OPTION_EXEC_PROFILING_MODE] = std::to_string(static_cast(profiling_mode)); + init_options_["profiling_options"] = profiling_options; + init_options_[ge::OPTION_EXEC_PROFILING_OPTIONS] = profiling_options; + init_options_["ge.autoTuneMode"] = auto_tune_mode; + init_options_["graph_run_mode"] = std::to_string(graph_run_mode); + init_options_[ge::OPTION_GRAPH_RUN_MODE] = std::to_string(graph_run_mode); + init_options_["enable_scope_fusion_passes"] = enable_scope_fusion_passes; + init_options_[ge::OPTION_EXEC_ENABLE_SCOPE_FUSION_PASSES] = enable_scope_fusion_passes; + init_options_["enable_exception_dump"] = std::to_string(enable_exception_dump); + init_options_["ge.exec.enable_exception_dump"] = std::to_string(enable_exception_dump); + init_options_["ge.deterministic"] = std::to_string(deterministic); + init_options_["aoe_mode"] = aoe_mode; + init_options_["ge.jobType"] = aoe_mode; + init_options_["work_path"] = work_path; + init_options_["ge.tuningPath"] = work_path; + init_options_["distribute_config"] = distribute_config; + init_options_["op_compiler_cache_mode"] = op_compiler_cache_mode; + init_options_["ge.op_compiler_cache_mode"] = op_compiler_cache_mode; + init_options_["op_compiler_cache_dir"] = op_compiler_cache_dir; + init_options_["ge.op_compiler_cache_dir"] = op_compiler_cache_dir; + init_options_["debug_dir"] = debug_dir; + init_options_["ge.debugDir"] = debug_dir; + init_options_["device_type"] = device_type; + init_options_["ge.deviceType"] = device_type; + init_options_["soc_config"] = soc_config; + if (!soc_config.empty()) { + init_options_["ge.socVersion"] = soc_config; + } + init_options_["op_wait_timeout"] = op_wait_timeout; + init_options_["ge.exec.opWaitTimeout"] = op_wait_timeout; + init_options_["op_execute_timeout"] = op_execute_timeout; + init_options_["ge.exec.opExecuteTimeout"] = op_execute_timeout; + init_options_["customize_dtypes"] = customize_dtypes; + init_options_["ge.customizeDtypes"] = customize_dtypes; + 
init_options_["op_debug_config"] = op_debug_config; + init_options_["ge.exec.opDebugConfig"] = op_debug_config; + init_options_["static_memory_policy"] = static_memory_policy; + // Commercial version has been released, temporarily used + init_options_["GE_USE_STATIC_MEMORY"] = static_memory_policy; + init_options_["ge.exec.staticMemoryPolicy"] = static_memory_policy; + + init_options_["ge.hcomMultiMode"] = std::to_string(hcom_multi_mode); + init_options_[ge::MODIFY_MIXLIST] = modify_mixlist; + init_options_["ge.fusionSwitchFile"] = fusion_switch_file; + init_options_[ge::OP_PRECISION_MODE] = op_precision_mode; + init_options_[ge::OP_SELECT_IMPL_MODE] = op_select_implmode; + init_options_[ge::OPTYPELIST_FOR_IMPLMODE] = optypelist_for_implmode; + init_options_["ge.exec.hcclExecuteTimeOut"] = hccl_timeout; + init_options_["HCCL_algorithm"] = HCCL_algorithm; + init_options_["graph_exec_timeout"] = std::to_string(graph_exec_timeout); + init_options_["ge.exec.graphExecTimeout"] = std::to_string(graph_exec_timeout); + init_options_["logical_device_cluster_deploy_mode"] = logical_device_cluster_deploy_mode; + init_options_["ge.exec.logicalDeviceClusterDeployMode"] = logical_device_cluster_deploy_mode; + init_options_["logical_device_id"] = logical_device_id; + init_options_["ge.exec.logicalDeviceId"] = logical_device_id; + init_options_["model_deploy_mode"] = model_deploy_mode; + init_options_["ge.exec.modelDeployMode"] = model_deploy_mode; + init_options_["model_deploy_devicelist"] = model_deploy_devicelist; + init_options_["ge.exec.modelDeployDevicelist"] = model_deploy_devicelist; + init_options_["dump_data"] = dump_data; + init_options_["ge.exec.dumpData"] = dump_data; + init_options_["aoe_config_file"] = aoe_config_file; + init_options_["ge.aoe_config_file"] = aoe_config_file; + init_options_["stream_sync_timeout"] = std::to_string(stream_sync_timeout); + init_options_["event_sync_timeout"] = std::to_string(event_sync_timeout); + for (const auto &option : init_options_) { + std::string attr_name = std::string("_") + option.first; + node->AddAttr(attr_name, option.second); + } } - init_options_["op_wait_timeout"] = op_wait_timeout; - init_options_["ge.exec.opWaitTimeout"] = op_wait_timeout; - init_options_["op_execute_timeout"] = op_execute_timeout; - init_options_["ge.exec.opExecuteTimeout"] = op_execute_timeout; - init_options_["customize_dtypes"] = customize_dtypes; - init_options_["ge.customizeDtypes"] = customize_dtypes; - init_options_["op_debug_config"] = op_debug_config; - init_options_["ge.exec.opDebugConfig"] = op_debug_config; - init_options_["static_memory_policy"] = static_memory_policy; - // Commercial version has been released, temporarily used - init_options_["GE_USE_STATIC_MEMORY"] = static_memory_policy; - init_options_["ge.exec.staticMemoryPolicy"] = static_memory_policy; - - init_options_["ge.hcomMultiMode"] = std::to_string(hcom_multi_mode); - init_options_[ge::MODIFY_MIXLIST] = modify_mixlist; - init_options_["ge.fusionSwitchFile"] = fusion_switch_file; - init_options_[ge::OP_PRECISION_MODE] = op_precision_mode; - init_options_[ge::OP_SELECT_IMPL_MODE] = op_select_implmode; - init_options_[ge::OPTYPELIST_FOR_IMPLMODE] = optypelist_for_implmode; - init_options_["ge.exec.hcclExecuteTimeOut"] = hccl_timeout; - init_options_["HCCL_algorithm"] = HCCL_algorithm; - init_options_["graph_exec_timeout"] = std::to_string(graph_exec_timeout); - init_options_["ge.exec.graphExecTimeout"] = std::to_string(graph_exec_timeout); - init_options_["logical_device_cluster_deploy_mode"] = 
logical_device_cluster_deploy_mode; - init_options_["ge.exec.logicalDeviceClusterDeployMode"] = logical_device_cluster_deploy_mode; - init_options_["logical_device_id"] = logical_device_id; - init_options_["ge.exec.logicalDeviceId"] = logical_device_id; - init_options_["model_deploy_mode"] = model_deploy_mode; - init_options_["ge.exec.modelDeployMode"] = model_deploy_mode; - init_options_["model_deploy_devicelist"] = model_deploy_devicelist; - init_options_["ge.exec.modelDeployDevicelist"] = model_deploy_devicelist; - init_options_["dump_data"] = dump_data; - init_options_["ge.exec.dumpData"] = dump_data; - init_options_["aoe_config_file"] = aoe_config_file; - init_options_["ge.aoe_config_file"] = aoe_config_file; - init_options_["stream_sync_timeout"] = std::to_string(stream_sync_timeout); - init_options_["event_sync_timeout"] = std::to_string(event_sync_timeout); pass_options["do_npu_optimizer"] = std::to_string(static_cast(do_npu_optimizer)); pass_options["enable_data_pre_proc"] = std::to_string(static_cast(enable_dp)); @@ -2211,22 +2225,12 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options pass_options["frozen_variable"] = std::to_string(static_cast(frozen_variable)); pass_options["variable_location"] = variable_location; - if (!node) { - ADP_LOG(ERROR) << "node is null."; - LOG(ERROR) << "node is null."; - return errors::Internal("node is null."); - } - std::string attr_name; for (const auto &option : sess_options) { - attr_name = std::string("_") + option.first; - node->AddAttr(attr_name, option.second); - } - for (const auto &option : init_options_) { - attr_name = std::string("_") + option.first; + std::string attr_name = std::string("_") + option.first; node->AddAttr(attr_name, option.second); } for (const auto &option : pass_options) { - attr_name = std::string("_") + option.first; + std::string attr_name = std::string("_") + option.first; node->AddAttr(attr_name, option.second); } node->AddAttr("_NpuOptimizer", "NpuOptimizer"); diff --git a/tf_adapter/util/npu_attrs.h b/tf_adapter/util/npu_attrs.h index 7e73c693a..2f0def1ec 100644 --- a/tf_adapter/util/npu_attrs.h +++ b/tf_adapter/util/npu_attrs.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include "ge/ge_api_types.h" #include "tensorflow/core/common_runtime/optimization_registry.h" @@ -93,6 +94,7 @@ class NpuAttrs { static std::map use_adp_info_; static std::map dataset_execute_info_; static std::map init_options_; + static std::mutex mutex_; }; } // namespace tensorflow -- Gitee From 023a9c15bd46ebbbd04dcb6e449e593f5259d7f2 Mon Sep 17 00:00:00 2001 From: xujiuxu Date: Thu, 30 Mar 2023 06:50:19 +0000 Subject: [PATCH 14/22] =?UTF-8?q?!2178=20=E5=91=8A=E8=AD=A6=E6=B8=85?= =?UTF-8?q?=E7=90=86=20Merge=20pull=20request=20!2178=20from=20xujiuxu/che?= =?UTF-8?q?rry-pick-1680148588?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tf_adapter/optimizers/frozen_variable_pass.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tf_adapter/optimizers/frozen_variable_pass.cc b/tf_adapter/optimizers/frozen_variable_pass.cc index 02582727e..951f66ab3 100644 --- a/tf_adapter/optimizers/frozen_variable_pass.cc +++ b/tf_adapter/optimizers/frozen_variable_pass.cc @@ -49,10 +49,10 @@ class FrozenVariablePass : public GraphOptimizationPass { private: bool IsAllOutputsIdentity(const Node * const node) const; bool IsAllOutputsReadOp(const Node * const node) const; - bool IsNeedBuildPartitionedCall(const Node * const node); + 
bool IsNeedBuildPartitionedCall(const Node * const node) const; std::map GetGraphConfigs(const Graph &graph) const; void RemoveDeadNodes(Graph* g) const; - Status DoConstantFolding(const GraphOptimizationPassOptions &options, const uint64_t index); + Status DoConstantFolding(const GraphOptimizationPassOptions &options, const uint64_t index) const; }; struct StableNodeCompartor { @@ -79,7 +79,7 @@ bool FrozenVariablePass::IsAllOutputsReadOp(const Node * const node) const { return true; } -bool FrozenVariablePass::IsNeedBuildPartitionedCall(const Node * const node) { +bool FrozenVariablePass::IsNeedBuildPartitionedCall(const Node * const node) const { return ((node->type_string() == "Variable" || node->type_string() == "VariableV2") && IsAllOutputsIdentity(node)) || (node->type_string() == "VarHandleOp" && IsAllOutputsReadOp(node)); } @@ -105,7 +105,7 @@ void FrozenVariablePass::RemoveDeadNodes(Graph* g) const { } Status FrozenVariablePass::DoConstantFolding(const GraphOptimizationPassOptions &options, - const uint64_t index) { + const uint64_t index) const { ADP_LOG(INFO) << "Before do const folding " << options.session_options->config.DebugString(); if (options.device_set == nullptr) { return errors::Internal("Failed to get device set to run constant folding"); -- Gitee From bd4c42c4f0e17358ba5ebf047f31be6a5281e65a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=B6=9B?= Date: Mon, 3 Apr 2023 06:58:13 +0000 Subject: [PATCH 15/22] =?UTF-8?q?!2186=20update=20owners=20Merge=20pull=20?= =?UTF-8?q?request=20!2186=20from=20=E7=8E=8B=E6=B6=9B/r1.12.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- OWNERS | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/OWNERS b/OWNERS index c122c07b1..850ab2fd0 100644 --- a/OWNERS +++ b/OWNERS @@ -1,14 +1,7 @@ approvers: -- wqtshg -- ji_chen +- startzgf168 +- andylhy - zhangfan_hq -- lipeiyang3699 -- wangtao43 -- changhaixun -- z00332957 -- wangxiaotian22 -- xiexianhu -- xiaozhedeng reviewers: - xchu42 - sheng-nan -- Gitee From 1a9d266f15b0fb0d31c768b38049c0fb11050027 Mon Sep 17 00:00:00 2001 From: yaolun Date: Tue, 4 Apr 2023 09:10:53 +0000 Subject: [PATCH 16/22] =?UTF-8?q?!2191=20[=E8=B4=A8=E9=87=8F=E6=8F=90?= =?UTF-8?q?=E5=8D=87]=E5=AE=89=E5=85=A8=E5=87=BD=E6=95=B0=E6=95=B4?= =?UTF-8?q?=E6=94=B9=20Merge=20pull=20request=20!2191=20from=20yaolun/flor?= =?UTF-8?q?ence=5Fc29?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tf_adapter/util/host_queue.cc | 23 +++++++++++++---- tf_adapter/util/util.cc | 26 +++++++++++++++++--- tf_adapter/util/util.h | 2 ++ tf_adapter_2.x/npu_device/core/npu_hdc.cpp | 18 ++++++++------ tf_adapter_2.x/npu_device/core/npu_utils.cpp | 9 ++++--- tf_adapter_2.x/npu_device/core/npu_utils.h | 2 +- 6 files changed, 61 insertions(+), 19 deletions(-) diff --git a/tf_adapter/util/host_queue.cc b/tf_adapter/util/host_queue.cc index f0fb9b833..7e082fc2a 100644 --- a/tf_adapter/util/host_queue.cc +++ b/tf_adapter/util/host_queue.cc @@ -216,19 +216,32 @@ Status SerializeDataItemInfo(std::vector &items, void *&buff, cons } size_t offset = 0UL; for (size_t i = 0UL; i < cnt; ++i) { - // can not use memcpy_s here, data size may over 2G - // total_size is calculate by item info, could not overflow here - (void)memcpy(ge::ValueToPtr(ge::PtrToValue(data) + offset), &items[i].ctrl_info, sizeof(ItemInfo)); + auto ret = memcpy_s(ge::ValueToPtr(ge::PtrToValue(data) + offset), sizeof(ItemInfo), + &items[i].ctrl_info, 
sizeof(ItemInfo)); + if (ret != EOK) { + (void)rtMbufFree(buff); + return errors::Internal("Copy item info failed, ret=", ret); + } offset += sizeof(ItemInfo); for (size_t j = 0UL; j < items[i].ctrl_info.dim_num; ++j) { - (void)memcpy(ge::ValueToPtr(ge::PtrToValue(data) + offset), &(items[i].dims[j]), sizeof(int64_t)); + ret = memcpy_s(ge::ValueToPtr(ge::PtrToValue(data) + offset), sizeof(int64_t), + &(items[i].dims[j]), sizeof(int64_t)); + if (ret != EOK) { + (void)rtMbufFree(buff); + return errors::Internal("Copy dim info failed, ret=", ret); + } offset += sizeof(int64_t); } if (items[i].ctrl_info.data_len == 0UL) { continue; } - (void)memcpy(ge::ValueToPtr(ge::PtrToValue(data) + offset), items[i].data_ptr, items[i].ctrl_info.data_len); + auto status = LoopCopy(static_cast(ge::ValueToPtr(ge::PtrToValue(data) + offset)), (total_size - offset), + static_cast(items[i].data_ptr), items[i].ctrl_info.data_len); + if (!status.ok()) { + (void)rtMbufFree(buff); + return status; + } offset += items[i].ctrl_info.data_len; } diff --git a/tf_adapter/util/util.cc b/tf_adapter/util/util.cc index 8cb0b1fc3..4fcc26321 100644 --- a/tf_adapter/util/util.cc +++ b/tf_adapter/util/util.cc @@ -46,9 +46,9 @@ Status GetDtStringTensorData(const Tensor &tensor, uint8_t *&data_ptr, uint64_t ge::StringHead *head = ge::PtrToPtr(base_ptr + i * sizeof(ge::StringHead)); head->addr = offset; head->len = tensor.flat()(i).size(); - // can not use memcpy_s here, data size may over 2G - // total_size is calculate by item info, could not overflow here - (void)memcpy(base_ptr + offset, tensor.flat()(i).data(), head->len); + auto status = LoopCopy(ge::PtrToPtr(base_ptr + offset), (buff_size - offset), + const_cast(tensor.flat()(i).data()), head->len); + if (!status.ok()) { return status; } offset += head->len; } data_ptr = buff_list.back().get(); @@ -97,6 +97,26 @@ Status MappingDtStringTensor2AclDataItem(const Tensor &tensor, acltdtDataItem *& return Status::OK(); } +Status LoopCopy(char *dst_ptr, size_t dst_size, char *src_ptr, size_t src_size) { + if (dst_size < src_size) { + return tensorflow::errors::Internal("Loop memory copy failed. dst_size:", dst_size, ", src_size:", src_size); + } + size_t copy_size = 0UL; + size_t org_src_size = src_size; + do { + size_t src_copy_size = (src_size > SECUREC_MEM_MAX_LEN) ? 
SECUREC_MEM_MAX_LEN : src_size; + if (memcpy_s(dst_ptr, src_copy_size, src_ptr, src_copy_size) != EOK) { + return tensorflow::errors::Internal("Loop memory copy failed , dst_size:", src_copy_size, + ", src_size:", src_copy_size); + } + copy_size += src_copy_size; + dst_ptr += src_copy_size; + src_ptr += src_copy_size; + src_size -= src_copy_size; + } while (copy_size < org_src_size); + return tensorflow::Status::OK(); +} + bool IsWithoutNpuScope(const NodeDef &node_def) { if (node_def.attr().count(ATTR_VALUE_SCOPE_NAME) > 0) { return node_def.attr().at(ATTR_VALUE_SCOPE_NAME).b(); } return false; diff --git a/tf_adapter/util/util.h b/tf_adapter/util/util.h index e24910144..0390d1ad1 100644 --- a/tf_adapter/util/util.h +++ b/tf_adapter/util/util.h @@ -33,6 +33,8 @@ Status MappingDTStringTensor2DataItem(const Tensor &tensor, tdt::DataItem &item, Status MappingDtStringTensor2AclDataItem(const Tensor &tensor, acltdtDataItem *&acl_data, std::vector> &buff_list); +Status LoopCopy(char *dst_ptr, size_t dst_size, char *src_ptr, size_t src_size); + bool IsWithoutNpuScope(const NodeDef &node_def); bool IsWithoutNpuScope(const Node *node); bool IsVariableOrResourceVariable(const Node * const node); diff --git a/tf_adapter_2.x/npu_device/core/npu_hdc.cpp b/tf_adapter_2.x/npu_device/core/npu_hdc.cpp index 3aaf5f38b..4c321ecbe 100644 --- a/tf_adapter_2.x/npu_device/core/npu_hdc.cpp +++ b/tf_adapter_2.x/npu_device/core/npu_hdc.cpp @@ -26,7 +26,7 @@ constexpr size_t kParallelMemCopyThreshold = 10 * 1024 * 1024UL; /** * @brief: parallel mem copy */ -tensorflow::Status Copy2ContinuousMem(void *dst_ptr, void *src_ptr, const size_t src_size) { +tensorflow::Status Copy2ContinuousMem(void *dst_ptr, const size_t dst_size, void *src_ptr, const size_t src_size) { if (dst_ptr == nullptr || src_ptr == nullptr) { return tensorflow::errors::Internal("dst_ptr or src_ptr is null before do parallel memory copy."); } @@ -44,19 +44,21 @@ tensorflow::Status Copy2ContinuousMem(void *dst_ptr, void *src_ptr, const size_t size_t block_size = src_size / npu::kDefaultThreadNum; size_t remained_size = src_size % npu::kDefaultThreadNum; std::vector copy_results(npu::kDefaultThreadNum); + size_t dst_remain_size = dst_size; for (size_t i = 0UL; i < npu::kDefaultThreadNum; i++) { if (i == npu::kDefaultThreadNum - 1U) { block_size += remained_size; } auto &ret = copy_results[i]; - std::function closure = [dst_ptr, src_ptr, block_size, &ret, ¶llel_cpy_count]() { - ret = npu::LoopCopy(static_cast(dst_ptr), static_cast(src_ptr), block_size); + std::function closure = [dst_ptr, dst_remain_size, src_ptr, block_size, &ret, ¶llel_cpy_count]() { + ret = npu::LoopCopy(static_cast(dst_ptr), dst_remain_size, static_cast(src_ptr), block_size); ++parallel_cpy_count; }; NPU_REQUIRES_OK(npu::NpuThreadPool::GetInstance().EnqueueTask(closure)); enqueue_count++; dst_ptr = reinterpret_cast(reinterpret_cast(dst_ptr) + block_size); src_ptr = reinterpret_cast(reinterpret_cast(src_ptr) + block_size); + dst_remain_size -= block_size; } while (parallel_cpy_count < enqueue_count) { } @@ -135,13 +137,14 @@ tensorflow::Status HdcChannel::AssembleAclTensor2Tensor(const acltdtDataItem *it tf_shape.AddDim(dim); } tensorflow::Tensor tensor = tensorflow::Tensor(tf_type, tf_shape); - auto tensor_data = tensor.data(); auto tensor_size = tensor.tensor_data().size(); if (tensor_size != acl_data_len) { return tensorflow::errors::Internal("Hdc channel receive size mismatch tensor size acl:", acl_data_len, " vs. 
tensorflow:", tensor_size); } - (void)memcpy(tensor_data, acl_data, tensor_size); + auto status = LoopCopy(static_cast(tensor.data()), tensor_size, + const_cast(acl_data), tensor_size); + if (!status.ok()) { return status; } tensors.emplace_back(std::move(tensor)); } else { return tensorflow::errors::InvalidArgument("Hdc channel receive un-copyable tensorflow data type", @@ -260,7 +263,8 @@ tensorflow::Status HdcChannel::AssembleTensors2AclDataset(acltdtTensorType acl_t for (auto &tensor : tensors) { total_size += tensor.TotalBytes(); } - tensors_buffer_.resize(std::max(tensors_buffer_.size(), total_size)); + size_t dst_size = std::max(tensors_buffer_.size(), total_size); + tensors_buffer_.resize(dst_size); bool npu_alloc = NpuAllocatorUtils::IsNpuAllocator(tensors[0]); @@ -275,7 +279,7 @@ tensorflow::Status HdcChannel::AssembleTensors2AclDataset(acltdtTensorType acl_t if (IsNeedContinuousMem() && !npu_alloc) { size_t src_size = tensor.TotalBytes(); tensor_data = tensors_buffer_.data() + offset; - NPU_REQUIRES_OK(Copy2ContinuousMem(tensor_data, tensor.data(), src_size)); + NPU_REQUIRES_OK(Copy2ContinuousMem(tensor_data, (dst_size - offset), tensor.data(), src_size)); offset += src_size; } acl_data = acltdtCreateDataItem(ACL_TENSOR_DATA_TENSOR, diff --git a/tf_adapter_2.x/npu_device/core/npu_utils.cpp b/tf_adapter_2.x/npu_device/core/npu_utils.cpp index 7b73b7ae3..37834619e 100644 --- a/tf_adapter_2.x/npu_device/core/npu_utils.cpp +++ b/tf_adapter_2.x/npu_device/core/npu_utils.cpp @@ -538,14 +538,17 @@ tensorflow::Status SeparateGraphDef(tensorflow::GraphDef *def, return tensorflow::Status::OK(); } -tensorflow::Status LoopCopy(char *dst_ptr, char *src_ptr, size_t src_size) { +tensorflow::Status LoopCopy(char *dst_ptr, size_t dst_size, char *src_ptr, size_t src_size) { + NPU_REQUIRES((dst_size >= src_size), + tensorflow::errors::Internal("Loop memory copy failed. dst_size:", dst_size, ", src_size:", src_size)); + size_t copy_size = 0UL; size_t org_src_size = src_size; do { size_t src_copy_size = (src_size > SECUREC_MEM_MAX_LEN) ? 
SECUREC_MEM_MAX_LEN : src_size; if (memcpy_s(dst_ptr, src_copy_size, src_ptr, src_copy_size) != EOK) { - return tensorflow::errors::Internal("loop memory copy failed , dst:", dst_ptr, ", dst_size:", src_copy_size, - ", src:", src_ptr, ", src_size:", src_copy_size); + return tensorflow::errors::Internal("loop memory copy failed , dst_size:", src_copy_size, + ", src_size:", src_copy_size); } copy_size += src_copy_size; dst_ptr += src_copy_size; diff --git a/tf_adapter_2.x/npu_device/core/npu_utils.h b/tf_adapter_2.x/npu_device/core/npu_utils.h index 57338d4dc..41945620c 100644 --- a/tf_adapter_2.x/npu_device/core/npu_utils.h +++ b/tf_adapter_2.x/npu_device/core/npu_utils.h @@ -157,7 +157,7 @@ class OptimizeStageGraphDumper { void NpuCustomizedOptimizeGraph(tensorflow::FunctionLibraryRuntime &lib, std::unique_ptr *g); -tensorflow::Status LoopCopy(char *dst_ptr, char *src_ptr, size_t src_size); +tensorflow::Status LoopCopy(char *dst_ptr, size_t dst_size, char *src_ptr, size_t src_size); int64_t CreateChannelCapacity(const npu::TensorPartialShapes &shapes, const npu::TensorDataTypes &types); -- Gitee From f058a9243afd9a9af8fe1472d658f464b7755596 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=B7=E6=AC=A2?= Date: Tue, 4 Apr 2023 12:27:56 +0000 Subject: [PATCH 17/22] =?UTF-8?q?!2194=20=E9=A2=84=E5=A4=84=E7=90=86H2D?= =?UTF-8?q?=E6=95=B0=E6=8D=AE=E5=8F=91=E9=80=81=E5=A4=B1=E8=B4=A5=E6=97=B6?= =?UTF-8?q?=E7=BB=93=E6=9D=9F=E8=AE=AD=E7=BB=83=20Merge=20pull=20request?= =?UTF-8?q?=20!2194=20from=20=E9=9B=B7=E6=AC=A2/r1.12.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- inc/external/acl/acl_base.h | 1 + .../tests/depends/ascendcl/src/ascendcl_stub.cc | 8 ++++---- .../tests/depends/ascendcl/src/ascendcl_stub.h | 2 +- ...queue_dats_set_st.cc => host_queue_dataset_st.cc} | 0 ...queue_dats_set_ut.cc => host_queue_dataset_ut.cc} | 0 tf_adapter/util/acl_channel.cc | 12 ++++++------ 6 files changed, 12 insertions(+), 11 deletions(-) rename tf_adapter/tests/st/kernels/testcase/dataset/{host_queue_dats_set_st.cc => host_queue_dataset_st.cc} (100%) rename tf_adapter/tests/ut/kernels/testcase/dataset/{host_queue_dats_set_ut.cc => host_queue_dataset_ut.cc} (100%) diff --git a/inc/external/acl/acl_base.h b/inc/external/acl/acl_base.h index 8f4da06ec..7a5be38c5 100644 --- a/inc/external/acl/acl_base.h +++ b/inc/external/acl/acl_base.h @@ -126,6 +126,7 @@ static const int ACL_ERROR_RT_FAILURE = 500003; static const int ACL_ERROR_DRV_FAILURE = 500004; static const int ACL_ERROR_PROFILING_FAILURE = 500005; + #define ACL_TENSOR_SHAPE_RANGE_NUM 2 #define ACL_UNKNOWN_RANK 0xFFFFFFFFFFFFFFFE diff --git a/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.cc b/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.cc index 8685e17d7..678259ccb 100644 --- a/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.cc +++ b/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.cc @@ -234,16 +234,16 @@ aclError acltdtAddDataItem(acltdtDataset *dataset, acltdtDataItem *dataItem) { return ACL_SUCCESS; } -bool gAclTdtSendTensorMock = false; +bool g_AclTdtSendTensorMock = false; void setAclTdtSendTensorMockStub(const bool isDriverSuccess) { - gAclTdtSendTensorMock = isDriverSuccess; + g_AclTdtSendTensorMock = isDriverSuccess; } aclError acltdtSendTensor(const acltdtChannelHandle *handle, const acltdtDataset *dataset, int32_t timeout) { - if (gAclTdtSendTensorMock) { - return ACL_ERROR_DRV_FAILURE; + if (g_AclTdtSendTensorMock) { + return ACL_ERROR_RT_QUEUE_FULL; } if (dataset == nullptr || 
handle == nullptr) { return ACL_ERROR_INVALID_PARAM; diff --git a/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.h b/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.h index a7df8034b..16c334f03 100644 --- a/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.h +++ b/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.h @@ -36,7 +36,7 @@ void SetTensorDescSize(uint32_t val); extern bool g_loadModelStatus; void SetAclLoadModelFlag(bool load_status); -extern bool gAclTdtSendTensorMock; +extern bool g_AclTdtSendTensorMock; void setAclTdtSendTensorMockStub(const bool isSuccess); struct acltdtDataItem { diff --git a/tf_adapter/tests/st/kernels/testcase/dataset/host_queue_dats_set_st.cc b/tf_adapter/tests/st/kernels/testcase/dataset/host_queue_dataset_st.cc similarity index 100% rename from tf_adapter/tests/st/kernels/testcase/dataset/host_queue_dats_set_st.cc rename to tf_adapter/tests/st/kernels/testcase/dataset/host_queue_dataset_st.cc diff --git a/tf_adapter/tests/ut/kernels/testcase/dataset/host_queue_dats_set_ut.cc b/tf_adapter/tests/ut/kernels/testcase/dataset/host_queue_dataset_ut.cc similarity index 100% rename from tf_adapter/tests/ut/kernels/testcase/dataset/host_queue_dats_set_ut.cc rename to tf_adapter/tests/ut/kernels/testcase/dataset/host_queue_dataset_ut.cc diff --git a/tf_adapter/util/acl_channel.cc b/tf_adapter/util/acl_channel.cc index dd42b9b1b..57973f90d 100644 --- a/tf_adapter/util/acl_channel.cc +++ b/tf_adapter/util/acl_channel.cc @@ -239,21 +239,21 @@ Status RecvTensorByAcl(const acltdtChannelHandle *acl_handle, std::vector &tensors, bool &is_need_resend) { - std::vector> buff_list; - acltdtDataset *acl_dataset = nullptr; is_need_resend = false; + acltdtDataset *acl_dataset = nullptr; + std::vector> buff_list; TF_RETURN_IF_ERROR(AssembleTensors2AclDataset(acl_type, tensors, &acl_dataset, buff_list)); - const int32_t kTimeOut = 3000; - auto acl_status = acltdtSendTensor(acl_handle, acl_dataset, kTimeOut); + const int32_t kTimeout = 3000; + auto acl_status = acltdtSendTensor(acl_handle, acl_dataset, kTimeout); TF_RETURN_IF_ERROR(DestroyAclDataset(acl_dataset)); if (acl_status == ACL_ERROR_RT_QUEUE_FULL) { is_need_resend = true; - ADP_LOG(INFO) << "Send data ret != 0 , need send data again."; + ADP_LOG(INFO) << "Queue is full , try to send data again."; return Status::OK(); } if (acl_status != ACL_ERROR_NONE) { std::string error_message = ge::GEGetErrorMsg(); - LOG(ERROR) << "Failed to send data by acl, error code : "<< acl_status << std::endl + LOG(FATAL) << "Failed to send data by acl, error code : "<< acl_status << std::endl << "Error Message is " << std::endl << error_message; return errors::Internal("Acl send data failed, acl status:", acl_status); -- Gitee From 8afc5bd1190419154fc01698c4939d5ab19b5108 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BE=AF=E8=B4=BA?= Date: Thu, 6 Apr 2023 08:56:24 +0000 Subject: [PATCH 18/22] =?UTF-8?q?!2195=20cm=5Fworker=5Fsize=E9=80=82?= =?UTF-8?q?=E9=85=8D=20Merge=20pull=20request=20!2195=20from=20=E4=BE=AF?= =?UTF-8?q?=E8=B4=BA/cherry-pick-1680592727?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../python/npu_bridge/estimator/npu/npu_common.py | 2 +- .../python/npu_bridge/estimator/npu/npu_optimizer.py | 12 ++++++------ .../python/npu_bridge/estimator/npu/npu_strategy.py | 3 ++- tf_adapter/python/npu_bridge/estimator/npu/util.py | 5 ++--- tf_adapter/python/npu_bridge/experimental/hccl.py | 3 ++- 5 files changed, 13 insertions(+), 12 deletions(-) diff --git 
a/tf_adapter/python/npu_bridge/estimator/npu/npu_common.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_common.py index e2a5b1e97..cbf621104 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_common.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_common.py @@ -208,7 +208,7 @@ class NPUBasics(object): checkpoint_dir = os.getenv('LOCAL_CHECKPOINT_DIR', "") # cann't get rank_size from env, set to default 1 - rank_size = os.getenv('RANK_SIZE', '1') + rank_size = util_lib.get_ranksize() if rank_size.isdigit() is False: print("set rank_size to default 1") rank_size = 1 diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_optimizer.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_optimizer.py index 36fb6485a..7448bcdfa 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_optimizer.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_optimizer.py @@ -303,7 +303,7 @@ class NPUDistributedOptimizer(tf.train.Optimizer): """ logging.debug("compute_gradients...") gradients = self._optimizer.compute_gradients(*args, **kwargs) - rank_size = os.getenv('RANK_SIZE') + rank_size = util.get_ranksize() if rank_size is None or int(rank_size) <= 1: return gradients @@ -322,7 +322,7 @@ class NPUDistributedOptimizer(tf.train.Optimizer): def apply_gradients(self, grads_and_vars, global_step=None, name=None): """Apply gradients on variables""" - rank_size = os.getenv('RANK_SIZE') + rank_size = util.get_ranksize() if rank_size is None or int(rank_size) <= 1: return self._optimizer.apply_gradients(grads_and_vars, global_step, name) @@ -405,7 +405,7 @@ class KerasDistributeOptimizer(optimizer_v2.OptimizerV2): def new_get_gradient(loss, params): grads = old_get_gradient(loss, params) - rank_size = os.getenv('RANK_SIZE', '1') + rank_size = util.get_ranksize() if rank_size is None or int(rank_size) <= 1: return grads averaged_grads = [] @@ -435,7 +435,7 @@ class KerasDistributeOptimizer(optimizer_v2.OptimizerV2): def _compute_gradients(self, loss, var_list, grad_loss=None): gradients = self._optimizer._compute_gradients(loss, var_list, grad_loss) - rank_size = os.getenv('RANK_SIZE', '1') + rank_size = util.get_ranksize() if rank_size is None or int(rank_size) <= 1: return gradients averaged_grads = [] @@ -453,7 +453,7 @@ def npu_distributed_optimizer_wrapper(optimizer): """ if isinstance(optimizer, str): optimizer = optimizers.get(optimizer) - rank_size = os.getenv('RANK_SIZE') + rank_size = util.get_ranksize() if hasattr(optimizer, "compute_gradients"): org_compute_gradients = optimizer.compute_gradients @@ -515,7 +515,7 @@ def _npu_allreduce(values, reduction="mean", fusion=1, fusion_id=-1, group="hccl reduction = "sum" reduced_values = [] - size = int(os.getenv("RANK_SIZE", "1")) + size = int(util.get_ranksize()) for value in values: if isinstance(value, tf.IndexedSlices): # For IndexedSlices, do two allgathers intead of an allreduce. 
diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_strategy.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_strategy.py index 062a1dec7..25936eba5 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_strategy.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_strategy.py @@ -20,6 +20,7 @@ import os from tensorflow.python.distribute import distribute_lib from tensorflow.python.distribute import one_device_strategy +from npu_bridge.estimator.npu import util as util_lib from hccl.manage.api import get_rank_size from hccl.manage.api import get_rank_id @@ -33,7 +34,7 @@ class NPUExtended(one_device_strategy.OneDeviceExtended): @property def _num_replicas_in_sync(self): - rank_size = os.getenv("RANK_SIZE", "1") + rank_size = util_lib.get_ranksize() return int(rank_size) def _experimental_distribute_dataset(self, dataset): diff --git a/tf_adapter/python/npu_bridge/estimator/npu/util.py b/tf_adapter/python/npu_bridge/estimator/npu/util.py index f48f1dc2f..b59752d7c 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/util.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/util.py @@ -232,10 +232,9 @@ def set_iteration_per_loop(sess, train_op, iterations_per_loop=1): def get_ranksize(): - if os.getenv("CM_WORK_SIZE") is not None and os.getenv("RANK_SIZE") is not None: + if os.getenv("CM_WORKER_SIZE") is not None and os.getenv("RANK_SIZE") is not None: raise ValueError("RANK_SIZE and CM_WORK_SIZE cannot be configured at the same time") - rank_size = os.getenv('RANK_SIZE') if os.getenv( - "RANK_SIZE") is not None else os.getenv('CM_WORK_SIZE', '1') + rank_size = os.getenv('RANK_SIZE') if os.getenv("RANK_SIZE") is not None else os.getenv('CM_WORKER_SIZE', '1') return rank_size diff --git a/tf_adapter/python/npu_bridge/experimental/hccl.py b/tf_adapter/python/npu_bridge/experimental/hccl.py index 9360b26c6..e33a55a11 100644 --- a/tf_adapter/python/npu_bridge/experimental/hccl.py +++ b/tf_adapter/python/npu_bridge/experimental/hccl.py @@ -19,6 +19,7 @@ import os import ctypes +from npu_bridge.estimator.npu import util as util_lib hccl_graph_adp_ctypes = ctypes.CDLL('libhcom_graph_adaptor.so') @@ -38,7 +39,7 @@ def get_actual_rank_size(group="hccl_world_group"): def get_user_rank_size(): - rank_size = int(os.getenv('RANK_SIZE')) + rank_size = int(util_lib.get_ranksize()) return rank_size -- Gitee From 22f0d4b75bac025ef10dd469c98b2bcd0fd273c3 Mon Sep 17 00:00:00 2001 From: yaolun Date: Sat, 8 Apr 2023 11:16:10 +0000 Subject: [PATCH 19/22] =?UTF-8?q?!2202=20HostQueue=E5=8F=91=E6=95=B0?= =?UTF-8?q?=E6=8D=AE=E5=BC=82=E5=B8=B8=E6=97=B6=E7=AD=89=E5=BE=85=E6=97=A5?= =?UTF-8?q?=E5=BF=97=E8=90=BD=E7=9B=98=20Merge=20pull=20request=20!2202=20?= =?UTF-8?q?from=20yaolun/florence=5Fc29?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tf_adapter/kernels/aicpu/host_queue_dataset_op.cc | 6 +++--- tf_adapter/util/acl_channel.cc | 11 ++++++++--- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/tf_adapter/kernels/aicpu/host_queue_dataset_op.cc b/tf_adapter/kernels/aicpu/host_queue_dataset_op.cc index 070404811..14a452d9c 100644 --- a/tf_adapter/kernels/aicpu/host_queue_dataset_op.cc +++ b/tf_adapter/kernels/aicpu/host_queue_dataset_op.cc @@ -628,7 +628,7 @@ class HostQueueDatasetOp : public DatasetOpKernel { Status SendDataByAclQueue(const vector &args, const acltdtTensorType &data_type, const uint64_t args_total_bytes) { Status status = Status::OK(); - bool is_need_resend = false; + bool need_resend = false; while (!finish_send_) { if 
(IsHoldDataTrans()) { @@ -640,9 +640,9 @@ class HostQueueDatasetOp : public DatasetOpKernel { continue; } auto start = std::chrono::steady_clock::now(); - status = SendTensorsByAcl(acl_handle_, data_type, args, is_need_resend); + status = SendTensorsByAcl(acl_handle_, data_type, args, need_resend); if (!status.ok()) { break; } - if (!is_need_resend) { + if (!need_resend) { auto end = std::chrono::steady_clock::now(); auto elapsed_time = std::chrono::duration(end - start).count(); RefreshDataThreadPerf(ThreadType::SEND, elapsed_time, args_total_bytes); diff --git a/tf_adapter/util/acl_channel.cc b/tf_adapter/util/acl_channel.cc index 57973f90d..faa01b8d7 100644 --- a/tf_adapter/util/acl_channel.cc +++ b/tf_adapter/util/acl_channel.cc @@ -24,6 +24,10 @@ #include "tf_adapter/util/util.h" #include "ge/ge_api.h" namespace tensorflow { +namespace { + const uint32_t kWaitingForLogRecord = 1U; +} + Status MappingTfDtypeToAcl(const tensorflow::DataType tf_type, aclDataType &acl_type) { const static std::map type_mapping = { {DT_FLOAT, ACL_FLOAT}, {DT_HALF, ACL_FLOAT16}, {DT_INT8, ACL_INT8}, @@ -238,8 +242,8 @@ Status RecvTensorByAcl(const acltdtChannelHandle *acl_handle, std::vector &tensors, bool &is_need_resend) { - is_need_resend = false; + const std::vector &tensors, bool &need_resend) { + need_resend = false; acltdtDataset *acl_dataset = nullptr; std::vector> buff_list; TF_RETURN_IF_ERROR(AssembleTensors2AclDataset(acl_type, tensors, &acl_dataset, buff_list)); @@ -247,11 +251,12 @@ Status SendTensorsByAcl(const acltdtChannelHandle *acl_handle, acltdtTensorType auto acl_status = acltdtSendTensor(acl_handle, acl_dataset, kTimeout); TF_RETURN_IF_ERROR(DestroyAclDataset(acl_dataset)); if (acl_status == ACL_ERROR_RT_QUEUE_FULL) { - is_need_resend = true; + need_resend = true; ADP_LOG(INFO) << "Queue is full , try to send data again."; return Status::OK(); } if (acl_status != ACL_ERROR_NONE) { + sleep(kWaitingForLogRecord); std::string error_message = ge::GEGetErrorMsg(); LOG(FATAL) << "Failed to send data by acl, error code : "<< acl_status << std::endl << "Error Message is " << std::endl -- Gitee From cf65d2f3370930e6e92995f40fb2fc8c7fd9320b Mon Sep 17 00:00:00 2001 From: yaolun Date: Mon, 10 Apr 2023 01:44:39 +0000 Subject: [PATCH 20/22] =?UTF-8?q?!2204=20=E5=91=8A=E8=AD=A6=E6=B8=85?= =?UTF-8?q?=E7=90=86=20Merge=20pull=20request=20!2204=20from=20yaolun/flor?= =?UTF-8?q?ence=5Fc29?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tf_adapter/util/acl_channel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tf_adapter/util/acl_channel.h b/tf_adapter/util/acl_channel.h index b1a1ac4d4..53b3ab126 100644 --- a/tf_adapter/util/acl_channel.h +++ b/tf_adapter/util/acl_channel.h @@ -42,7 +42,7 @@ Status DestroyAclDataset(acltdtDataset *acl_dataset, bool include_data_item = tr Status RecvTensorByAcl(const acltdtChannelHandle *acl_handle, std::vector &tensors); Status SendTensorsByAcl(const acltdtChannelHandle *acl_handle, acltdtTensorType acl_type, - const std::vector &tensors, bool &is_need_resend); + const std::vector &tensors, bool &need_resend); Status StopRecvTensorByAcl(acltdtChannelHandle **handle, const std::string &channel_name); -- Gitee From 1b11ff2a2b10d2d79f59270acdecafe4329210b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=B7=E6=AC=A2?= Date: Thu, 20 Apr 2023 00:47:12 +0000 Subject: [PATCH 21/22] =?UTF-8?q?!2224=20=E5=90=8C=E6=AD=A5DTS202304130460?= =?UTF-8?q?7=E4=BF=AE=E6=94=B9=E5=88=B0C29=E5=88=86=E6=94=AF=20Merge=20pul?= 
=?UTF-8?q?l=20request=20!2224=20from=20=E9=9B=B7=E6=AC=A2/r1.12.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tf_adapter/kernels/geop_npu.cc | 50 +++++++++++++++---- tf_adapter/kernels/geop_npu.h | 5 +- .../depends/ascendcl/src/ascendcl_stub.cc | 2 + 3 files changed, 47 insertions(+), 10 deletions(-) diff --git a/tf_adapter/kernels/geop_npu.cc b/tf_adapter/kernels/geop_npu.cc index fd5d3a630..19aa69124 100644 --- a/tf_adapter/kernels/geop_npu.cc +++ b/tf_adapter/kernels/geop_npu.cc @@ -291,7 +291,7 @@ GeOp::GeOp(OpKernelConstruction *ctx) compute_graph_empty_(false), is_input_convert_(false), data_format_(""), graph_id_(0), is_initialized_graph_(false), need_iteration_(false), tf_session_(""), ge_session_(nullptr), job_type_(""), is_host_graph_(false), handle_(nullptr), need_compile_graph_first_(false), tuned_flag_(ATOMIC_FLAG_INIT), - jit_compile_(false), is_getnext_dynamic_shape_(false), session_id_(0), aoe_initialize_(nullptr), + jit_compile_(""), is_dynamic_input_(false), session_id_(0), aoe_initialize_(nullptr), aoe_finalize_(nullptr), aoe_create_session_(nullptr), aoe_destroy_session_(nullptr), aoe_set_gesession_(nullptr), aoe_set_dependgraphs_(nullptr), aoe_set_tuninggraph_(nullptr), aoe_tuning_graph_(nullptr), aoe_set_depend_graphs_inputs_(nullptr), aoe_set_tuning_graph_input_(nullptr) { @@ -329,7 +329,7 @@ void GeOp::Initialize(OpKernelConstruction *ctx) { ctx->GetAttr("_dynamic_input", &dynamic_input_); if (!dynamic_input_.empty() && dynamic_input_ == "1") { jit_compile_ = true; - is_getnext_dynamic_shape_ = true; + is_dynamic_input_ = true; OP_REQUIRES_OK(ctx, ctx->GetAttr("_dynamic_graph_execute_mode", &dynamic_graph_execute_mode_)); ctx->GetAttr("_getnext_inputs_shape_range", &getnext_inputs_shape_range_); ctx->GetAttr("_data_inputs_shape_range", &data_inputs_shape_range_); @@ -341,7 +341,7 @@ void GeOp::Initialize(OpKernelConstruction *ctx) { ADP_LOG(INFO) << "[GEOP] dynamic_input: " << dynamic_input_ << ", dynamic_graph_execute_mode: " << dynamic_graph_execute_mode_ << ", jit_compile: " << jit_compile_ - << ", is_getnext_dynamic_shape_: " << is_getnext_dynamic_shape_ + << ", is_dynamic_input: " << is_dynamic_input_ << ", getnext_inputs_shape_range: " << getnext_inputs_shape_range_ << ", data_inputs_shape_range: " << data_inputs_shape_range_ << ", is_train_graph: " << is_train_graph_ << ", is_dynamic_getnext: " << is_dynamic_getnext_ << ", placeholder_index: " << placeholder_index_ @@ -1106,6 +1106,18 @@ void GeOp::BuildQueueDataAndGetNextFromQueue(Graph &graph, const Node &getnext_n get_next_node_def.mutable_attr()->insert({"op_def", get_next_attr}); } +bool GeOp::IsDynamicGetNext(const Node *node) { + if (is_dynamic_input_) { + return true; + } + auto it = is_getnext_dynamic_shape_.find(node->name()); + if (it == is_getnext_dynamic_shape_.end()) { + return false; + } else { + return it->second; + } +} + void GeOp::HandleDpOpAndGetNextNodes(Graph &graph) { std::vector remove_nodes; for (Node *node : graph.nodes()) { @@ -1143,14 +1155,14 @@ void GeOp::HandleDpOpAndGetNextNodes(Graph &graph) { remove_nodes.push_back(iterator_node); } } else if (NpuAttrs::IsDatasetExecuteInDevice(tf_session_ + iterator_name)) { - if (is_getnext_dynamic_shape_) { + if (IsDynamicGetNext(node)) { node_def.set_op("DynamicGetNext"); } } else { Node *aicpu_getnext = nullptr; std::string aicpu_getnext_name = "aicpu_getnext_" + node->name(); auto getnext_attrs = node->def().attr(); - std::string aicpu_getnext_type = is_getnext_dynamic_shape_ ? 
"DynamicGetNextV2" : "GetNext"; + std::string aicpu_getnext_type = IsDynamicGetNext(node) ? "DynamicGetNextV2" : "GetNext"; TF_CHECK_OK(NodeBuilder(aicpu_getnext_name, aicpu_getnext_type) .Device(node->def().device()) .Attr("channel_name", channel_name) @@ -1219,18 +1231,38 @@ Status GeOp::ProcessForDiffNodeTypes(Graph &graph, bool &is_initialize, bool &is } void GeOp::ProcessGetNextNode(const Node *node) { + bool is_dynamic_shape = false; + const char *kTypeAttrName = "output_types"; + const char *kShapeAttrName = "output_shapes"; + std::vector type_attrs; std::vector shape_attrs; - const char *kAttrName = "output_shapes"; - if (tensorflow::TryGetNodeAttr(node->attrs(), kAttrName, &shape_attrs)) { + if (tensorflow::TryGetNodeAttr(node->attrs(), kShapeAttrName, &shape_attrs)) { for (auto i = 0; i < node->num_outputs(); i++) { const TensorShapeProto &shape_proto = *shape_attrs[i]; tensorflow::PartialTensorShape shape(shape_proto); if (!shape.IsFullyDefined()) { - is_getnext_dynamic_shape_ = true; - ADP_LOG(INFO) << "[GEOP]node: " + node->name() + " is_getnext_dynamic_shape_ come true."; + jit_compile_ = "0"; + is_dynamic_shape = true; + ADP_LOG(INFO) << "[GEOP]node: " + node->name() + " is_dynamic_shape come true."; } } } + if (is_dynamic_shape == false && + tensorflow::TryGetNodeAttr(node->attrs(), kTypeAttrName, &type_attrs)) { + for (auto i = 0; i < node->num_outputs(); i++) { + if (DT_STRING == type_attrs[i]) { + jit_compile_ = "0"; + is_dynamic_shape = true; + ADP_LOG(INFO) << "[GEOP]node: " + node->name() + "'s output_types include DT_STRING."; + } + } + } + auto it = is_getnext_dynamic_shape_.find(node->name()); + if (it == is_getnext_dynamic_shape_.end()) { + (void)is_getnext_dynamic_shape_.insert(std::make_pair(node->name(), is_dynamic_shape)); + } else { + ADP_LOG(WARNING) << "[GEOP]node: " + node->name() + " has is_dynamic_shape[" << it->second << "]."; + } } void GeOp::UpdateInputsShapeDesc(Graph &graph) { diff --git a/tf_adapter/kernels/geop_npu.h b/tf_adapter/kernels/geop_npu.h index f2b970c1b..c714b2e45 100644 --- a/tf_adapter/kernels/geop_npu.h +++ b/tf_adapter/kernels/geop_npu.h @@ -126,6 +126,8 @@ private: void HandleDpOpAndGetNextNodes(Graph &graph); + bool IsDynamicGetNext(const Node *node); + void ChangeChannelNameAttr(NodeDef &node_def) const; bool IsDynamicConfig(); @@ -193,7 +195,8 @@ private: std::string recompute_mode_; std::vector> input_shapes_vec_; bool jit_compile_; - bool is_getnext_dynamic_shape_; + bool is_dynamic_input_; + std::map is_getnext_dynamic_shape_; SessionId session_id_; AoeInitializeFunc aoe_initialize_; AoeFinalizeFunc aoe_finalize_; diff --git a/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.cc b/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.cc index 678259ccb..63278135f 100644 --- a/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.cc +++ b/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.cc @@ -243,6 +243,8 @@ aclError acltdtSendTensor(const acltdtChannelHandle *handle, const acltdtDataset *dataset, int32_t timeout) { if (g_AclTdtSendTensorMock) { + // 这里保证ACL_ERROR_RT_QUEUE_FULL只返回一次,否则会导致日志持续刷屏 + g_AclTdtSendTensorMock = false; return ACL_ERROR_RT_QUEUE_FULL; } if (dataset == nullptr || handle == nullptr) { -- Gitee From 4e109f00f31ccb90d71e3a60c877fc0bf2c76a24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=B7=E6=AC=A2?= Date: Fri, 21 Apr 2023 15:48:07 +0800 Subject: [PATCH 22/22] =?UTF-8?q?=E5=9B=9E=E9=80=80=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
tf_adapter/kernels/geop_npu.cc | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/tf_adapter/kernels/geop_npu.cc b/tf_adapter/kernels/geop_npu.cc index 19aa69124..e9e9dba51 100644 --- a/tf_adapter/kernels/geop_npu.cc +++ b/tf_adapter/kernels/geop_npu.cc @@ -1247,16 +1247,6 @@ void GeOp::ProcessGetNextNode(const Node *node) { } } } - if (is_dynamic_shape == false && - tensorflow::TryGetNodeAttr(node->attrs(), kTypeAttrName, &type_attrs)) { - for (auto i = 0; i < node->num_outputs(); i++) { - if (DT_STRING == type_attrs[i]) { - jit_compile_ = "0"; - is_dynamic_shape = true; - ADP_LOG(INFO) << "[GEOP]node: " + node->name() + "'s output_types include DT_STRING."; - } - } - } auto it = is_getnext_dynamic_shape_.find(node->name()); if (it == is_getnext_dynamic_shape_.end()) { (void)is_getnext_dynamic_shape_.insert(std::make_pair(node->name(), is_dynamic_shape)); -- Gitee
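
Note on the size-capped copy loop that the secure-function rework (!2191, patch 16/22 above) adds as LoopCopy in tf_adapter/util/util.cc and tf_adapter_2.x/npu_device/core/npu_utils.cpp: memcpy_s from the securec library refuses a single copy larger than SECUREC_MEM_MAX_LEN, and the comments removed by that patch note that payloads can exceed 2G, so the data is copied in bounded chunks after an up-front check that the destination is large enough. The sketch below is a minimal standalone illustration of that pattern only; the ChunkedCopy name, the plain int return value and the <securec.h> include path are assumptions made for the example, not the adapter's actual interface.

#include <cstddef>
#include <securec.h>  // assumed location of memcpy_s, EOK and SECUREC_MEM_MAX_LEN

// Copy src_size bytes into dst, never asking memcpy_s for more than
// SECUREC_MEM_MAX_LEN bytes at a time (mirrors the LoopCopy pattern above).
static int ChunkedCopy(char *dst, size_t dst_size, const char *src, size_t src_size) {
  if (dst_size < src_size) {
    return -1;  // destination buffer too small, refuse to copy
  }
  size_t copied = 0UL;
  while (copied < src_size) {
    const size_t remaining = src_size - copied;
    const size_t chunk = (remaining > SECUREC_MEM_MAX_LEN) ? SECUREC_MEM_MAX_LEN : remaining;
    if (memcpy_s(dst + copied, chunk, src + copied, chunk) != EOK) {
      return -1;  // securec reported a copy failure
    }
    copied += chunk;
  }
  return 0;
}

Passing the chunk length as both destMax and count keeps every memcpy_s call under the securec limit; the single dst_size >= src_size check up front is what makes that per-chunk destMax safe.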