From f8f35673af34c25564755387d41da6934c35bc69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E5=BC=BA?= Date: Wed, 15 Mar 2023 09:39:41 +0000 Subject: [PATCH 01/22] =?UTF-8?q?!2106=20OBP=20=E5=95=86=E5=88=86=E5=88=A0?= =?UTF-8?q?=E9=99=A4embedding=20Merge=20pull=20request=20!2106=20from=20?= =?UTF-8?q?=E5=88=98=E5=BC=BA/lq=5F315?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tf_adapter/interface_spec/api_npu_config.pyh | 2 +- tf_adapter/kernels/geop_npu.cc | 20 +- tf_adapter/kernels/geop_npu.h | 4 - .../optimizers/om_partition_subgraphs_pass.cc | 19 - .../python/npu_bridge/embedding/__init__.py | 23 -- .../embedding/embedding_optimizer.py | 158 -------- .../embedding/embedding_resource.py | 39 -- .../npu_bridge/embedding/embedding_service.py | 363 ------------------ .../embedding/embedding_table_map_policy.py | 119 ------ .../python/npu_bridge/embedding/tf_path.py | 64 --- .../npu_bridge/estimator/npu/npu_config.py | 3 - .../npu_bridge/estimator/npu/npu_estimator.py | 2 - .../python/npu_bridge/npu_cpu/npu_cpu_ops.py | 99 ----- tf_adapter/tests/st/kernels/pbtxt/geop.pbtxt | 18 - .../testcase/get_attr_optimize_pass_test.cc | 3 - tf_adapter/tests/ut/kernels/pbtxt/geop.pbtxt | 24 -- .../testcase/get_attr_optimize_pass_test.cc | 3 - tf_adapter/util/npu_attrs.cc | 15 - 18 files changed, 2 insertions(+), 976 deletions(-) delete mode 100644 tf_adapter/python/npu_bridge/embedding/__init__.py delete mode 100644 tf_adapter/python/npu_bridge/embedding/embedding_optimizer.py delete mode 100644 tf_adapter/python/npu_bridge/embedding/embedding_resource.py delete mode 100644 tf_adapter/python/npu_bridge/embedding/embedding_service.py delete mode 100644 tf_adapter/python/npu_bridge/embedding/embedding_table_map_policy.py delete mode 100644 tf_adapter/python/npu_bridge/embedding/tf_path.py diff --git a/tf_adapter/interface_spec/api_npu_config.pyh b/tf_adapter/interface_spec/api_npu_config.pyh index c89bc3c6b..718db2ac6 100644 --- a/tf_adapter/interface_spec/api_npu_config.pyh +++ b/tf_adapter/interface_spec/api_npu_config.pyh @@ -19,7 +19,7 @@ class NPURunConfig(run_config_lib.RunConfig): soc_config=None, hccl_timeout=None, op_wait_timeout=None, op_execute_timeout=None, HCCL_algorithm=None, customize_dtypes=None, op_debug_config=None, memory_config=None, experimental_config=None, topo_sorting_mode=None, aoe_config_file=None, insert_op_file=None, stream_sync_timeout=-1, - event_sync_timeout=-1, external_weight=False, es_cluster_config=None, deterministic=0, + event_sync_timeout=-1, external_weight=False, deterministic=0, frozen_variable=False, variable_placement="Device"): class ProfilingConfig(): diff --git a/tf_adapter/kernels/geop_npu.cc b/tf_adapter/kernels/geop_npu.cc index e2fdb4301..fc8538f00 100644 --- a/tf_adapter/kernels/geop_npu.cc +++ b/tf_adapter/kernels/geop_npu.cc @@ -326,10 +326,6 @@ void GeOp::Initialize(OpKernelConstruction *ctx) { } ctx->GetAttr("_recompute_mode", &recompute_mode_); - ctx->GetAttr("_deploy_inject_config", &deploy_inject_config_); - ctx->GetAttr("_execute_times", &execute_times_); - ctx->GetAttr("_max_num", &max_num_); - ctx->GetAttr("_embedding_dim", &embedding_dim_); ctx->GetAttr("_dynamic_input", &dynamic_input_); if (!dynamic_input_.empty() && dynamic_input_ == "1") { jit_compile_ = true; @@ -349,9 +345,7 @@ void GeOp::Initialize(OpKernelConstruction *ctx) { << ", getnext_inputs_shape_range: " << getnext_inputs_shape_range_ << ", data_inputs_shape_range: " << data_inputs_shape_range_ << ", is_train_graph: " << 
is_train_graph_ << ", is_dynamic_getnext: " << is_dynamic_getnext_ << ", placeholder_index: " << placeholder_index_ - << ", is_var_init_graph: " << is_var_init_graph_ << ", deploy_inject_config: " << deploy_inject_config_ - << ", execute_times: " << execute_times_ << ", max_num: " << max_num_ - << ", embedding_dim: " << embedding_dim_; + << ", is_var_init_graph: " << is_var_init_graph_; // global environment Initialize, invoke once for each process std::string sess_config = ""; @@ -865,18 +859,6 @@ void GeOp::ComputeAsync(OpKernelContext *ctx, DoneCallback done) { if (!recompute_mode_.empty()) { graph_options_["ge.recompute"] = recompute_mode_; } - if (!deploy_inject_config_.empty()) { - graph_options_["ge.exec.clusterSpec"] = deploy_inject_config_; - } - if (!execute_times_.empty()) { - graph_options_["ge.execute_times"] = execute_times_; - } - if (!max_num_.empty()) { - graph_options_["ge.max_num"] = max_num_; - } - if (!embedding_dim_.empty()) { - graph_options_["ge.embedding_dim"] = embedding_dim_; - } SetDynamicInput(); graph_options_["ge.exec.isVarInitGraph"] = is_var_init_graph_; graph_options_["ge.jit_compile"] = jit_compile_ ? "1" : "0"; diff --git a/tf_adapter/kernels/geop_npu.h b/tf_adapter/kernels/geop_npu.h index 93a468146..f2b970c1b 100644 --- a/tf_adapter/kernels/geop_npu.h +++ b/tf_adapter/kernels/geop_npu.h @@ -190,10 +190,6 @@ private: std::atomic_flag tuned_flag_; std::vector> remove_index_; std::string is_var_init_graph_; - std::string deploy_inject_config_; - std::string execute_times_; - std::string max_num_; - std::string embedding_dim_; std::string recompute_mode_; std::vector> input_shapes_vec_; bool jit_compile_; diff --git a/tf_adapter/optimizers/om_partition_subgraphs_pass.cc b/tf_adapter/optimizers/om_partition_subgraphs_pass.cc index 13588e8c9..bb54e52d8 100644 --- a/tf_adapter/optimizers/om_partition_subgraphs_pass.cc +++ b/tf_adapter/optimizers/om_partition_subgraphs_pass.cc @@ -2000,10 +2000,6 @@ void OMPartitionSubgraphsPass::GetGraphConfig(const Node &node, bool enable_dp, const std::string kDynamicInputsShapeRange = "_graph_dynamic_inputs_shape_range"; const std::string kIsTrainGraph = "_is_train_graph"; const std::string kRecomputeMode = "_recompute_mode"; - const std::string kDeployInjectConfig = "_deploy_inject_config"; - const std::string kExecuteTimes = "_execute_times"; - const std::string kMaxNum = "_max_num"; - const std::string kEmbeddingDim = "_embedding_dim"; if (node_attrs.find(kDynamicInput) != node_attrs.end()) { bool dynamic_input = node_attrs.at(kDynamicInput).b(); graph_options["dynamic_input"] = std::to_string(static_cast(dynamic_input)); @@ -2024,21 +2020,6 @@ void OMPartitionSubgraphsPass::GetGraphConfig(const Node &node, bool enable_dp, std::string recompute_mode = node_attrs.at(kRecomputeMode).s(); graph_options["recompute_mode"] = recompute_mode; } - if (node_attrs.find(kDeployInjectConfig) != node_attrs.end()) { - graph_options["deploy_inject_config"] = node_attrs.at(kDeployInjectConfig).s(); - } - if (node_attrs.find(kExecuteTimes) != node_attrs.end()) { - const auto execute_times = node_attrs.at(kExecuteTimes).i(); - graph_options["execute_times"] = std::to_string(static_cast(execute_times)); - } - if (node_attrs.find(kMaxNum) != node_attrs.end()) { - const auto max_num = node_attrs.at(kMaxNum).i(); - graph_options["max_num"] = std::to_string(static_cast(max_num)); - } - if (node_attrs.find(kEmbeddingDim) != node_attrs.end()) { - const auto embedding_dim = node_attrs.at(kEmbeddingDim).i(); - graph_options["embedding_dim"] = 
std::to_string(static_cast(embedding_dim)); - } } Status OMPartitionSubgraphsPass::ProcessGetNext(Node &node, const std::string enable_dp, diff --git a/tf_adapter/python/npu_bridge/embedding/__init__.py b/tf_adapter/python/npu_bridge/embedding/__init__.py deleted file mode 100644 index f4cdb646b..000000000 --- a/tf_adapter/python/npu_bridge/embedding/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - - -from npu_bridge.embedding.embedding_optimizer import AdamOptimizer as EmbeddingAdamOptimizer -from npu_bridge.embedding.embedding_optimizer import AdagradOptimizer as EmbeddingAdagradOptimizer -from npu_bridge.embedding.embedding_service import ESWorker as EmbeddingService -from npu_bridge.embedding.tf_path import path_on_tf -path_on_tf() \ No newline at end of file diff --git a/tf_adapter/python/npu_bridge/embedding/embedding_optimizer.py b/tf_adapter/python/npu_bridge/embedding/embedding_optimizer.py deleted file mode 100644 index 0d596fed7..000000000 --- a/tf_adapter/python/npu_bridge/embedding/embedding_optimizer.py +++ /dev/null @@ -1,158 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -from tensorflow.python.framework import ops -from tensorflow.python.eager import context -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import control_flow_ops -from tensorflow.python.training import adam -from tensorflow.python.training import adagrad -from tensorflow.python.training import training_ops -from tensorflow.python.training import training_util -from npu_bridge.embedding.embedding_resource import NpuEmbeddingResource -from npu_bridge.npu_cpu.npu_cpu_ops import gen_npu_cpu_ops - -_GLOBAL_STEP_VALUE = 1 - - -class AdamOptimizer(adam.AdamOptimizer): - @property - def embedding_dims(self): - return self._embedding_dims - - @embedding_dims.setter - def embedding_dims(self, val): - self._embedding_dims = val - - def _prepare(self): - lr = self._call_if_callable(self._lr) - epsilon = self._call_if_callable(self._epsilon) - self._beta1_t_list = [] - self._beta2_t_list = [] - self._lr_t = ops.convert_to_tensor(lr, name="learning_rate") - self._epsilon_t = ops.convert_to_tensor(epsilon, name="epsilon") - - def _resource_apply_sparse(self, grad, var, indices): - if isinstance(var, NpuEmbeddingResource): - beta1 = self._call_if_callable(self._beta1) - beta2 = self._call_if_callable(self._beta2) - self._beta1_t = ops.convert_to_tensor(beta1, name="beta1" + str(self.table_idx)) - self._beta2_t = ops.convert_to_tensor(beta2, name="beta2" + str(self.table_idx)) - self._beta1_t_list.append(self._beta1_t) - self._beta2_t_list.append(self._beta2_t) - beta1_power, beta2_power = self._get_beta_accumulators() - self.table_idx += 1 - return gen_npu_cpu_ops.embedding_apply_adam(var.handle, beta1_power, beta2_power, - math_ops.cast(self._lr_t, grad.dtype), - math_ops.cast(self._beta1_t, grad.dtype), - math_ops.cast(self._beta2_t, grad.dtype), - math_ops.cast(self._epsilon_t, grad.dtype), - grad, - indices, - ops.convert_to_tensor(_GLOBAL_STEP_VALUE), - self._embedding_dims) - else: - return self._apply_sparse_shared(grad, var, indices, self._resource_scatter_add) - - def _create_slots(self, var_list): - self.table_num = 0 - self.table_idx = 0 - first_var = min(var_list, key=lambda x: x.name) - for idx in range(len(var_list)): - self._create_non_slot_variable( - initial_value=self._beta1, name="beta1_power" + str(idx), colocate_with=first_var) - self._create_non_slot_variable( - initial_value=self._beta2, name="beta2_power" + str(idx), colocate_with=first_var) - self.table_num += 1 - - for v in var_list: - if not isinstance(v, NpuEmbeddingResource): - self._zeros_slot(v, "m", self._name) - self._zeros_slot(v, "v", self._name) - - def _get_beta_accumulators(self): - with ops.init_scope(): - if context.executing_eagerly(): - graph = None - else: - graph = ops.get_default_graph() - return (self._get_non_slot_variable("beta1_power" + str(self.table_idx), graph=graph), - self._get_non_slot_variable("beta2_power" + str(self.table_idx), graph=graph)) - - def _finish(self, update_ops, name_scope): - # Update the power accumulators. 
- self.table_num = 0 - self.table_idx = 0 - finish_output = [] - with ops.control_dependencies(update_ops): - beta1_power_list = [] - beta2_power_list = [] - for k in update_ops: - beta1_power, beta2_power = self._get_beta_accumulators() - beta1_power_list.append(beta1_power) - beta2_power_list.append(beta2_power) - self.table_idx += 1 - for idx in range(len(update_ops)): - beta1_power = beta1_power_list[idx] - beta2_power = beta2_power_list[idx] - with ops.colocate_with(beta1_power): - update_beta1 = beta1_power.assign( - beta1_power * self._beta1_t_list[idx], use_locking=self._use_locking) - update_beta2 = beta2_power.assign( - beta2_power * self._beta2_t_list[idx], use_locking=self._use_locking) - new_update_op = [] - new_update_op.append(update_ops[idx]) - finish_output.append(control_flow_ops.group( - *new_update_op + [update_beta1, update_beta2], name=name_scope + str(idx))) - return finish_output - - -class AdagradOptimizer(adagrad.AdagradOptimizer): - @property - def embedding_dims(self): - return self._embedding_dims - - @embedding_dims.setter - def embedding_dims(self, val): - self._embedding_dims = val - - def _resource_apply_sparse(self, grad, var, indices): - if isinstance(var, NpuEmbeddingResource): - return gen_npu_cpu_ops.embedding_apply_ada_grad(var.handle, - math_ops.cast(self._learning_rate_tensor, grad.dtype), - grad, - indices, - ops.convert_to_tensor(_GLOBAL_STEP_VALUE), - self._embedding_dims) - else: - return self.training_ops.resource_sparse_apply_adagrad(var.handle, grad.handle, - math_ops.cast(self._learning_rate_tensor, - grad.dtype), - grad, indices, - use_locking=self._use_locking) - - def _create_slots(self, var_list): - for v in var_list: - if not isinstance(v, NpuEmbeddingResource): - dtype = v.dtype.base_dtype - if v.get_shape().is_fully_defined(): - init = init_ops.constant_initializer(self._initial_accumulator_value, - dtype=dtype) - else: - init = self._init_constant_op(v, dtype) - self._get_or_make_slot_with_initializer(v, init, v.get_shape(), dtype, - "accumulator", self._name) diff --git a/tf_adapter/python/npu_bridge/embedding/embedding_resource.py b/tf_adapter/python/npu_bridge/embedding/embedding_resource.py deleted file mode 100644 index fdfc288bc..000000000 --- a/tf_adapter/python/npu_bridge/embedding/embedding_resource.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -from tensorflow.python.framework import ops -from npu_bridge.npu_cpu.npu_cpu_ops import gen_npu_cpu_ops - - -class NpuEmbeddingResource: - - def __init__(self, table_id): - self.name = table_id - self._tensor = gen_npu_cpu_ops.table_to_resource(ops.convert_to_tensor(table_id)) - - @property - def handle(self): - return self._tensor - - @property - def graph(self): - return self._tensor.graph - - @property - def op(self): - return self._tensor.op - diff --git a/tf_adapter/python/npu_bridge/embedding/embedding_service.py b/tf_adapter/python/npu_bridge/embedding/embedding_service.py deleted file mode 100644 index a661e11c9..000000000 --- a/tf_adapter/python/npu_bridge/embedding/embedding_service.py +++ /dev/null @@ -1,363 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -import json -import contextlib -import os -import math -import tensorflow as tf -from tensorflow.python.framework import ops -from tensorflow.python.framework import random_seed -from tensorflow.core.framework import attr_value_pb2 -from npu_bridge.npu_cpu.npu_cpu_ops import gen_npu_cpu_ops -from npu_bridge.embedding.embedding_resource import NpuEmbeddingResource -from npu_bridge.embedding import embedding_optimizer -from npu_bridge.embedding.embedding_table_map_policy import NoneTableMapPolicy, AutoMergeTableMapPolicy - -_INT32_MAX_VALUE = 2147483647 - - -@contextlib.contextmanager -def specified_ps_engine_scope(): - """ - Enable the non npu compilation of operators within the scope. - """ - attrs = { - "_process_node_engine_id": attr_value_pb2.AttrValue(s=tf.compat.as_bytes("PS")) - } - with ops.get_default_graph()._attr_scope(attrs): - yield - - -class ESWorker: - """ Embedding service class. 
""" - - def __init__(self, config_from_param=None): - env_dist = os.environ - cluster_config_from_env = env_dist.get("ESCLUSTER_CONFIG_PATH") - if cluster_config_from_env is None: - if config_from_param is None: - raise ValueError("EsClusterConfig and env variable are both null.") - es_cluster_config = config_from_param - else: - es_cluster_config = cluster_config_from_env - with open(es_cluster_config, encoding='utf-8') as a: - es_cluster_config_json = json.load(a) - self._es_cluster_conf = json.dumps(es_cluster_config_json) - self._ps_num = int(es_cluster_config_json["psNum"]) - self._embedding_dim = -1 - self._max_num = -1 - self._ps_ids = [] - self._ps_ids_list = es_cluster_config_json["psCluster"] - self._init_embedding_hash_maps = {} - self._init_partition_maps = {} - self._table_to_embedding_dim = {} - for each_ps in self._ps_ids_list: - self._ps_ids.append(each_ps["id"]) - self._train_mode = True - self._train_level = False - self._optimizer = None - self.slot_vars_num = None - self._initializer = None - self._init_flag = False - self._table_has_init = [] - self.user_defined_table_infos = [] - self.table_map_policy = None - self.table_create_infos = [] - self.total_variable_table = [] - self.total_embedding_count = 0 - config = tf.ConfigProto() - custom_op = config.graph_options.rewrite_options.custom_optimizers.add() - custom_op.name = "NpuOptimizer" - custom_op.parameter_map["es_cluster_config"].s = tf.compat.as_bytes(self._es_cluster_conf) - self.es_all_config = config - - # 提供embedding init功能 - # @param vocabulary_size int 类型 - # @param file_path string 类型 - # @param file_name string 类型 - # @param table_id int32 类型 - # @param max_batch_size int32 类型 - # @param optimizer 类型 - # @param initializer string 类型 - # @param embedding_dim int32 类型 - # @param only_var bool 类型 - # @param mode string 类型 - # @param partition_num int 类型 - def embedding_init(self, vocabulary_size, file_path, file_name, table_id, max_batch_size, optimizer=None, - initializer=None, embedding_dim=-1, only_var=False, mode="bin", partition_num=65537): - """ Operator for init embedding table. 
""" - if vocabulary_size is None or table_id is None or max_batch_size is None or embedding_dim is None: - raise ValueError("vocabulary_size or table_id or max_batch_size or embedding_dim is None.") - if (not isinstance(vocabulary_size, int)) or (not isinstance(table_id, int)) or \ - (not isinstance(max_batch_size, int)) or (not isinstance(embedding_dim, int)): - raise ValueError("vocabulary_size, table_id, max_batch_size and embedding_dim must be int.") - if vocabulary_size < 0 or table_id < 0: - raise ValueError("vocabulary_size and table_id can not be smaller than zero.") - if vocabulary_size >= _INT32_MAX_VALUE or table_id >= _INT32_MAX_VALUE: - raise ValueError("vocabulary_size or table_id exceed int32 max value.") - if embedding_dim <= 0 or partition_num <= 0 or max_batch_size <= 0: - raise ValueError("embedding_dim, partition_num and max_batch_size must be greater than zero.") - if table_id in self._table_has_init: - raise ValueError("this table has already initialized.") - self._embedding_dim = embedding_dim - self._max_num = max_batch_size - self._table_to_embedding_dim[table_id] = embedding_dim - self._initializer = initializer - self._table_has_init.append(table_id) - bucket_size = math.ceil(vocabulary_size / self._ps_num) - if optimizer is None: - if file_path is None or file_name is None or (not tf.gfile.Exists(os.path.join(file_path, file_name))): - raise ValueError("embedding table file not exist.") - self._train_mode = False - self.slot_vars_num = 0 - else: - if (not isinstance(optimizer, embedding_optimizer.AdamOptimizer) and - not isinstance(optimizer, embedding_optimizer.AdagradOptimizer)): - raise ValueError( - "optimizer should be embedding_optimizer.AdamOptimizer or embedding_optimizer.AdagradOptimizer") - if (initializer is not None) and (initializer is not 'random_uniform') and \ - (initializer is not 'truncated_normal'): - raise ValueError("initializer must be random_uniform or truncated_normal.") - self._optimizer = optimizer - self._optimizer._embedding_dims = embedding_dim - # adam include m and v, 2 slots; adagrad include accumulator, 1 slot - self.slot_vars_num = 2 if isinstance(self._optimizer, embedding_optimizer.AdamOptimizer) else 1 - if (file_path is None) or (file_name is None) or (not tf.gfile.Exists(os.path.join(file_path, file_name))): - if initializer is None: - raise ValueError("In new embedding training, initializer can not be None.") - self._train_level = True - with specified_ps_engine_scope(): - self._init_partition_maps[table_id] = \ - gen_npu_cpu_ops.init_partition_map(ps_num=ops.convert_to_tensor(self._ps_num), - ps_ids=ops.convert_to_tensor(self._ps_ids), - partition_num=partition_num) - self._init_partition_maps.get(table_id)._set_attr("_execute_times", attr_value_pb2.AttrValue(i=1)) - self._init_partition_maps.get(table_id)._set_attr("_embedding_dim", - attr_value_pb2.AttrValue(i=self._embedding_dim)) - self._init_partition_maps.get(table_id)._set_attr("_max_num", attr_value_pb2.AttrValue(i=self._max_num)) - self._init_partition_maps.get(table_id)._set_attr("_deploy_inject_config", - attr_value_pb2.AttrValue( - s=tf.compat.as_bytes(self._es_cluster_conf))) - return self._init_hashmap_and_table_import(bucket_size, file_path, file_name, table_id, - initializer, embedding_dim, only_var, mode) - - # 提供embedding lookup功能 - # @param table_id int32 类型 - # @param input_ids int64 类型 - # @return values float32 类型 - def embedding_lookup(self, table_id, input_ids): - """ Operator for look up in embedding table. 
""" - if (table_id is None) or (input_ids is None): - raise ValueError("table_id or input_ids must be specified.") - if not isinstance(table_id, int): - raise ValueError("type of table_id must be int.") - if input_ids.dtype != tf.int64: - raise ValueError("dtype of input_ids must be tf.int64.") - if table_id < 0: - raise ValueError("table_id can not be smaller than zero.") - if not self._init_flag: - raise ValueError("embedding must init first!") - if table_id not in self._table_has_init: - raise ValueError("this table has not yet initialized.") - if self._train_mode: - seed1, seed2 = random_seed.get_seed(None) - result = gen_npu_cpu_ops.embedding_table_find_and_init(table_id=ops.convert_to_tensor(table_id), - keys=input_ids, - embedding_dim= - self._table_to_embedding_dim.get(table_id), - random_alg=self._initializer, - seed=seed1, seed2=seed2, - value_total_len= - self._table_to_embedding_dim.get(table_id) * - (self.slot_vars_num + 1) - ) - else: - result = gen_npu_cpu_ops.embedding_table_find(table_id=ops.convert_to_tensor(table_id), - keys=input_ids, - embedding_dim=self._table_to_embedding_dim.get(table_id)) - result.op._set_attr("_embedding_dim", attr_value_pb2.AttrValue(i=self._embedding_dim)) - result.op._set_attr("_max_num", attr_value_pb2.AttrValue(i=self._max_num)) - result.op._set_attr("_deploy_inject_config", - attr_value_pb2.AttrValue(s=tf.compat.as_bytes(self._es_cluster_conf))) - return result - - # 提供embedding update功能 - # @param loss 类型 - # @param params float32 类型 - # @param table_ids int32 类型 - # @param input_ids_list int64 类型 - def embedding_update(self, loss, params, table_ids, input_ids_list): - """ Operator for update in embedding table. """ - if (loss is None) or (params is None) or (table_ids is None) or (input_ids_list is None): - raise ValueError("loss or params or table_ids or input_ids_list is None.") - if (isinstance(loss, str)) or (isinstance(params, str)) or isinstance(table_ids, str) or \ - isinstance(input_ids_list, str): - raise ValueError("loss, params, table_ids and input_ids_list can not be str.") - if not self._init_flag: - raise ValueError("embedding must init first!") - if (not isinstance(params, (list, tuple)) and not isinstance(table_ids, (list, tuple)) - and not isinstance(input_ids_list, (list, tuple))): - params = [params] - table_ids = [table_ids] - input_ids_list = [input_ids_list] - for table_id in table_ids: - if table_id not in self._table_has_init: - raise ValueError("this table has not yet initialized.") - if (len(params) != len(table_ids)) or (len(params) != len(input_ids_list)) \ - or (len(table_ids) != len(input_ids_list)): - raise ValueError("The length of params, table_ids, input_ids_list should be equal.") - embedding_grads = tf.gradients(loss, params) - params_grads = [] - for i in range(len(embedding_grads)): - params_grads.append(tf.IndexedSlices(embedding_grads[i], input_ids_list[i], dense_shape=params[i].shape)) - with specified_ps_engine_scope(): - var_refs = [NpuEmbeddingResource(table_id) for table_id in table_ids] - update_op = self._optimizer.apply_gradients(list(zip(params_grads, var_refs))) - return update_op - - # 提供训练好的embedding values save功能 - # @param file_path string 类型 - # @param file_name string 类型 - # @param table_id int32 类型 - # @param mode string 类型 - def embedding_save(self, file_path, file_name, table_id, mode="bin"): - """ Operator for save values in embedding table. 
""" - if file_path is None or file_name is None or table_id is None: - raise ValueError("table_id, embedding table file_name and file_path can not be None.") - if table_id not in self._table_has_init: - raise ValueError("this table has not yet initialized.") - if not os.path.exists(file_path): - os.mkdir(file_path) - with specified_ps_engine_scope(): - embedding_dim = self._table_to_embedding_dim.get(table_id) - return gen_npu_cpu_ops.embedding_table_export(file_path, file_name, ops.convert_to_tensor(-1), table_id, - embedding_dim, embedding_dim, True, mode) - - # 提供训练好的embedding values + 调优参数 save功能 - # @param file_path string 类型 - # @param file_name string 类型 - # @param table_id int32 类型 - # @param mode string 类型 - def embedding_ckpt_save(self, file_path, file_name, table_id, mode="bin"): - """ Operator for save values and optimizer params in embedding table. """ - if file_path is None or file_name is None or table_id is None: - raise ValueError("table_id, embedding table file_name and file_path can not be None.") - if table_id not in self._table_has_init: - raise ValueError("this table has not yet initialized.") - if not os.path.exists(file_path): - os.mkdir(file_path) - with specified_ps_engine_scope(): - embedding_dim = self._table_to_embedding_dim.get(table_id) - return gen_npu_cpu_ops.embedding_table_export(file_path, file_name, ops.convert_to_tensor(-1), table_id, - embedding_dim, embedding_dim * (self.slot_vars_num + 1), - False, mode) - - def data_parallel_embedding(self, max_vocabulary_size, embedding_dim, multihot_lens, allow_merge=True): - if not isinstance(multihot_lens, list): - raise ValueError("multihot_lens must be list.") - new_table_info = dict( - max_vocabulary_size=max_vocabulary_size, - embedding_dim=embedding_dim, - multihot_lens=multihot_lens, - allow_merge=allow_merge - ) - self.user_defined_table_infos.append(new_table_info) - - def init_table(self, table_map_policy=AutoMergeTableMapPolicy()): - self.table_map_policy = table_map_policy - self.table_create_infos = self.table_map_policy.map_table_infos(self.user_defined_table_infos) - for table_info_ in self.table_create_infos: - self.total_variable_table.append(tf.Variable( - tf.random_normal([table_info_['max_vocabulary_size'], table_info_['embedding_dim']], mean=0.0, - stddev=1.0, dtype=tf.float32, seed=1234) - )) - self.total_embedding_count += 1 - - def embeddings_look_up(self, tf_indices): - if self.total_embedding_count != len(self.table_create_infos): - raise ValueError("Must init_table() first!") - (in_slot_size_group, slot_to_table, table_to_input_group, \ - table_to_slot, table_to_output_slots) = \ - (self.table_map_policy.in_slot_size_group, self.table_map_policy.slot_to_table, \ - self.table_map_policy.table_to_input_groups, self.table_map_policy.table_to_slot, \ - self.table_map_policy.table_to_output_slots) - - indices_split = tf.split(tf_indices, in_slot_size_group, axis=1) - for tid in range(self.total_embedding_count): - table_to_input_group[tid] = [] - for sid, indices in enumerate(indices_split): - tid = slot_to_table[sid] - table_to_input_group[tid].append(indices) - - output_slots = [None for _ in in_slot_size_group] - for tid, table_input_group in enumerate(table_to_input_group): - table_input_after_mapping = \ - gen_npu_cpu_ops.embedding_feature_mapping(feature_id=tf.concat(table_input_group, axis=1)) - table_to_input_group[tid] = table_input_after_mapping - table_embedding = tf.nn.embedding_lookup(self.total_variable_table[tid], table_input_after_mapping) - out_embedding_splited = 
tf.split(table_embedding, table_to_output_slots[tid], axis=1) - for out_emb, sid in zip(out_embedding_splited, table_to_slot[tid]): - output_slots[sid] = out_emb - return output_slots - - def _init_hashmap_and_table_import(self, bucket_size, file_path, file_name, table_id, - initializer, embedding_dim, only_var, mode): - with tf.control_dependencies([self._init_partition_maps.get(table_id)]): - if self._train_mode: - if self._train_level: - seed1, seed2 = random_seed.get_seed(None) - self._init_embedding_hash_maps[table_id] = \ - gen_npu_cpu_ops.init_embedding_hashmap(table_id=ops.convert_to_tensor(table_id), - bucket_size=bucket_size, - value_total_len=embedding_dim * (self.slot_vars_num + 1), - embedding_dim=embedding_dim, - random_alg=initializer, seed=seed1, seed2=seed2) - else: - self._init_embedding_hash_maps[table_id] = \ - gen_npu_cpu_ops.init_embedding_hashmap(table_id=ops.convert_to_tensor(table_id), - bucket_size=bucket_size, - value_total_len=embedding_dim * (self.slot_vars_num + 1), - embedding_dim=embedding_dim, - random_alg=None, seed=None, seed2=None) - else: - self._init_embedding_hash_maps[table_id] = \ - gen_npu_cpu_ops.init_embedding_hashmap(table_id=ops.convert_to_tensor(table_id), - bucket_size=bucket_size, - value_total_len=embedding_dim, - embedding_dim=embedding_dim, - random_alg=None, seed=None, seed2=None) - self._init_flag = True - return self._init_or_restore(file_path, file_name, table_id, embedding_dim, only_var, mode) - - def _init_or_restore(self, file_path, file_name, table_id, embedding_dim, only_var, mode): - if self._train_mode and self._train_level: - return tf.group( - [tf.initializers.variables(self._optimizer.variables()), self._init_embedding_hash_maps.get(table_id)]) - # restore embedding table - with tf.control_dependencies([self._init_embedding_hash_maps.get(table_id)]): - embedding_table_import = gen_npu_cpu_ops.embedding_table_import( - file_path=ops.convert_to_tensor(file_path), - file_name=ops.convert_to_tensor(file_name), - # ps_id will be changed in executor, so can not be optimized in graph - ps_id=ops.convert_to_tensor(-1), - table_id=ops.convert_to_tensor(table_id), - embedding_dim=embedding_dim, - value_total_len=embedding_dim * (self.slot_vars_num + 1), - only_var_flag=only_var, - file_type=mode) - return tf.group([embedding_table_import]) diff --git a/tf_adapter/python/npu_bridge/embedding/embedding_table_map_policy.py b/tf_adapter/python/npu_bridge/embedding/embedding_table_map_policy.py deleted file mode 100644 index c9cf6be1c..000000000 --- a/tf_adapter/python/npu_bridge/embedding/embedding_table_map_policy.py +++ /dev/null @@ -1,119 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -from functools import reduce - - -class BaseTableMapPolicy(): - def __init__(self, assign_groups=None): - self.table_create_infos = [] - if assign_groups is None: - self.assign_groups = [] - else: - self.assign_groups = assign_groups - self.in_slot_size_group = [] - self.slot_to_table = [] - self.table_to_output_slots = [] - self.table_to_input_groups = [] - self.table_to_slot = [] - - @staticmethod - def _is_equal_table_info(info1, info2): - if info1['embedding_dim'] != info2['embedding_dim']: # dim of table is the same or not - print('embedding dim different!, value is %d and %d' % (info1['embedding_dim'], info2['embedding_dim'])) - return False - return True - - def map_table_infos(self, user_defined_table_infos): - raise NotImplementedError() - - def _register_new_table_info(self, new_table_info): - self.table_create_infos.append(new_table_info) - self.table_to_output_slots.append([]) - self.table_to_input_groups.append([]) - self.table_to_slot.append([]) - - def _merge_new_table_info(self, new_table_info, assign_tabld_id): - main_table_info = self.table_create_infos[assign_tabld_id] - main_table_info['multihot_lens'] += new_table_info['multihot_lens'] - main_table_info['max_vocabulary_size'] += new_table_info['max_vocabulary_size'] - - def _register_table_info(self, new_table_info, assign_tid=-1): - multihot_lens = new_table_info['multihot_lens'] - in_slot_size = sum(multihot_lens) - out_slot_size = len(multihot_lens) - - tid = assign_tid - if tid == -1: - tid = len(self.table_create_infos) - self._register_new_table_info(new_table_info) - else: - self._merge_new_table_info(new_table_info, tid) - - self.table_to_slot[tid].append(len(self.in_slot_size_group)) - self.table_to_output_slots[tid].append(in_slot_size) - self.in_slot_size_group.append(in_slot_size) - self.slot_to_table.append(tid) - - def _map_table_infos(self, user_defined_table_infos, assign_groups): - self.table_create_infos = [] - assign_groups_flat = reduce(lambda a, b: a+b, assign_groups, []) - sid_to_gid = reduce(lambda a, b: {**a, **b}, - [{sid: gid for sid in group} - for gid, group in enumerate(assign_groups)], {}) - gid_to_tid = dict() - for sid, table_info in enumerate(user_defined_table_infos): - if sid in assign_groups_flat: - gid = sid_to_gid.get(sid) - if gid in gid_to_tid: - self._register_table_info(table_info, assign_tid=gid_to_tid.get(gid)) - else: - tid = len(self.table_create_infos) - self._register_table_info(table_info, assign_tid=-1) - gid_to_tid[gid] = tid - else: - self._register_table_info(table_info, assign_tid=-1) - return self.table_create_infos - - -# no slot merge -class NoneTableMapPolicy(BaseTableMapPolicy): - def map_table_infos(self, user_defined_table_infos): - return self._map_table_infos(user_defined_table_infos, self.assign_groups) - - -# merge slot by user's assign_groups -class AutoMergeTableMapPolicy(BaseTableMapPolicy): - def map_table_infos(self, user_defined_table_infos): - assign_groups_flat = reduce(lambda a, b: a+b, self.assign_groups, []) - new_assign_groups = [] - for sid, table_info in enumerate(user_defined_table_infos): - if sid in assign_groups_flat: - continue - gid = -1 - if user_defined_table_infos[sid]['allow_merge']: - for ngid, group in enumerate(new_assign_groups): - if self._is_equal_table_info(user_defined_table_infos[group[0]], table_info) \ - and user_defined_table_infos[group[0]]['allow_merge']: - gid = ngid - break - if gid == -1: - gid = len(new_assign_groups) - 
new_assign_groups.append([]) - new_assign_groups[gid].append(sid) - new_assign_groups = self.assign_groups + new_assign_groups - return self._map_table_infos(user_defined_table_infos, new_assign_groups) diff --git a/tf_adapter/python/npu_bridge/embedding/tf_path.py b/tf_adapter/python/npu_bridge/embedding/tf_path.py deleted file mode 100644 index a5717e652..000000000 --- a/tf_adapter/python/npu_bridge/embedding/tf_path.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from tensorflow.python.eager import context -from tensorflow.python.framework import ops -from tensorflow.python.ops import resource_variable_ops -from tensorflow.python.ops import variables -from tensorflow.python.training import optimizer as embeddingOptimizer -from npu_bridge.embedding.embedding_resource import NpuEmbeddingResource - - -class _NpuEmbeddingResourceProcessor(embeddingOptimizer._OptimizableVariable): - """Processor for dense NpuEmbeddingResourceProcessor.""" - - def __init__(self, v): - self._v = v - - def target(self): - return self._v - - def update_op(self, optimizer, g): - return optimizer._resource_apply_sparse(g.values, self._v, g.indices) - - -def _get_processor(v): - """The processor of v.""" - if context.executing_eagerly(): - if isinstance(v, ops.Tensor): - return embeddingOptimizer._TensorProcessor(v) - else: - return embeddingOptimizer._DenseResourceVariableProcessor(v) - if isinstance(v, NpuEmbeddingResource): - return _NpuEmbeddingResourceProcessor(v) - if resource_variable_ops.is_resource_variable(v) and not v._in_graph_mode: # pylint: disable=protected-access - # True if and only if `v` was initialized eagerly. - return embeddingOptimizer._DenseResourceVariableProcessor(v) - if v.op.type == "VarHandleOp": - return embeddingOptimizer._DenseResourceVariableProcessor(v) - if isinstance(v, variables.Variable): - return embeddingOptimizer._RefVariableProcessor(v) - if isinstance(v, ops.Tensor): - return embeddingOptimizer._TensorProcessor(v) - - raise NotImplementedError("Trying to optimize unsupported type ", v) - - -def path_on_tf(): - embeddingOptimizer._get_processor = _get_processor - - diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py index 6004fecb7..7209e0953 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py @@ -105,7 +105,6 @@ class NPURunConfig(run_config_lib.RunConfig): stream_sync_timeout=-1, event_sync_timeout=-1, external_weight=False, - es_cluster_config=None, deterministic=0, frozen_variable=False, variable_placement="Device" @@ -166,7 +165,6 @@ class NPURunConfig(run_config_lib.RunConfig): experimental_config: The experimental configuration. 
topo_sorting_mode: Provides an interface for users to customize topology sorting. external_weight: Whether convert const to fileconstant and save weight to file. - es_cluster_config: esClusterConfig from user input in embedding service. frozen_variable: Whether folding constant variables variable_placement: Process variable on host or device """ @@ -256,7 +254,6 @@ class NPURunConfig(run_config_lib.RunConfig): self.stream_sync_timeout = stream_sync_timeout self.event_sync_timeout = event_sync_timeout self._external_weight = external_weight - self.es_cluster_config = es_cluster_config super(NPURunConfig, self).__init__( model_dir=model_dir, diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py index c3199d8fb..9db0f67af 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py @@ -768,8 +768,6 @@ class NPUEstimator(estimator_lib.Estimator): custom_op.parameter_map["topo_sorting_mode"].i = config.topo_sorting_mode if config.insert_op_file is not None: custom_op.parameter_map["insert_op_file"].s = config.insert_op_file - if config.es_cluster_config is not None: - custom_op.parameter_map["es_cluster_config"].s = tf.compat.as_bytes(config.es_cluster_config) custom_op.parameter_map["stream_sync_timeout"].i = config.stream_sync_timeout custom_op.parameter_map["event_sync_timeout"].i = config.event_sync_timeout custom_op.parameter_map["external_weight"].b = config._external_weight diff --git a/tf_adapter/python/npu_bridge/npu_cpu/npu_cpu_ops.py b/tf_adapter/python/npu_bridge/npu_cpu/npu_cpu_ops.py index cf43c5536..1e91cbb8e 100644 --- a/tf_adapter/python/npu_bridge/npu_cpu/npu_cpu_ops.py +++ b/tf_adapter/python/npu_bridge/npu_cpu/npu_cpu_ops.py @@ -283,102 +283,3 @@ def non_zero_with_value_shape(value, index, count): index=index, count=count) return result - - -class ESWorker: - """ Embedding service class. """ - def __init__(self, es_cluster_config): - with open(es_cluster_config, encoding='utf-8') as a: - es_cluster_config_json = json.load(a) - self._es_cluster_conf = json.dumps(es_cluster_config_json) - self._ps_num = int(es_cluster_config_json["psNum"]) - self._embedding_dim = -1 - self._max_num = -1 - self._ps_ids = [] - self._ps_ids_list = es_cluster_config_json["psCluster"] - for each_ps in self._ps_ids_list: - self._ps_ids.append(each_ps["id"]) - - config = tf.ConfigProto() - custom_op = config.graph_options.rewrite_options.custom_optimizers.add() - custom_op.name = "NpuOptimizer" - custom_op.parameter_map["es_cluster_config"].s = tf.compat.as_bytes(self._es_cluster_conf) - self.es_all_config = config - - ## 提供embedding init功能 - # @param bucket_size int 类型 - # @param file_path string 类型 - # @param file_name string 类型 - # @param table_id uint32 类型 - # @param embedding_dim uint32 类型 - # @param max_batch_size uint32 类型 - def embedding_init(self, bucket_size, file_path, file_name, table_id, embedding_dim, max_batch_size): - """ Operator for init embedding table. 
""" - self._embedding_dim = embedding_dim - self._max_num = max_batch_size - ps_num = tf.constant(self._ps_num, dtype=tf.uint32, name='ps_num') - ps_ids = tf.constant(self._ps_ids, dtype=tf.uint32, name='ps_ids') - ps_engine_values = "PS" - ps_engine = attr_value_pb2.AttrValue(s=compat.as_bytes(ps_engine_values)) - ps_num.op._set_attr("_process_node_engine_id", ps_engine) - ps_ids.op._set_attr("_process_node_engine_id", ps_engine) - init_partition_map = gen_npu_cpu_ops.init_partition_map(ps_num=ps_num, - ps_ids=ps_ids) - table_id = tf.constant(table_id, dtype=tf.uint32, name='table_id') - table_id.op._set_attr("_process_node_engine_id", ps_engine) - with tf.control_dependencies([init_partition_map]): - init_embedding_hash_map = gen_npu_cpu_ops.init_embedding_hashmap(table_id=table_id, bucket_size=bucket_size) - file_name = tf.constant(file_name, dtype=tf.string, name='file_name') - file_path = tf.constant(file_path, dtype=tf.string, name='file_path') - embedding_dim = tf.constant(embedding_dim, dtype=tf.uint32, name="embedding_dim") - ps_id = -1 - ps_id = tf.constant(ps_id, dtype=tf.uint32, name='ps_id') - ps_id.op._set_attr("_process_node_engine_id", ps_engine) - file_name.op._set_attr("_process_node_engine_id", ps_engine) - file_path.op._set_attr("_process_node_engine_id", ps_engine) - embedding_dim.op._set_attr("_process_node_engine_id", ps_engine) - with tf.control_dependencies([init_embedding_hash_map]): - embedding_table_import = gen_npu_cpu_ops.embedding_table_import(file_path=file_path, - file_name=file_name, - ps_id=ps_id, - table_id=table_id, - embedding_dim=embedding_dim) - init_partition_map._set_attr("_process_node_engine_id", ps_engine) - init_embedding_hash_map._set_attr("_process_node_engine_id", ps_engine) - embedding_table_import._set_attr("_process_node_engine_id", ps_engine) - execute_times_value = 1 - execute_times = attr_value_pb2.AttrValue(i=execute_times_value) - embedding_table_import._set_attr("_execute_times", execute_times) - embedding_dim_value = 1 - embedding_dim = attr_value_pb2.AttrValue(i=embedding_dim_value) - embedding_table_import._set_attr("_embedding_dim", embedding_dim) - max_num_value = self._max_num - max_num = attr_value_pb2.AttrValue(i=max_num_value) - embedding_table_import._set_attr("_max_num", max_num) - deploy_inject_config_value = self._es_cluster_conf - deploy_inject_config = attr_value_pb2.AttrValue(s=compat.as_bytes(deploy_inject_config_value)) - embedding_table_import._set_attr("_deploy_inject_config", deploy_inject_config) - result = embedding_table_import - return result - - # 提供embedding lookup功能 - # @param table_id uint32 类型 - # @param input_ids uint64 类型 - # @return values float32 类型 - def embedding_look_up(self, table_id, input_ids): - """ Operator for look up in embedding table. 
""" - table_id = tf.constant(table_id, dtype=tf.uint32, name="table_id") - result = gen_npu_cpu_ops.embedding_table_find(table_id=table_id, - keys=input_ids, - embedding_dim=self._embedding_dim) - max_num_value = self._max_num - max_num = attr_value_pb2.AttrValue(i=max_num_value) - result.op._set_attr("_max_num", max_num) - if self._embedding_dim == -1: - self._embedding_dim = 4 - embedding_dim_value = attr_value_pb2.AttrValue(i=self._embedding_dim) - result.op._set_attr("_embedding_dim", embedding_dim_value) - deploy_inject_config_value = self._es_cluster_conf - deploy_inject_config = attr_value_pb2.AttrValue(s=compat.as_bytes(deploy_inject_config_value)) - result.op._set_attr("_deploy_inject_config", deploy_inject_config) - return result \ No newline at end of file diff --git a/tf_adapter/tests/st/kernels/pbtxt/geop.pbtxt b/tf_adapter/tests/st/kernels/pbtxt/geop.pbtxt index 5fd416577..58575959f 100644 --- a/tf_adapter/tests/st/kernels/pbtxt/geop.pbtxt +++ b/tf_adapter/tests/st/kernels/pbtxt/geop.pbtxt @@ -113,24 +113,6 @@ node { s: "dynamic_execute" } } - attr { - key: "_execute_times" - value { - s: "2" - } - } - attr { - key: "_max_num" - value { - s: "1" - } - } - attr { - key: "_embedding_dim" - value { - s: "1" - } - } attr { key: "_dynamic_input" value { diff --git a/tf_adapter/tests/st/optimizers/testcase/get_attr_optimize_pass_test.cc b/tf_adapter/tests/st/optimizers/testcase/get_attr_optimize_pass_test.cc index a6fee094f..0649fc74a 100644 --- a/tf_adapter/tests/st/optimizers/testcase/get_attr_optimize_pass_test.cc +++ b/tf_adapter/tests/st/optimizers/testcase/get_attr_optimize_pass_test.cc @@ -210,9 +210,6 @@ TEST_F(GetAttrOptimizationPassTest, SetAttrTest) { AttrValue insert_op_file = AttrValue(); insert_op_file.set_s("aipp.cfg"); (*custom_config->mutable_parameter_map())["insert_op_file"] = insert_op_file; - AttrValue es_cluster_config = AttrValue(); - es_cluster_config.set_s("es"); - (*custom_config->mutable_parameter_map())["es_cluster_config"] = es_cluster_config; AttrValue external_weight = AttrValue(); external_weight.set_b(true); (*custom_config->mutable_parameter_map())["external_weight"] = external_weight; diff --git a/tf_adapter/tests/ut/kernels/pbtxt/geop.pbtxt b/tf_adapter/tests/ut/kernels/pbtxt/geop.pbtxt index 0409c9ba4..58575959f 100644 --- a/tf_adapter/tests/ut/kernels/pbtxt/geop.pbtxt +++ b/tf_adapter/tests/ut/kernels/pbtxt/geop.pbtxt @@ -113,30 +113,6 @@ node { s: "dynamic_execute" } } - attr { - key: "_deploy_inject_config" - value { - s: "deploy_inject_config" - } - } - attr { - key: "_execute_times" - value { - s: "2" - } - } - attr { - key: "_max_num" - value { - s: "1" - } - } - attr { - key: "_embedding_dim" - value { - s: "1" - } - } attr { key: "_dynamic_input" value { diff --git a/tf_adapter/tests/ut/optimizers/testcase/get_attr_optimize_pass_test.cc b/tf_adapter/tests/ut/optimizers/testcase/get_attr_optimize_pass_test.cc index 151838b58..0649fc74a 100644 --- a/tf_adapter/tests/ut/optimizers/testcase/get_attr_optimize_pass_test.cc +++ b/tf_adapter/tests/ut/optimizers/testcase/get_attr_optimize_pass_test.cc @@ -210,9 +210,6 @@ TEST_F(GetAttrOptimizationPassTest, SetAttrTest) { AttrValue insert_op_file = AttrValue(); insert_op_file.set_s("aipp.cfg"); (*custom_config->mutable_parameter_map())["insert_op_file"] = insert_op_file; - AttrValue es_cluster_config = AttrValue(); - es_cluster_config.set_s("esclusterconfig.json"); - (*custom_config->mutable_parameter_map())["es_cluster_config"] = es_cluster_config; AttrValue external_weight = AttrValue(); 
external_weight.set_b(true); (*custom_config->mutable_parameter_map())["external_weight"] = external_weight; diff --git a/tf_adapter/util/npu_attrs.cc b/tf_adapter/util/npu_attrs.cc index 45de04adf..a071d8091 100644 --- a/tf_adapter/util/npu_attrs.cc +++ b/tf_adapter/util/npu_attrs.cc @@ -576,7 +576,6 @@ std::map NpuAttrs::GetInitOptions(const OpKernelConstr std::string aoe_config_file; std::string stream_sync_timeout = "-1"; std::string event_sync_timeout = "-1"; - std::string es_cluster_config; if (ctx != nullptr && ctx->GetAttr("_NpuOptimizer", &npuOptimizer) == Status::OK()) { (void) ctx->GetAttr("_precision_mode", &precision_mode); @@ -615,7 +614,6 @@ std::map NpuAttrs::GetInitOptions(const OpKernelConstr (void) ctx->GetAttr("_aoe_config_file", &aoe_config_file); (void) ctx->GetAttr("_stream_sync_timeout", &stream_sync_timeout); (void) ctx->GetAttr("_event_sync_timeout", &event_sync_timeout); - (void) ctx->GetAttr("_es_cluster_config", &es_cluster_config); } if (precision_mode.empty()) { @@ -666,7 +664,6 @@ std::map NpuAttrs::GetInitOptions(const OpKernelConstr init_options_["ge.aoe_config_file"] = aoe_config_file; init_options_["stream_sync_timeout"] = stream_sync_timeout; init_options_["event_sync_timeout"] = event_sync_timeout; - init_options_["ge.esClusterConfig"] = es_cluster_config; return init_options_; } @@ -1067,7 +1064,6 @@ std::map NpuAttrs::GetAllAttrOptions(const AttrSlice & std::string stream_sync_timeout = "-1"; std::string event_sync_timeout = "-1"; std::string external_weight = "0"; - std::string es_cluster_config; std::string graph_parallel_option_path; std::string enable_graph_parallel; @@ -1147,7 +1143,6 @@ std::map NpuAttrs::GetAllAttrOptions(const AttrSlice & auto model_deploy_devicelist_value = attrs.Find("_model_deploy_devicelist"); auto topo_sorting_mode_value = attrs.Find("_topo_sorting_mode"); auto insert_op_file_value = attrs.Find("_insert_op_file"); - auto es_cluster_config_value = attrs.Find("_es_cluster_config"); auto resource_config_path_value = attrs.Find("_resource_config_path"); auto aoe_config_file_value = attrs.Find("_aoe_config_file"); auto stream_sync_timeout_value = attrs.Find("_stream_sync_timeout"); @@ -1428,9 +1423,6 @@ std::map NpuAttrs::GetAllAttrOptions(const AttrSlice & if (external_weight_value != nullptr) { external_weight = external_weight_value->s(); } - if (es_cluster_config_value != nullptr) { - es_cluster_config = es_cluster_config_value->s(); - } } all_options["variable_format_optimize"] = variable_format_optimize; @@ -1518,8 +1510,6 @@ std::map NpuAttrs::GetAllAttrOptions(const AttrSlice & all_options["ge.topoSortingMode"] = topo_sorting_mode; all_options["insert_op_file"] = insert_op_file; all_options["ge.insertOpFile"] = insert_op_file; - all_options["es_cluster_config"] = es_cluster_config; - all_options["ge.esClusterConfig"] = es_cluster_config; all_options["resource_config_path"] = resource_config_path; all_options["ge.aoe_config_file"] = aoe_config_file; all_options["aoe_config_file"] = aoe_config_file; @@ -2063,11 +2053,6 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options if (params.count("external_weight") > 0) { external_weight = params.at("external_weight").b(); } - if (params.count("es_cluster_config") > 0) { - std::string es_cluster_config = params.at("es_cluster_config").s(); - init_options_["es_cluster_config"] = es_cluster_config; - init_options_["ge.esClusterConfig"] = es_cluster_config; - } if (params.count("frozen_variable") > 0) { frozen_variable = 
params.at("frozen_variable").b(); } -- Gitee From eb4c01a3f7e893cbc855dde69c8ef037e5dfc096 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=99=86=E8=A1=A1?= Date: Thu, 16 Mar 2023 09:19:52 +0000 Subject: [PATCH 02/22] =?UTF-8?q?!2109=20=E4=BF=AE=E5=A4=8D=E6=B7=B7?= =?UTF-8?q?=E5=90=88=E8=AE=A1=E7=AE=97=E7=9A=84bug+=E5=88=A0=E9=99=A4estim?= =?UTF-8?q?ator=E6=A8=A1=E5=BC=8F=E4=B8=8B=E5=BA=9F=E5=BC=83=E5=8F=82?= =?UTF-8?q?=E6=95=B0=E9=BB=98=E8=AE=A4=E5=80=BC=20Merge=20pull=20request?= =?UTF-8?q?=20!2109=20from=20=E9=99=86=E8=A1=A1/cherry-pick-1678956907?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tf_adapter/interface_spec/api_npu_config.pyh | 2 +- tf_adapter/optimizers/om_partition_subgraphs_pass.cc | 4 +++- tf_adapter/python/npu_bridge/estimator/npu/npu_config.py | 2 +- tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py | 3 ++- tf_adapter/util/util.cc | 1 - 5 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tf_adapter/interface_spec/api_npu_config.pyh b/tf_adapter/interface_spec/api_npu_config.pyh index 718db2ac6..815cd35a9 100644 --- a/tf_adapter/interface_spec/api_npu_config.pyh +++ b/tf_adapter/interface_spec/api_npu_config.pyh @@ -12,7 +12,7 @@ class NPURunConfig(run_config_lib.RunConfig): enable_exception_dump=0, op_select_implmode=None, optypelist_for_implmode=None, dynamic_input_config=None, aoe_mode=None, work_path=None, buffer_optimize="l2_optimize", enable_small_channel=0, fusion_switch_file=None, enable_compress_weight=False, compress_weight_conf=None, - op_compiler_cache_mode=None, op_compiler_cache_dir=None, debug_dir=None, hcom_multi_mode=False, dynamic_input=False, + op_compiler_cache_mode=None, op_compiler_cache_dir=None, debug_dir=None, hcom_multi_mode=False, dynamic_input=None, dynamic_graph_execute_mode="dynamic_execute", dynamic_inputs_shape_range=None, train_distribute=None, eval_distribute=None, local_rank_id=None, local_device_list=None, session_device_id=None, distribute_config=None, modify_mixlist=None, op_precision_mode=None, device_type="default_device_type", diff --git a/tf_adapter/optimizers/om_partition_subgraphs_pass.cc b/tf_adapter/optimizers/om_partition_subgraphs_pass.cc index bb54e52d8..66fd524ad 100644 --- a/tf_adapter/optimizers/om_partition_subgraphs_pass.cc +++ b/tf_adapter/optimizers/om_partition_subgraphs_pass.cc @@ -2244,7 +2244,9 @@ Status OMPartitionSubgraphsPass::ProcessGraph(std::unique_ptr *graph, Fun return Status::OK(); } if (mix_compile_mode) { - TF_RETURN_IF_ERROR(CopyVarsBetweenGeOp(graph_in)); + if (pass_options["variable_location"] != "Host") { + TF_RETURN_IF_ERROR(CopyVarsBetweenGeOp(graph_in)); + } TF_RETURN_IF_ERROR(CopyConstBetweenGeOp(graph_in)); } diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py index 7209e0953..50536b343 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py @@ -78,7 +78,7 @@ class NPURunConfig(run_config_lib.RunConfig): op_compiler_cache_dir=None, debug_dir=None, hcom_multi_mode=False, - dynamic_input=False, + dynamic_input=None, dynamic_graph_execute_mode="dynamic_execute", dynamic_inputs_shape_range=None, train_distribute=None, diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py index 9db0f67af..eff80548b 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py +++ 
b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py @@ -740,7 +740,8 @@ class NPUEstimator(estimator_lib.Estimator): if config._debug_dir is not None: custom_op.parameter_map["debug_dir"].s = tf.compat.as_bytes(config._debug_dir) custom_op.parameter_map["hcom_multi_mode"].b = config._hcom_multi_mode - custom_op.parameter_map["dynamic_input"].b = config._dynamic_input + if config._dynamic_input is not None: + custom_op.parameter_map["dynamic_input"].b = config._dynamic_input custom_op.parameter_map["dynamic_graph_execute_mode"].s = tf.compat.as_bytes(config._dynamic_graph_execute_mode) if config._dynamic_inputs_shape_range is not None: custom_op.parameter_map["dynamic_inputs_shape_range"].s = tf.compat.as_bytes( diff --git a/tf_adapter/util/util.cc b/tf_adapter/util/util.cc index f7a1e8c81..8cb0b1fc3 100644 --- a/tf_adapter/util/util.cc +++ b/tf_adapter/util/util.cc @@ -115,7 +115,6 @@ bool IsVariableOrResourceVariable(const Node * const node) { bool IsVariableExecuteOnHost(const Node * const node, const std::string &variable_location) { if (variable_location == "Host" && IsVariableOrResourceVariable(node)) { - ADP_LOG(INFO) << "Node : " << node->name() << " op name : " << node->type_string() << "is execute on host"; return true; } return false; -- Gitee From e21c15f373b1bfb8b55d2cd53bdc6b5034070778 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E5=BC=BA?= Date: Thu, 16 Mar 2023 09:28:55 +0000 Subject: [PATCH 03/22] =?UTF-8?q?!2110=20OBP=20=E5=95=86=E5=88=86=E4=BF=AE?= =?UTF-8?q?=E5=A4=8D=20=E5=88=A0=E9=99=A4embedding=E5=BC=95=E5=85=A5?= =?UTF-8?q?=E7=9A=84=E9=97=AE=E9=A2=98=20Merge=20pull=20request=20!2110=20?= =?UTF-8?q?from=20=E5=88=98=E5=BC=BA/lq=5F316=5Flast?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tf_adapter/python/npu_bridge/npu_init.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tf_adapter/python/npu_bridge/npu_init.py b/tf_adapter/python/npu_bridge/npu_init.py index 7fce9a40e..02f31f04c 100644 --- a/tf_adapter/python/npu_bridge/npu_init.py +++ b/tf_adapter/python/npu_bridge/npu_init.py @@ -65,7 +65,6 @@ from hccl.manage.api import get_world_rank_from_group_rank from hccl.manage.api import get_group_rank_from_world_rank from hccl.split.api import set_split_strategy_by_idx from hccl.split.api import set_split_strategy_by_size -from npu_bridge import embedding as npu_embedding import tensorflow as tf -- Gitee From 52c48c2a38c5443b64419ffd083226fcdf50a05c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AD=8F=E4=B8=B9=E4=B8=B9?= Date: Thu, 16 Mar 2023 12:48:37 +0000 Subject: [PATCH 04/22] =?UTF-8?q?!2112=20=E6=B8=85=E7=90=86=E5=91=8A?= =?UTF-8?q?=E8=AD=A6=20Merge=20pull=20request=20!2112=20from=20=E9=AD=8F?= =?UTF-8?q?=E4=B8=B9=E4=B8=B9/flc29?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tf_adapter/kernels/aicpu/host_queue_dataset_op.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tf_adapter/kernels/aicpu/host_queue_dataset_op.cc b/tf_adapter/kernels/aicpu/host_queue_dataset_op.cc index b46c6e0e8..f661ffdc5 100644 --- a/tf_adapter/kernels/aicpu/host_queue_dataset_op.cc +++ b/tf_adapter/kernels/aicpu/host_queue_dataset_op.cc @@ -597,8 +597,8 @@ class HostQueueDatasetOp : public DatasetOpKernel { ADP_LOG(INFO) << "Slave SendDataThread exit."; } - void RecordMbufQueueBytes(const bool is_hold_type, const uint64_t args_total_bytes) { - if (!is_hold_type) { return; } + void RecordMbufQueueBytes(const bool is_hold, const uint64_t args_total_bytes) 
{ + if (!is_hold) { return; } mbuf_queue_rear_ = (mbuf_queue_rear_ + 1) % kStringTypeDepth; mbuf_queue_bytes_[mbuf_queue_rear_] = args_total_bytes; } @@ -629,7 +629,7 @@ class HostQueueDatasetOp : public DatasetOpKernel { Status status = Status::OK(); bool is_need_resend = false; - while(!finish_send_) { + while (!finish_send_) { if (IsHoldDataTrans()) { auto start = std::chrono::high_resolution_clock::now(); auto end = start + std::chrono::microseconds(kSleepDuration); -- Gitee From 3464689d58eab9a8f2b04862a8c9ef0585061a93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=99=86=E8=A1=A1?= Date: Tue, 21 Mar 2023 07:22:49 +0000 Subject: [PATCH 05/22] =?UTF-8?q?!2135=20frozen=20variable=E5=A2=9E?= =?UTF-8?q?=E5=8A=A0=E6=A0=A1=E9=AA=8C=20Merge=20pull=20request=20!2135=20?= =?UTF-8?q?from=20=E9=99=86=E8=A1=A1/cherry-pick-1679301364?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../testcase/frozen_variable_pass_test.cc | 13 +++-- .../tests/st/util/testcase/npu_attrs_test.cc | 8 +++- .../testcase/frozen_variable_pass_test.cc | 47 ++++++++++++------- .../tests/ut/util/testcase/npu_attrs_test.cc | 14 ++++-- tf_adapter/util/npu_attrs.cc | 19 ++++++-- 5 files changed, 71 insertions(+), 30 deletions(-) diff --git a/tf_adapter/tests/st/optimizers/testcase/frozen_variable_pass_test.cc b/tf_adapter/tests/st/optimizers/testcase/frozen_variable_pass_test.cc index 1176d7738..b16612497 100644 --- a/tf_adapter/tests/st/optimizers/testcase/frozen_variable_pass_test.cc +++ b/tf_adapter/tests/st/optimizers/testcase/frozen_variable_pass_test.cc @@ -84,7 +84,7 @@ public: return strings::StrCat(absl::StrJoin(edges, ";")); } - string DoRunFrozenVariablePassTest(bool need_frozen) { + string DoRunFrozenVariablePassTest(bool need_frozen, const string &placement) { string before = CanonicalGraphString(graph_.get()); LOG(INFO) << "Before replace variable pass: " << before; @@ -95,7 +95,10 @@ public: custom_config->set_name("NpuOptimizer"); AttrValue is_need_frozen = AttrValue(); is_need_frozen.set_b(need_frozen); + AttrValue variable_placement = AttrValue(); + variable_placement.set_s(placement); (*custom_config->mutable_parameter_map())["frozen_variable"] = is_need_frozen; + (*custom_config->mutable_parameter_map())["variable_placement"] = variable_placement; options.session_options = &session_options; options.graph = ug; FunctionLibraryDefinition flib_def((*ug)->flib_def()); @@ -127,21 +130,21 @@ TEST_F(FrozenVariablePassTest, frozen_variable_true) { string org_graph_def_path = "tf_adapter/tests/st/optimizers/pbtxt/om_test_build_geop.pbtxt"; InitGraph(org_graph_def_path); std::string target_graph = "Const->Add;Add->_Retval;PartitionedCall->Add:1"; - EXPECT_EQ(DoRunFrozenVariablePassTest(true), target_graph); + EXPECT_EQ(DoRunFrozenVariablePassTest(true, "Host"), target_graph); } TEST_F(FrozenVariablePassTest, frozen_variable_false) { string org_graph_def_path = "tf_adapter/tests/st/optimizers/pbtxt/om_test_build_geop.pbtxt"; InitGraph(org_graph_def_path); std::string target_graph = "VariableV2->Identity;Const->Add;Identity->Add:1;Add->_Retval"; - EXPECT_EQ(DoRunFrozenVariablePassTest(false), target_graph); + EXPECT_EQ(DoRunFrozenVariablePassTest(false, "Host"), target_graph); } TEST_F(FrozenVariablePassTest, frozen_varhandleop_true) { string org_graph_def_path = "tf_adapter/tests/st/optimizers/pbtxt/varhandleop_test.pbtxt"; InitGraph(org_graph_def_path); std::string target_graph = "Const->Add;Add->_Retval;PartitionedCall->Add:1"; - 
EXPECT_EQ(DoRunFrozenVariablePassTest(true), target_graph); + EXPECT_EQ(DoRunFrozenVariablePassTest(true, "Host"), target_graph); } TEST_F(FrozenVariablePassTest, frozen_no_variable_true) { @@ -151,7 +154,7 @@ TEST_F(FrozenVariablePassTest, frozen_no_variable_true) { "Less->LoopCond;Merge->Switch;LoopCond->Switch:1;Switch->Exit;Exit->_Retval;" "Switch:1->Identity;Identity:control->Const:control;Const->Add;Identity->Add:1;" "Add->NextIteration;NextIteration->Merge:1"; - EXPECT_EQ(DoRunFrozenVariablePassTest(true), target_graph); + EXPECT_EQ(DoRunFrozenVariablePassTest(true, "Host"), target_graph); } } // end namespace } \ No newline at end of file diff --git a/tf_adapter/tests/st/util/testcase/npu_attrs_test.cc b/tf_adapter/tests/st/util/testcase/npu_attrs_test.cc index 1b9b91e13..affdb61c4 100644 --- a/tf_adapter/tests/st/util/testcase/npu_attrs_test.cc +++ b/tf_adapter/tests/st/util/testcase/npu_attrs_test.cc @@ -11,6 +11,7 @@ namespace tensorflow { Status CheckOpImplMode(const string &op_select_implmode); +Status CheckVariablePlacement(const std::string &variable_placement); namespace { class NpuAttrTest : public testing::Test { protected: @@ -64,7 +65,7 @@ TEST_F(NpuAttrTest, SplitTest) { Split(s, res, ","); EXPECT_EQ(res[2], "c"); } -TEST_F(NpuAttrTest, SetNpuOptimizerAttr) { +TEST_F(NpuAttrTest, CheckOpImplMode) { Status s = CheckOpImplMode("xxx"); EXPECT_EQ(s.ok(), false); } @@ -100,6 +101,11 @@ TEST_F(NpuAttrTest, CheckPrecisionMode ) { EXPECT_EQ(s.ok(), false); } +TEST_F(NpuAttrTest, CheckVariablePlacement) { + Status s = CheckVariablePlacement("sss"); + EXPECT_EQ(s.ok(), false); +} + TEST_F(NpuAttrTest, GetDumpPath) { setenv("DUMP_GRAPH_PATH", "./", 1); string path = GetDumpPath(); diff --git a/tf_adapter/tests/ut/optimizers/testcase/frozen_variable_pass_test.cc b/tf_adapter/tests/ut/optimizers/testcase/frozen_variable_pass_test.cc index 1176d7738..8dc75d988 100644 --- a/tf_adapter/tests/ut/optimizers/testcase/frozen_variable_pass_test.cc +++ b/tf_adapter/tests/ut/optimizers/testcase/frozen_variable_pass_test.cc @@ -16,20 +16,28 @@ namespace tensorflow { class FrozenVariablePass : public GraphOptimizationPass { public: FrozenVariablePass() = default; + ~FrozenVariablePass() override = default; + Status Run(const GraphOptimizationPassOptions &options) override; + private: std::map GetGraphConfigs(const Graph &graph); + Status DoConstantFolding(const GraphOptimizationPassOptions &options, const uint64_t index); - bool IsAllOutputsIdentity(const Node * const node); - bool IsAllOutputsReadOp(const Node * const node); - bool IsNeedBuildPartitionedCall(const Node * const node); + + bool IsAllOutputsIdentity(const Node *const node); + + bool IsAllOutputsReadOp(const Node *const node); + + bool IsNeedBuildPartitionedCall(const Node *const node); }; namespace { class FrozenVariablePassTest : public testing::Test { -public: + public: FrozenVariablePassTest() : graph_(absl::make_unique(OpRegistry::Global())) {} + static void InitGraph(const string &graph_def_path, Graph *graph) { GraphDef graph_def; ReadTextProto(Env::Default(), graph_def_path, &graph_def); @@ -38,7 +46,7 @@ public: } void InitGraph(const string &graph_def_path) { - char trusted_path[MMPA_MAX_PATH] = { "\0" }; + char trusted_path[MMPA_MAX_PATH] = {"\0"}; if (mmRealPath(graph_def_path.c_str(), trusted_path, MMPA_MAX_PATH) != EN_OK) { LOG(ERROR) << "Get real path failed."; return; @@ -50,7 +58,7 @@ public: static bool IncludeNode(const Node *n) { return n->IsOp(); } - static string EdgeId(const Node* n, int index) { + static 
string EdgeId(const Node *n, int index) { if (index == 0) { return n->type_string(); } else if (index == Graph::kControlSlot) { @@ -60,8 +68,8 @@ public: } } - string CanonicalGraphString(Graph* g) { - for (Node* n : g->nodes()) { + string CanonicalGraphString(Graph *g) { + for (Node *n : g->nodes()) { if (IncludeNode(n)) { if (n->type_string() == "Add" && n->assigned_device_name().empty()) { n->set_assigned_device_name("/job:localhost/replica:0/task:0/device:CPU:0"); @@ -74,7 +82,7 @@ public: } std::vector edges; - for (const Edge* e : g->edges()) { + for (const Edge *e : g->edges()) { if (IncludeNode(e->src()) && IncludeNode(e->dst())) { edges.push_back(strings::StrCat(EdgeId(e->src(), e->src_output()), "->", EdgeId(e->dst(), e->dst_input()))); @@ -84,28 +92,32 @@ public: return strings::StrCat(absl::StrJoin(edges, ";")); } - string DoRunFrozenVariablePassTest(bool need_frozen) { + string DoRunFrozenVariablePassTest(bool need_frozen, const string &placement) { string before = CanonicalGraphString(graph_.get()); LOG(INFO) << "Before replace variable pass: " << before; std::unique_ptr *ug = &graph_; GraphOptimizationPassOptions options; SessionOptions session_options; - auto *custom_config = session_options.config.mutable_graph_options()->mutable_rewrite_options()->add_custom_optimizers(); + auto *custom_config = + session_options.config.mutable_graph_options()->mutable_rewrite_options()->add_custom_optimizers(); custom_config->set_name("NpuOptimizer"); AttrValue is_need_frozen = AttrValue(); is_need_frozen.set_b(need_frozen); + AttrValue variable_placement = AttrValue(); + variable_placement.set_s(placement); (*custom_config->mutable_parameter_map())["frozen_variable"] = is_need_frozen; + (*custom_config->mutable_parameter_map())["variable_placement"] = variable_placement; options.session_options = &session_options; options.graph = ug; FunctionLibraryDefinition flib_def((*ug)->flib_def()); options.flib_def = &flib_def; DeviceSet device_set; - DeviceFactory* cpu_factory = DeviceFactory::GetFactory("CPU"); + DeviceFactory *cpu_factory = DeviceFactory::GetFactory("CPU"); std::vector> devices; cpu_factory->CreateDevices( - session_options, "/job:localhost/replica:0/task:0", &devices); + session_options, "/job:localhost/replica:0/task:0", &devices); device_set.AddDevice(devices.begin()->get()); options.device_set = &device_set; FrozenVariablePass().Run(options); @@ -120,6 +132,7 @@ public: string original_; protected: virtual void SetUp() { *const_cast(&kDumpGraph) = true; } + virtual void TearDown() {} }; @@ -127,21 +140,21 @@ TEST_F(FrozenVariablePassTest, frozen_variable_true) { string org_graph_def_path = "tf_adapter/tests/st/optimizers/pbtxt/om_test_build_geop.pbtxt"; InitGraph(org_graph_def_path); std::string target_graph = "Const->Add;Add->_Retval;PartitionedCall->Add:1"; - EXPECT_EQ(DoRunFrozenVariablePassTest(true), target_graph); + EXPECT_EQ(DoRunFrozenVariablePassTest(true, "Host"), target_graph); } TEST_F(FrozenVariablePassTest, frozen_variable_false) { string org_graph_def_path = "tf_adapter/tests/st/optimizers/pbtxt/om_test_build_geop.pbtxt"; InitGraph(org_graph_def_path); std::string target_graph = "VariableV2->Identity;Const->Add;Identity->Add:1;Add->_Retval"; - EXPECT_EQ(DoRunFrozenVariablePassTest(false), target_graph); + EXPECT_EQ(DoRunFrozenVariablePassTest(false, "Host"), target_graph); } TEST_F(FrozenVariablePassTest, frozen_varhandleop_true) { string org_graph_def_path = "tf_adapter/tests/st/optimizers/pbtxt/varhandleop_test.pbtxt"; InitGraph(org_graph_def_path); 
std::string target_graph = "Const->Add;Add->_Retval;PartitionedCall->Add:1"; - EXPECT_EQ(DoRunFrozenVariablePassTest(true), target_graph); + EXPECT_EQ(DoRunFrozenVariablePassTest(true, "Host"), target_graph); } TEST_F(FrozenVariablePassTest, frozen_no_variable_true) { @@ -151,7 +164,7 @@ TEST_F(FrozenVariablePassTest, frozen_no_variable_true) { "Less->LoopCond;Merge->Switch;LoopCond->Switch:1;Switch->Exit;Exit->_Retval;" "Switch:1->Identity;Identity:control->Const:control;Const->Add;Identity->Add:1;" "Add->NextIteration;NextIteration->Merge:1"; - EXPECT_EQ(DoRunFrozenVariablePassTest(true), target_graph); + EXPECT_EQ(DoRunFrozenVariablePassTest(true, "Host"), target_graph); } } // end namespace } \ No newline at end of file diff --git a/tf_adapter/tests/ut/util/testcase/npu_attrs_test.cc b/tf_adapter/tests/ut/util/testcase/npu_attrs_test.cc index dc796242b..c3d2c71ab 100644 --- a/tf_adapter/tests/ut/util/testcase/npu_attrs_test.cc +++ b/tf_adapter/tests/ut/util/testcase/npu_attrs_test.cc @@ -11,6 +11,7 @@ namespace tensorflow { Status CheckOpImplMode(const string &op_select_implmode); +Status CheckVariablePlacement(const std::string &variable_placement); namespace { class NpuAttrTest : public testing::Test { protected: @@ -41,10 +42,10 @@ TEST_F(NpuAttrTest, GetEnvDeviceIdNotIntFailTest) { EXPECT_EQ(s.ok(), false); } TEST_F(NpuAttrTest, GetEnvAscendDeviceIdNotIntFailTest) { -uint32_t device_id = 0; -setenv("ASCEND_DEVICE_ID", "1.1", true); -Status s = GetEnvDeviceID(device_id); -EXPECT_EQ(s.ok(), false); + uint32_t device_id = 0; + setenv("ASCEND_DEVICE_ID", "1.1", true); + Status s = GetEnvDeviceID(device_id); + EXPECT_EQ(s.ok(), false); } TEST_F(NpuAttrTest, GetEnvDeviceIdEmptyTest) { uint32_t device_id = 0; @@ -101,6 +102,11 @@ TEST_F(NpuAttrTest, CheckPrecisionMode ) { EXPECT_EQ(s.ok(), false); } +TEST_F(NpuAttrTest, CheckVariablePlacement) { + Status s = CheckVariablePlacement("sss"); + EXPECT_EQ(s.ok(), false); +} + TEST_F(NpuAttrTest, GetDumpPath) { setenv("DUMP_GRAPH_PATH", "./", 1); string path = GetDumpPath(); diff --git a/tf_adapter/util/npu_attrs.cc b/tf_adapter/util/npu_attrs.cc index a071d8091..689a406b8 100644 --- a/tf_adapter/util/npu_attrs.cc +++ b/tf_adapter/util/npu_attrs.cc @@ -247,7 +247,6 @@ inline Status CheckPath(const std::string &input, std::string &output) { Status CheckOpImplMode(const std::string &op_select_implmode) { std::set op_impl_mode_list = {"high_precision", "high_performance", "high_precision_for_all", "high_performance_for_all"}; - if (op_impl_mode_list.find(op_select_implmode) != op_impl_mode_list.end()) { return Status::OK(); } else { @@ -256,6 +255,15 @@ Status CheckOpImplMode(const std::string &op_select_implmode) { } } +Status CheckVariablePlacement(const std::string &variable_placement) { + std::set variable_placement_list = {"Host", "Device"}; + if (variable_placement_list.find(variable_placement) != variable_placement_list.end()) { + return Status::OK(); + } else { + return errors::InvalidArgument("variable placement should be one of the list:[Host, Device]"); + } +} + inline Status CheckAoeMode(const std::string &aoe_mode) { std::set aoe_mode_list = {"1", "2", "4", "mdat"}; @@ -766,6 +774,11 @@ std::map NpuAttrs::GetPassOptions(const GraphOptimizat } if (params.count("variable_placement") > 0) { variable_location = params.at("variable_placement").s(); + Status s = CheckVariablePlacement(variable_location); + if (!s.ok()) { + ADP_LOG(ERROR) << s.error_message(); + LOG(FATAL) << s.error_message(); + } } } } @@ -821,8 +834,8 @@ std::map 
NpuAttrs::GetPassOptions(const OpKernelConstr std::string in_out_pair_flag = "1"; std::string in_out_pair; std::string npuOptimizer; - std::string frozen_variable = "0"; std::string variable_location = "Device"; + std::string frozen_variable = "0"; if (ctx != nullptr && ctx->GetAttr("_NpuOptimizer", &npuOptimizer) == Status::OK()) { do_npu_optimizer = "1"; @@ -2195,7 +2208,7 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options pass_options["local_device_list"] = local_device_list; pass_options["in_out_pair_flag"] = std::to_string(static_cast(in_out_pair_flag)); pass_options["in_out_pair"] = in_out_pair; - pass_options["frozen_variable"] = frozen_variable; + pass_options["frozen_variable"] = std::to_string(static_cast(frozen_variable)); pass_options["variable_location"] = variable_location; if (!node) { -- Gitee From 975606f6971f1a01a78acd24aa52a270ecb5414b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=A2=81=E6=98=8A?= Date: Tue, 21 Mar 2023 08:54:43 +0000 Subject: [PATCH 06/22] =?UTF-8?q?!2120=20misra=20=20zhaolupeng=20c29=20Mer?= =?UTF-8?q?ge=20pull=20request=20!2120=20from=20=E6=A2=81=E6=98=8A/cherry-?= =?UTF-8?q?pick-1679022854?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../core/optimizers/meta/npu_hcom_tailing_optimizer.cpp | 2 +- .../meta/npu_weight_update_grouping_optimizer.cpp | 4 ++-- .../npu_device/core/optimizers/runtime/node_placer.cpp | 6 +++--- .../core/optimizers/runtime/npu_build_npu_op_optimizer.cpp | 4 ++-- .../runtime/npu_trans_resource_input_to_node_optimizer.cpp | 3 ++- 5 files changed, 10 insertions(+), 9 deletions(-) diff --git a/tf_adapter_2.x/npu_device/core/optimizers/meta/npu_hcom_tailing_optimizer.cpp b/tf_adapter_2.x/npu_device/core/optimizers/meta/npu_hcom_tailing_optimizer.cpp index afee34778..50634d802 100644 --- a/tf_adapter_2.x/npu_device/core/optimizers/meta/npu_hcom_tailing_optimizer.cpp +++ b/tf_adapter_2.x/npu_device/core/optimizers/meta/npu_hcom_tailing_optimizer.cpp @@ -56,7 +56,7 @@ tensorflow::Status TailingOptimizeInner(tensorflow::FunctionLibraryDefinition *l } } } - if (node->type_string() == kNpuAllocFloatStatusOp && node->attrs().Find(kNpuLossScaleAttr) != nullptr) { + if ((node->type_string() == kNpuAllocFloatStatusOp) && (node->attrs().Find(kNpuLossScaleAttr) != nullptr)) { std::unordered_set edges_to_remove; tensorflow::Node *last_allreduce = nullptr; for (auto in_edge : node->in_edges()) { diff --git a/tf_adapter_2.x/npu_device/core/optimizers/meta/npu_weight_update_grouping_optimizer.cpp b/tf_adapter_2.x/npu_device/core/optimizers/meta/npu_weight_update_grouping_optimizer.cpp index d36a3ca41..a10281e09 100644 --- a/tf_adapter_2.x/npu_device/core/optimizers/meta/npu_weight_update_grouping_optimizer.cpp +++ b/tf_adapter_2.x/npu_device/core/optimizers/meta/npu_weight_update_grouping_optimizer.cpp @@ -57,7 +57,7 @@ tensorflow::Status WeightUpdateGroupingOptimizeInner(tensorflow::FunctionLibrary } } - if (node->type_string() == kHcomBroadcast && node->attrs().Find(kWeightUpdateGroupingAttr) != nullptr) { + if ((node->type_string() == kHcomBroadcast) && (node->attrs().Find(kWeightUpdateGroupingAttr) != nullptr)) { std::unordered_set edges_to_remove; tensorflow::Node *read_var_node = nullptr; for (auto in_edge : node->in_edges()) { @@ -114,7 +114,7 @@ tensorflow::Status WeightUpdateGroupingOptimizeInner(tensorflow::FunctionLibrary (void)graph->AddEdge(var_node, 0, new_read_var_node, 0); (void)graph->AddEdge(new_read_var_node, 0, node, 0); for (auto var_edge : 
var_node->out_edges()) { - if (var_edge->dst() != new_read_var_node && var_edge->dst() != assign_node) { + if ((var_edge->dst() != new_read_var_node) && (var_edge->dst() != assign_node)) { (void)graph->AddControlEdge(assign_node, var_edge->dst()); } } diff --git a/tf_adapter_2.x/npu_device/core/optimizers/runtime/node_placer.cpp b/tf_adapter_2.x/npu_device/core/optimizers/runtime/node_placer.cpp index 92d8c163b..5549135b7 100644 --- a/tf_adapter_2.x/npu_device/core/optimizers/runtime/node_placer.cpp +++ b/tf_adapter_2.x/npu_device/core/optimizers/runtime/node_placer.cpp @@ -316,7 +316,7 @@ tensorflow::Status NodePlacer::PlaceCpuNodeSubgraphs(size_t depth) const { bool NodePlacer::IsClusterMustPlaceOnNpu(const Cluster &cluster) { for (auto node : cluster.nodes) { auto iter = node_placement_.find(node); - if (iter != node_placement_.end() && iter->second == Placement::NPU) { + if ((iter != node_placement_.end()) && (iter->second == Placement::NPU)) { DLOG() << cluster.name << " must place on npu as has determined npu node " << node->name(); return true; } @@ -405,7 +405,7 @@ void NodePlacer::Concrete(tensorflow::Node *src, tensorflow::Node *dst) { DLOG() << "Concrete node " << src->name() << " with " << dst->name() << " to cluster " << target->name; auto iter = concrete_clusters_.find(src); - if (iter != concrete_clusters_.end() && iter->second == target) { + if ((iter != concrete_clusters_.end()) && (iter->second == target)) { DLOG() << "Node " << src->name() << " has already concrete with " << dst->name() << " in cluster " << target->name; return; } @@ -468,7 +468,7 @@ tensorflow::Status NodePlacer::BuildConcreteCluster() { std::queue> q; for (auto &node : cluster->nodes) { auto iter = concrete_clusters_.find(node); - if (iter != concrete_clusters_.end() && iter->second != cluster) { + if ((iter != concrete_clusters_.end()) && (iter->second != cluster)) { q.push(iter->second); } } diff --git a/tf_adapter_2.x/npu_device/core/optimizers/runtime/npu_build_npu_op_optimizer.cpp b/tf_adapter_2.x/npu_device/core/optimizers/runtime/npu_build_npu_op_optimizer.cpp index 5abc96bfe..dd26e7985 100644 --- a/tf_adapter_2.x/npu_device/core/optimizers/runtime/npu_build_npu_op_optimizer.cpp +++ b/tf_adapter_2.x/npu_device/core/optimizers/runtime/npu_build_npu_op_optimizer.cpp @@ -113,7 +113,7 @@ tensorflow::Status TryToBuildShapeForDynDims(const std::mapMutableGraph()), key) || key != nullptr) { + if (IsGraphNeedLoop(*(graph->MutableGraph()), key) || (key != nullptr)) { graph->SetLoopType(NpuConcreteGraph::LoopType::BUILTIN_LOOP); } graph->SetExecutionType(NpuConcreteGraph::ExecutionType::MIX); diff --git a/tf_adapter_2.x/npu_device/core/optimizers/runtime/npu_trans_resource_input_to_node_optimizer.cpp b/tf_adapter_2.x/npu_device/core/optimizers/runtime/npu_trans_resource_input_to_node_optimizer.cpp index 2bd9a99c4..6cb0a2bc5 100644 --- a/tf_adapter_2.x/npu_device/core/optimizers/runtime/npu_trans_resource_input_to_node_optimizer.cpp +++ b/tf_adapter_2.x/npu_device/core/optimizers/runtime/npu_trans_resource_input_to_node_optimizer.cpp @@ -180,7 +180,8 @@ tensorflow::Status TransHasSubgraphNode(TFE_Context *context, tensorflow::Graph functions.emplace_back(const_cast(node->attrs().Find("then_branch"))->mutable_func()); functions.emplace_back(const_cast(node->attrs().Find("else_branch"))->mutable_func()); } else if (node->IsCaseNode()) { - for (auto &f : *const_cast(node->attrs().Find("branches"))->mutable_list()->mutable_func()) { + for (auto &f : + 
*const_cast(node->attrs().Find("branches"))->mutable_list()->mutable_func()) { functions.emplace_back(&f); } } else { -- Gitee From 20922eec6d1d0f13195abbac7231bf891d614397 Mon Sep 17 00:00:00 2001 From: xujiuxu Date: Thu, 23 Mar 2023 06:19:10 +0000 Subject: [PATCH 07/22] =?UTF-8?q?!2148=20=E5=91=8A=E8=AD=A6=E6=B8=85?= =?UTF-8?q?=E7=90=86=20Merge=20pull=20request=20!2148=20from=20xujiuxu/che?= =?UTF-8?q?rry-pick-1679477162?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tf_adapter/optimizers/frozen_variable_pass.cc | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tf_adapter/optimizers/frozen_variable_pass.cc b/tf_adapter/optimizers/frozen_variable_pass.cc index c90832d11..02582727e 100644 --- a/tf_adapter/optimizers/frozen_variable_pass.cc +++ b/tf_adapter/optimizers/frozen_variable_pass.cc @@ -47,11 +47,11 @@ class FrozenVariablePass : public GraphOptimizationPass { ~FrozenVariablePass() override = default; Status Run(const GraphOptimizationPassOptions &options) override; private: - bool IsAllOutputsIdentity(const Node * const node); - bool IsAllOutputsReadOp(const Node * const node); + bool IsAllOutputsIdentity(const Node * const node) const; + bool IsAllOutputsReadOp(const Node * const node) const; bool IsNeedBuildPartitionedCall(const Node * const node); - std::map GetGraphConfigs(const Graph &graph); - void RemoveDeadNodes(Graph* g); + std::map GetGraphConfigs(const Graph &graph) const; + void RemoveDeadNodes(Graph* g) const; Status DoConstantFolding(const GraphOptimizationPassOptions &options, const uint64_t index); }; @@ -61,7 +61,7 @@ struct StableNodeCompartor { DataType EdgeDataType(const tensorflow::Edge &edge) { return edge.src()->output_type(edge.src_output()); } -bool FrozenVariablePass::IsAllOutputsIdentity(const Node * const node) { +bool FrozenVariablePass::IsAllOutputsIdentity(const Node * const node) const { for (auto out : node->out_nodes()) { if (!out->IsIdentity()) { return false; @@ -70,7 +70,7 @@ bool FrozenVariablePass::IsAllOutputsIdentity(const Node * const node) { return true; } -bool FrozenVariablePass::IsAllOutputsReadOp(const Node * const node) { +bool FrozenVariablePass::IsAllOutputsReadOp(const Node * const node) const { for (auto out : node->out_nodes()) { if (out->type_string() != "ReadVariableOp") { return false; @@ -84,7 +84,7 @@ bool FrozenVariablePass::IsNeedBuildPartitionedCall(const Node * const node) { (node->type_string() == "VarHandleOp" && IsAllOutputsReadOp(node)); } -std::map FrozenVariablePass::GetGraphConfigs(const Graph &graph) { +std::map FrozenVariablePass::GetGraphConfigs(const Graph &graph) const { for (Node *n : graph.nodes()) { if ((n != nullptr) && (n->attrs().Find("_NpuOptimizer") != nullptr)) { return NpuAttrs::GetAllAttrOptions(n->attrs()); @@ -93,7 +93,7 @@ std::map FrozenVariablePass::GetGraphConfigs(const Gra return {}; } -void FrozenVariablePass::RemoveDeadNodes(Graph* g) { +void FrozenVariablePass::RemoveDeadNodes(Graph* g) const { std::unordered_set nodes; for (auto n : g->nodes()) { ADP_LOG(DEBUG) << "Remove dead node, node type: " << n->type_string(); @@ -106,7 +106,7 @@ void FrozenVariablePass::RemoveDeadNodes(Graph* g) { Status FrozenVariablePass::DoConstantFolding(const GraphOptimizationPassOptions &options, const uint64_t index) { - ADP_LOG(INFO) << "Before do const folding" << options.session_options->config.DebugString(); + ADP_LOG(INFO) << "Before do const folding " << options.session_options->config.DebugString(); if 
(options.device_set == nullptr) { return errors::Internal("Failed to get device set to run constant folding"); } @@ -139,7 +139,7 @@ Status FrozenVariablePass::DoConstantFolding(const GraphOptimizationPassOptions GraphOptimizer optimizer(opts); optimizer.Optimize(flr, flr->env(), flr->device(), options.graph, graph_optimizer_options); (void)RemoveDeadNodes((options.graph)->get()); - ADP_LOG(INFO) << "After do const folding optimize"; + ADP_LOG(INFO) << "After do const folding optimize."; if (kDumpGraph) { const std::string pbtxt_path = GetDumpPath() + "TF_AfterFrozenVariable_" + std::to_string(index) + ".pbtxt"; tensorflow::GraphDef def; -- Gitee From a83df9cdfeee8370ef329a9839dea1ea40b8c9da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=93=E6=B6=9B?= Date: Thu, 23 Mar 2023 09:26:40 +0000 Subject: [PATCH 08/22] =?UTF-8?q?!2154=20tf2.x=E5=8E=BBranktable=20Merge?= =?UTF-8?q?=20pull=20request=20!2154=20from=20=E9=82=93=E6=B6=9B/cherry-pi?= =?UTF-8?q?ck-1679559289?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../inc/external/ge/ge_api_types.h | 7 +++ .../npu_device/core/npu_wrapper.cpp | 17 ++++-- .../npu_device/distribute/npu_callbacks.py | 4 +- .../python/npu_device/npu_device.py | 52 +++++++++++++++---- tf_adapter_2.x/tests/st/adapter2_st.py | 8 +-- .../tests/stub/include/stub/defines.h | 7 +++ 6 files changed, 77 insertions(+), 18 deletions(-) diff --git a/inc/graphengine/inc/external/ge/ge_api_types.h b/inc/graphengine/inc/external/ge/ge_api_types.h index 8efcef921..e2d720bef 100644 --- a/inc/graphengine/inc/external/ge/ge_api_types.h +++ b/inc/graphengine/inc/external/ge/ge_api_types.h @@ -43,6 +43,13 @@ const char *const OPTION_EXEC_DEPLOY_MODE = "ge.exec.deployMode"; const char *const OPTION_EXEC_RANK_TABLE_FILE = "ge.exec.rankTableFile"; const char *const GE_AICPU_FLAG = "ge.aicpuFlag"; const char *const OPTION_EXEC_EXTERN_PLUGIN_PATH = "ge.soLoadPath"; + +const std::string OPTION_EXEC_CM_CHIEF_IP = "ge.cmChiefIp"; +const std::string OPTION_EXEC_CM_CHIEF_PORT = "ge.cmChiefPort"; +const std::string OPTION_EXEC_CM_CHIEF_DEVICE = "ge.cmChiefWorkerDevice"; +const std::string OPTION_EXEC_CM_WORKER_IP = "ge.cmWorkerIp"; +const std::string OPTION_EXEC_CM_WORKER_SIZE = "ge.cmWorkerSize"; + // Dump flag and para const char *const OPTION_EXEC_ENABLE_DUMP = "ge.exec.enableDump"; const char *const OPTION_EXEC_DUMP_PATH = "ge.exec.dumpPath"; diff --git a/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp index 7ffadcdd7..675ee05e7 100644 --- a/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp +++ b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp @@ -117,7 +117,12 @@ const std::map kConfigurableOptions = { {"graph_parallel_option_path", "ge.graphParallelOptionPath"}, {"enable_graph_parallel", "ge.enableGraphParallel"}, {"atomic_clean_policy", "ge.exec.atomicCleanPolicy"}, - {"static_memory_policy", "ge.exec.staticMemoryPolicy"}}; + {"static_memory_policy", "ge.exec.staticMemoryPolicy"}, + {"_distribute.cm_chief_ip", ge::OPTION_EXEC_CM_CHIEF_IP}, + {"_distribute.cm_chief_port", ge::OPTION_EXEC_CM_CHIEF_PORT}, + {"_distribute.cm_chief_worker_device", ge::OPTION_EXEC_CM_CHIEF_DEVICE}, + {"_distribute.cm_worker_ip", ge::OPTION_EXEC_CM_WORKER_IP}, + {"_distribute.cm_worker_size", ge::OPTION_EXEC_CM_WORKER_SIZE}}; } // namespace #undef PYBIND11_CHECK_PYTHON_VERSION @@ -128,6 +133,13 @@ std::unordered_set npu_specify_ops_cache; constexpr uint32_t kDeviceSatModeLimit = 2U; } namespace npu { +bool 
CheckIsDistribute(std::map &global_options) { + return ((global_options.find(ge::OPTION_EXEC_RANK_TABLE_FILE) != global_options.end() && + global_options.find(ge::OPTION_EXEC_RANK_ID) != global_options.end()) || + (global_options.find(ge::OPTION_EXEC_CM_CHIEF_IP) != global_options.end() && + global_options.find(ge::OPTION_EXEC_CM_CHIEF_PORT) != global_options.end() && + global_options.find(ge::OPTION_EXEC_CM_CHIEF_DEVICE) != global_options.end())); +} void ParseGlobalOptions(int device_index, const std::map &user_options, std::map &global_options) { for (const auto &option : user_options) { @@ -138,8 +150,7 @@ void ParseGlobalOptions(int device_index, const std::map 1: + env_cm_chief_ip = os.getenv("CM_CHIEF_IP") env_rank_table = os.getenv("RANK_TABLE_FILE") - env_worker_id = os.getenv('RANK_ID') - if not env_rank_table: - raise RuntimeError('You must specify a rank table file by set env RANK_TABLE_FILE in distribution mode') - - if not env_worker_id: - raise RuntimeError('You must specify rank id by set env RANK_ID in distribution mode') - - global_kw_options['_distribute.rank_table'] = env_rank_table - global_kw_options['_distribute.rank_id'] = env_worker_id + if env_cm_chief_ip is not None and env_rank_table is not None: + raise RuntimeError('CM_CHIEF_IP and RANK_TABLE_FILE cannot be configured at the same time.') + elif env_cm_chief_ip is not None: + set_cm_chief_worksize_env(global_kw_options, env_cm_chief_ip, workers_num) + elif env_rank_table is not None: + set_rank_table_file_env(global_kw_options, env_rank_table) + else: + raise RuntimeError('CM_CHIEF_IP and RANK_TABLE_FILE are all not be configured.') device_options = {} error_message = _npu_device_backends.Open(context.context()._handle, NPU, device_id, global_kw_options, diff --git a/tf_adapter_2.x/tests/st/adapter2_st.py b/tf_adapter_2.x/tests/st/adapter2_st.py index 1b28a15cb..03663ac70 100644 --- a/tf_adapter_2.x/tests/st/adapter2_st.py +++ b/tf_adapter_2.x/tests/st/adapter2_st.py @@ -35,9 +35,11 @@ npu_device.global_options().experimental.multi_branches_config.dynamic_node_type npu_device.global_options().experimental.multi_branches_config.dynamic_dims = "1;2" npu_device.global_options().aoe_config.work_path = "./" npu_device.global_options().graph_run_mode = 0 -os.environ['RANK_TABLE_FILE'] = "rankTable" -os.environ['RANK_SIZE'] = "2" -os.environ['RANK_ID'] = "1" +os.environ['CM_CHIEF_IP'] = "1" +os.environ['CM_CHIEF_PORT'] = "3" +os.environ['CM_CHIEF_DEVICE'] = "4" +os.environ['CM_WORKER_SIZE'] = "2" +os.environ['CM_WORKER_IP'] = "123" npu = npu_device.open().as_default() npu.workers_num = 2 # mock run in 2P env diff --git a/tf_adapter_2.x/tests/stub/include/stub/defines.h b/tf_adapter_2.x/tests/stub/include/stub/defines.h index 5be2c1243..653de3c59 100644 --- a/tf_adapter_2.x/tests/stub/include/stub/defines.h +++ b/tf_adapter_2.x/tests/stub/include/stub/defines.h @@ -25,6 +25,13 @@ const char *const OPTION_EXEC_DEPLOY_MODE = "ge.exec.deployMode"; const char *const OPTION_EXEC_RANK_TABLE_FILE = "ge.exec.rankTableFile"; const char *const GE_AICPU_FLAG = "ge.aicpuFlag"; const char *const OPTION_EXEC_EXTERN_PLUGIN_PATH = "ge.soLoadPath"; + +const std::string OPTION_EXEC_CM_CHIEF_IP = "ge.cmChiefIp"; +const std::string OPTION_EXEC_CM_CHIEF_PORT = "ge.cmChiefPort"; +const std::string OPTION_EXEC_CM_CHIEF_DEVICE = "ge.cmChiefWorkerDevice"; +const std::string OPTION_EXEC_CM_WORKER_IP = "ge.cmWorkerIp"; +const std::string OPTION_EXEC_CM_WORKER_SIZE = "ge.cmWorkerSize"; + // Dump flag and para const char *const 
OPTION_EXEC_ENABLE_DUMP = "ge.exec.enableDump"; const char *const OPTION_EXEC_DUMP_PATH = "ge.exec.dumpPath"; -- Gitee From 1219621bb8ec26c53f9d505f65855d8b6a89acc2 Mon Sep 17 00:00:00 2001 From: caiguangxing Date: Fri, 24 Mar 2023 09:50:52 +0000 Subject: [PATCH 09/22] =?UTF-8?q?!2147=20=E5=91=8A=E8=AD=A6=E6=B8=85?= =?UTF-8?q?=E7=90=86=E5=90=88=E5=85=A5C29=20Merge=20pull=20request=20!2147?= =?UTF-8?q?=20from=20caiguangxing/cherry-pick-1679473773?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tf_adapter/kernels/geop_npu.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tf_adapter/kernels/geop_npu.cc b/tf_adapter/kernels/geop_npu.cc index fc8538f00..fd5d3a630 100644 --- a/tf_adapter/kernels/geop_npu.cc +++ b/tf_adapter/kernels/geop_npu.cc @@ -844,7 +844,7 @@ void GeOp::ComputeAsync(OpKernelContext *ctx, DoneCallback done) { // convert to ge::graph if (graph_options_.count("input_format") != 0) { - ADP_LOG(INFO) << "graph_options_[\"input_format\"]: " << graph_options_["input_format"]; + ADP_LOG(INFO) << "graph_options_[\"input_format\"] = " << graph_options_["input_format"]; } ge::Graph ge_graph = ge::GraphUtilsEx::CreateGraphFromComputeGraph(compute_graph); if (iteration_per_loop_ > 1) { @@ -955,8 +955,8 @@ void GeOp::ComputeAsync(OpKernelContext *ctx, DoneCallback done) { }; // call ge session runGraphAsync api - ADP_LOG(INFO) << "[GEOP] Call ge session RunGraphAsync, kernel_name:" << geop_name << " ,tf session: " << tf_session_ - << " ,graph id: " << cache_graph_id; + ADP_LOG(INFO) << "[GEOP] Call ge session RunGraphAsync, kernel_name: " << geop_name << ", tf session: " << tf_session_ + << ", graph id: " << cache_graph_id; ge::Status run_graph_status = ge_session_->RunGraphAsync(cache_graph_id, inputs, callback); if (run_graph_status != ge::SUCCESS) { std::this_thread::sleep_for(std::chrono::milliseconds(kFatalSleepTime)); @@ -974,7 +974,7 @@ void GeOp::ComputeAsync(OpKernelContext *ctx, DoneCallback done) { endTime = InferShapeUtil::GetCurrentTimestap(); ADP_LOG(INFO) << "[GEOP] End GeOp::ComputeAsync, kernel_name: " << geop_name << ", ret_status: " << ToString(run_graph_status) << ", tf session : " << tf_session_ - << " ,graph id: " << cache_graph_id << "[" << ((endTime - startTime) / kMicrosToMillis) << " ms]"; + << ", graph id: " << cache_graph_id << "[" << ((endTime - startTime) / kMicrosToMillis) << " ms]"; return; } @@ -1422,7 +1422,7 @@ void GeOp::BuildShapeNodeAndCacheArgNodes(Graph &graph) { if (node->name().find("IteratorGetNext_") != std::string::npos) { if (dynamic_node_type == "0") { dynamic_shape_nodes_.emplace_back(node); - ADP_LOG(INFO) << "push in dynamic shape nodes, node : " << node->name() << ", type : " << node->type_string(); + ADP_LOG(INFO) << "push in dynamic shape nodes, node : " << node->name() << ", type : " << node->type_string(); } } else { if (dynamic_node_type == "1") { -- Gitee From f46b950f9ea48dcfc9be785fab4b06f878209133 Mon Sep 17 00:00:00 2001 From: guopeian Date: Mon, 27 Mar 2023 02:56:15 +0000 Subject: [PATCH 10/22] =?UTF-8?q?!2159=20tensorflow2.x=E6=89=8B=E5=8A=A8?= =?UTF-8?q?=E9=94=80=E6=AF=81context=20Merge=20pull=20request=20!2159=20fr?= =?UTF-8?q?om=20guopeian/context=5Fc29?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tf_adapter_2.x/npu_device/core/npu_global.cpp | 29 ++++++++++++++----- tf_adapter_2.x/npu_device/core/npu_global.h | 6 ++-- .../npu_device/core/npu_wrapper.cpp | 9 ++---- 3 files changed, 27 
insertions(+), 17 deletions(-) diff --git a/tf_adapter_2.x/npu_device/core/npu_global.cpp b/tf_adapter_2.x/npu_device/core/npu_global.cpp index a798941d8..cd365edd7 100644 --- a/tf_adapter_2.x/npu_device/core/npu_global.cpp +++ b/tf_adapter_2.x/npu_device/core/npu_global.cpp @@ -39,25 +39,40 @@ std::unordered_set g_npu_specify_ops; tensorflow::mutex dev_memory_shared_lock; bool dev_memory_released = false; -void RtsCtx::SetGlobalCtx(aclrtContext global_ctx) { - static std::atomic_bool already_set{false}; - if (!already_set.exchange(true)) { - global_ctx_ = global_ctx; - global_ctx_set_ = true; +tensorflow::Status RtsCtx::CreateGlobalCtx(int32_t device_index) { + { + tensorflow::tf_shared_lock read_lock(global_ctx_mutex_); + if (global_ctx_ != nullptr) { + DLOG() << "Global context has been created."; + return tensorflow::Status::OK(); + } } + tensorflow::mutex_lock write_lock(global_ctx_mutex_); + NPU_REQUIRES_ACL_OK("Acl create rts ctx failed", aclrtCreateContext(&global_ctx_, device_index)); + return tensorflow::Status::OK(); } // 存在rtMalloc和rtFree在不同线程操作的情况,也存在同一线程会切换context的场景 // 这里保证全局唯一的ctx,且对device资源操作时都设置这个全局ctx tensorflow::Status RtsCtx::EnsureInitialized() { - if (global_ctx_set_) { + tensorflow::tf_shared_lock read_lock(global_ctx_mutex_); + if (global_ctx_ != nullptr) { NPU_REQUIRES_ACL_OK("Acl set current thread ctx failed", aclrtSetCurrentContext(global_ctx_)); } return tensorflow::Status::OK(); } +tensorflow::Status RtsCtx::DestroyGlobalCtx() { + tensorflow::mutex_lock write_lock(global_ctx_mutex_); + if (global_ctx_ != nullptr) { + NPU_REQUIRES_ACL_OK("Acl Destroy global ctx failed", aclrtDestroyContext(global_ctx_)); + } + global_ctx_ = nullptr; + return tensorflow::Status::OK(); +} + +tensorflow::mutex RtsCtx::global_ctx_mutex_; aclrtContext RtsCtx::global_ctx_{nullptr}; -std::atomic_bool RtsCtx::global_ctx_set_{false}; std::map NpuCtx::npu_ctx_; diff --git a/tf_adapter_2.x/npu_device/core/npu_global.h b/tf_adapter_2.x/npu_device/core/npu_global.h index 194badbc5..1b04e7666 100644 --- a/tf_adapter_2.x/npu_device/core/npu_global.h +++ b/tf_adapter_2.x/npu_device/core/npu_global.h @@ -41,12 +41,12 @@ extern bool dev_memory_released TF_GUARDED_BY(dev_memory_shared_lock); // Rts ctx管理器 class RtsCtx { public: - static void SetGlobalCtx(aclrtContext global_ctx); + static tensorflow::Status CreateGlobalCtx(int32_t device_index); static tensorflow::Status EnsureInitialized(); - + static tensorflow::Status DestroyGlobalCtx(); private: static aclrtContext global_ctx_; - static std::atomic_bool global_ctx_set_; + static tensorflow::mutex global_ctx_mutex_; }; class NpuCtx { diff --git a/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp index 675ee05e7..4a8dd0659 100644 --- a/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp +++ b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp @@ -205,16 +205,10 @@ PYBIND11_MODULE(_npu_device_backends, m) { return status.error_message(); } } - - aclrtContext global_rt_ctx = nullptr; - auto status = [&global_rt_ctx, device_index]() -> tensorflow::Status { - NPU_REQUIRES_ACL_OK("Acl create rts ctx failed", aclrtCreateContext(&global_rt_ctx, device_index)); - return tensorflow::Status::OK(); - }(); + auto status = npu::global::RtsCtx::CreateGlobalCtx(device_index); if (!status.ok()) { return status.error_message(); } - npu::global::RtsCtx::SetGlobalCtx(global_rt_ctx); status = npu::global::RtsCtx::EnsureInitialized(); if (!status.ok()) { return status.error_message(); @@ -262,6 +256,7 @@ 
PYBIND11_MODULE(_npu_device_backends, m) { } (void)npu::NpuAoe::GetInstance().AoeTuningFinalize(); + (void)npu::global::RtsCtx::DestroyGlobalCtx(); } pybind11::gil_scoped_acquire acquire; }); -- Gitee From 013ab4da3bc06c51e43b043cd16679c151eb6f7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=99=8F=E5=90=8D=E9=A6=99?= Date: Tue, 28 Mar 2023 06:51:46 +0000 Subject: [PATCH 11/22] =?UTF-8?q?!2169=20=E4=BC=98=E5=8C=96dtstring?= =?UTF-8?q?=E7=B1=BB=E5=9E=8B=E6=80=A7=E8=83=BD=20Merge=20pull=20request?= =?UTF-8?q?=20!2169=20from=20=E6=99=8F=E5=90=8D=E9=A6=99/r1.12.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tf_adapter/kernels/aicpu/host_queue_dataset_op.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tf_adapter/kernels/aicpu/host_queue_dataset_op.cc b/tf_adapter/kernels/aicpu/host_queue_dataset_op.cc index f661ffdc5..070404811 100644 --- a/tf_adapter/kernels/aicpu/host_queue_dataset_op.cc +++ b/tf_adapter/kernels/aicpu/host_queue_dataset_op.cc @@ -600,11 +600,12 @@ class HostQueueDatasetOp : public DatasetOpKernel { void RecordMbufQueueBytes(const bool is_hold, const uint64_t args_total_bytes) { if (!is_hold) { return; } mbuf_queue_rear_ = (mbuf_queue_rear_ + 1) % kStringTypeDepth; + mbuf_queue_total_bytes_ = mbuf_queue_total_bytes_ - mbuf_queue_bytes_[mbuf_queue_rear_] + args_total_bytes; mbuf_queue_bytes_[mbuf_queue_rear_] = args_total_bytes; } bool IsHoldDataTrans() { - if (!is_hold_type) { return false; } + if (mbuf_queue_total_bytes_ < static_cast(kMaxBytes)) { return false; } size_t mbuf_size; aclError status = acltdtQueryChannelSize(acl_handle_, &mbuf_size); if (status != ACL_SUCCESS) { @@ -1039,7 +1040,8 @@ class HostQueueDatasetOp : public DatasetOpKernel { double elapsed_time = 0; uint64_t total_bytes = 0; } data_thread_perf_stat_[static_cast(ThreadType::BUTT)]; - uint64_t mbuf_queue_bytes_[kStringTypeDepth]; + uint64_t mbuf_queue_bytes_[kStringTypeDepth] = { 0 }; + uint64_t mbuf_queue_total_bytes_ = 0; size_t mbuf_queue_rear_ = 0; }; const std::vector inputs_; -- Gitee From 462218842aa9a52683599f669521eec8b54e584f Mon Sep 17 00:00:00 2001 From: yaolun Date: Wed, 29 Mar 2023 00:55:34 +0000 Subject: [PATCH 12/22] =?UTF-8?q?!2172=20GetNext=E8=B6=85=E6=97=B6?= =?UTF-8?q?=E6=97=A5=E5=BF=97=E4=B8=8A=E6=8A=A5=20Merge=20pull=20request?= =?UTF-8?q?=20!2172=20from=20yaolun/florence=5Fc29?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../depends/ascendcl/src/ascendcl_stub.cc | 8 ++++ .../depends/ascendcl/src/ascendcl_stub.h | 3 ++ .../dataset/host_queue_dats_set_st.cc | 46 +++++++++++++++++++ .../dataset/host_queue_dats_set_ut.cc | 46 +++++++++++++++++++ tf_adapter/util/acl_channel.cc | 5 ++ 5 files changed, 108 insertions(+) diff --git a/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.cc b/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.cc index a53675814..8685e17d7 100644 --- a/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.cc +++ b/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.cc @@ -234,9 +234,17 @@ aclError acltdtAddDataItem(acltdtDataset *dataset, acltdtDataItem *dataItem) { return ACL_SUCCESS; } +bool gAclTdtSendTensorMock = false; +void setAclTdtSendTensorMockStub(const bool isDriverSuccess) { + gAclTdtSendTensorMock = isDriverSuccess; +} + aclError acltdtSendTensor(const acltdtChannelHandle *handle, const acltdtDataset *dataset, int32_t timeout) { + if (gAclTdtSendTensorMock) { + return ACL_ERROR_DRV_FAILURE; + } if (dataset == nullptr || 
handle == nullptr) { return ACL_ERROR_INVALID_PARAM; } diff --git a/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.h b/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.h index bb3f61969..a7df8034b 100644 --- a/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.h +++ b/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.h @@ -36,6 +36,9 @@ void SetTensorDescSize(uint32_t val); extern bool g_loadModelStatus; void SetAclLoadModelFlag(bool load_status); +extern bool gAclTdtSendTensorMock; +void setAclTdtSendTensorMockStub(const bool isSuccess); + struct acltdtDataItem { acltdtDataItem(acltdtTensorType tdtType, const int64_t *dims, size_t dimNum, const std::string &dimsStr, diff --git a/tf_adapter/tests/st/kernels/testcase/dataset/host_queue_dats_set_st.cc b/tf_adapter/tests/st/kernels/testcase/dataset/host_queue_dats_set_st.cc index c501769d1..a7f704120 100644 --- a/tf_adapter/tests/st/kernels/testcase/dataset/host_queue_dats_set_st.cc +++ b/tf_adapter/tests/st/kernels/testcase/dataset/host_queue_dats_set_st.cc @@ -673,6 +673,52 @@ TEST_F(HostQueueDatasetOpTest, isholddatatrans3) { &end_of_sequence)); } +TEST_F(HostQueueDatasetOpTest, senddata_driver_error) { + setAclTdtSendTensorMockStub(true); + NpuAttrs::SetNewDataTransferFlag(true); + int thread_num = 2, cpu_num = 2; + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + + const TestCase &test_case = NormalizeTestCase(); + Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({})); + std::vector inputs_for_tensor_slice_dataset = test_case.input_tensors; + TF_ASSERT_OK(CreateTensorSliceDatasetTensorForQueue(&inputs_for_tensor_slice_dataset, + &tensor_slice_dataset_tensor)); + + gtl::InlinedVector inputs_for_host_queue_dataset( + {TensorValue(&tensor_slice_dataset_tensor), + TensorValue(&tensor_slice_dataset_tensor)}); + + std::unique_ptr host_queue_dataset_kernel; + TF_ASSERT_OK(CreateHostQueueDatasetKernel(test_case.expected_output_dtypes, + test_case.expected_output_shapes, + &host_queue_dataset_kernel, "-1")); + std::unique_ptr host_queue_dataset_context; + TF_ASSERT_OK(CreateHostQueueDatasetContext(host_queue_dataset_kernel.get(), + &inputs_for_host_queue_dataset, + &host_queue_dataset_context)); + DatasetBase *host_queue_dataset; + TF_ASSERT_OK(CreateDataset(host_queue_dataset_kernel.get(), + host_queue_dataset_context.get(), + &host_queue_dataset)); + core::ScopedUnref scoped_unref(host_queue_dataset); + + SerializationContext context(SerializationContext::Params{}); + GraphDefBuilder b; + DatasetBase::DatasetGraphDefBuilder db(&b); + Node *output; + host_queue_dataset->AsGraphDefInternal(&context, &db, &output); + + std::unique_ptr iterator_context; + TF_ASSERT_OK(CreateIteratorContext(host_queue_dataset_context.get(), + &iterator_context)); + std::unique_ptr iterator; + TF_ASSERT_OK(host_queue_dataset->MakeIterator(iterator_context.get(), + "Iterator", &iterator)); + sleep(2); + setAclTdtSendTensorMockStub(false); +} } // namespace } // namespace data } // namespace tensorflow diff --git a/tf_adapter/tests/ut/kernels/testcase/dataset/host_queue_dats_set_ut.cc b/tf_adapter/tests/ut/kernels/testcase/dataset/host_queue_dats_set_ut.cc index 1c92176d8..74600dd63 100644 --- a/tf_adapter/tests/ut/kernels/testcase/dataset/host_queue_dats_set_ut.cc +++ b/tf_adapter/tests/ut/kernels/testcase/dataset/host_queue_dats_set_ut.cc @@ -816,6 +816,52 @@ TEST_F(HostQueueDatasetOpTest, isholddatatrans3) { &end_of_sequence)); } +TEST_F(HostQueueDatasetOpTest, senddata_driver_error) { + 
setAclTdtSendTensorMockStub(true); + NpuAttrs::SetNewDataTransferFlag(true); + int thread_num = 2, cpu_num = 2; + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + + const TestCase &test_case = NormalizeTestCase(); + Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({})); + std::vector inputs_for_tensor_slice_dataset = test_case.input_tensors; + TF_ASSERT_OK(CreateTensorSliceDatasetTensorForQueue(&inputs_for_tensor_slice_dataset, + &tensor_slice_dataset_tensor)); + + gtl::InlinedVector inputs_for_host_queue_dataset( + {TensorValue(&tensor_slice_dataset_tensor), + TensorValue(&tensor_slice_dataset_tensor)}); + + std::unique_ptr host_queue_dataset_kernel; + TF_ASSERT_OK(CreateHostQueueDatasetKernel(test_case.expected_output_dtypes, + test_case.expected_output_shapes, + &host_queue_dataset_kernel, "-1")); + std::unique_ptr host_queue_dataset_context; + TF_ASSERT_OK(CreateHostQueueDatasetContext(host_queue_dataset_kernel.get(), + &inputs_for_host_queue_dataset, + &host_queue_dataset_context)); + DatasetBase *host_queue_dataset; + TF_ASSERT_OK(CreateDataset(host_queue_dataset_kernel.get(), + host_queue_dataset_context.get(), + &host_queue_dataset)); + core::ScopedUnref scoped_unref(host_queue_dataset); + + SerializationContext context(SerializationContext::Params{}); + GraphDefBuilder b; + DatasetBase::DatasetGraphDefBuilder db(&b); + Node *output; + host_queue_dataset->AsGraphDefInternal(&context, &db, &output); + + std::unique_ptr iterator_context; + TF_ASSERT_OK(CreateIteratorContext(host_queue_dataset_context.get(), + &iterator_context)); + std::unique_ptr iterator; + TF_ASSERT_OK(host_queue_dataset->MakeIterator(iterator_context.get(), + "Iterator", &iterator)); + sleep(2); + setAclTdtSendTensorMockStub(false); +} } // namespace } // namespace data } // namespace tensorflow diff --git a/tf_adapter/util/acl_channel.cc b/tf_adapter/util/acl_channel.cc index 193d03517..dd42b9b1b 100644 --- a/tf_adapter/util/acl_channel.cc +++ b/tf_adapter/util/acl_channel.cc @@ -22,6 +22,7 @@ #include "tf_adapter/common/compat_tf1_tf2.h" #include "tf_adapter/util/npu_attrs.h" #include "tf_adapter/util/util.h" +#include "ge/ge_api.h" namespace tensorflow { Status MappingTfDtypeToAcl(const tensorflow::DataType tf_type, aclDataType &acl_type) { const static std::map type_mapping = { @@ -251,6 +252,10 @@ Status SendTensorsByAcl(const acltdtChannelHandle *acl_handle, acltdtTensorType return Status::OK(); } if (acl_status != ACL_ERROR_NONE) { + std::string error_message = ge::GEGetErrorMsg(); + LOG(ERROR) << "Failed to send data by acl, error code : "<< acl_status << std::endl + << "Error Message is " << std::endl + << error_message; return errors::Internal("Acl send data failed, acl status:", acl_status); } return Status::OK(); -- Gitee From cae7047df9667551fc684b4bba91959861385cce Mon Sep 17 00:00:00 2001 From: huanruizhi Date: Thu, 30 Mar 2023 06:08:14 +0000 Subject: [PATCH 13/22] =?UTF-8?q?!2176=20=E9=9D=99=E6=80=81=E6=88=90?= =?UTF-8?q?=E5=91=98=E5=8A=A0=E9=94=81=E5=90=8C=E6=AD=A5=E5=88=B0=E5=88=86?= =?UTF-8?q?=E6=94=AF=20Merge=20pull=20request=20!2176=20from=20huanruizhi/?= =?UTF-8?q?r1.12=5Fdev?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tf_adapter/util/npu_attrs.cc | 182 ++++++++++++++++++----------------- tf_adapter/util/npu_attrs.h | 2 + 2 files changed, 95 insertions(+), 89 deletions(-) diff --git a/tf_adapter/util/npu_attrs.cc b/tf_adapter/util/npu_attrs.cc index 689a406b8..caf83d9aa 100644 
--- a/tf_adapter/util/npu_attrs.cc +++ b/tf_adapter/util/npu_attrs.cc @@ -15,7 +15,6 @@ */ #include "tf_adapter/util/npu_attrs.h" -#include #include #include #include @@ -43,6 +42,7 @@ std::map NpuAttrs::turn_on_tdt_info_; std::map NpuAttrs::use_adp_info_; std::map NpuAttrs::dataset_execute_info_; std::map NpuAttrs::init_options_; +std::mutex NpuAttrs::mutex_; const static int32_t kRuntimeTypeHeterogeneous = 1; bool NpuAttrs::CheckIsNewDataTransfer() { @@ -624,6 +624,7 @@ std::map NpuAttrs::GetInitOptions(const OpKernelConstr (void) ctx->GetAttr("_event_sync_timeout", &event_sync_timeout); } + std::lock_guard lock(mutex_); if (precision_mode.empty()) { init_options_[ge::PRECISION_MODE] = "allow_fp32_to_fp16"; } else { @@ -677,6 +678,7 @@ std::map NpuAttrs::GetInitOptions(const OpKernelConstr } std::map NpuAttrs::GetInitOptions() { + std::lock_guard lock(mutex_); return init_options_; } @@ -1554,6 +1556,11 @@ std::map NpuAttrs::GetDefaultPassOptions() { } Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options, Node *node) { + if (!node) { + ADP_LOG(ERROR) << "node is null."; + LOG(ERROR) << "node is null."; + return errors::Internal("node is null."); + } std::map sess_options; bool hcom_parallel = true; std::string graph_memory_max_size; @@ -1639,9 +1646,9 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options bool external_weight = false; bool frozen_variable = false; std::string variable_location = "Device"; + int64_t op_debug_level = 0; - const RewriterConfig &rewrite_options = - options.session_options->config.graph_options().rewrite_options(); + const RewriterConfig &rewrite_options = options.session_options->config.graph_options().rewrite_options(); for (const auto &custom_optimizer : rewrite_options.custom_optimizers()) { if (custom_optimizer.name() == "NpuOptimizer") { const auto ¶ms = custom_optimizer.parameter_map(); @@ -1736,9 +1743,7 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options } } if (params.count("op_debug_level") > 0) { - int64_t op_debug_level = params.at("op_debug_level").i(); - init_options_["op_debug_level"] = std::to_string(op_debug_level); - init_options_[ge::OP_DEBUG_LEVEL] = std::to_string(op_debug_level); + op_debug_level = params.at("op_debug_level").i(); LOG_DEPRECATED_WITH_REPLACEMENT(op_debug_level, op_debug_config); } if (params.count("enable_scope_fusion_passes") > 0) { @@ -2120,78 +2125,87 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options sess_options["external_weight"] = std::to_string(static_cast(external_weight)); sess_options["ge.externalWeight"] = std::to_string(static_cast(external_weight)); - init_options_["precision_mode"] = precision_mode; - if (precision_mode.empty()) { - init_options_[ge::PRECISION_MODE] = "allow_fp32_to_fp16"; - } else { - init_options_[ge::PRECISION_MODE] = precision_mode; - } - init_options_["profiling_mode"] = std::to_string(static_cast(profiling_mode)); - init_options_[ge::OPTION_EXEC_PROFILING_MODE] = std::to_string(static_cast(profiling_mode)); - init_options_["profiling_options"] = profiling_options; - init_options_[ge::OPTION_EXEC_PROFILING_OPTIONS] = profiling_options; - init_options_["ge.autoTuneMode"] = auto_tune_mode; - init_options_["graph_run_mode"] = std::to_string(graph_run_mode); - init_options_[ge::OPTION_GRAPH_RUN_MODE] = std::to_string(graph_run_mode); - init_options_["enable_scope_fusion_passes"] = enable_scope_fusion_passes; - init_options_[ge::OPTION_EXEC_ENABLE_SCOPE_FUSION_PASSES] 
= enable_scope_fusion_passes; - init_options_["enable_exception_dump"] = std::to_string(enable_exception_dump); - init_options_["ge.exec.enable_exception_dump"] = std::to_string(enable_exception_dump); - init_options_["ge.deterministic"] = std::to_string(deterministic); - init_options_["aoe_mode"] = aoe_mode; - init_options_["ge.jobType"] = aoe_mode; - init_options_["work_path"] = work_path; - init_options_["ge.tuningPath"] = work_path; - init_options_["distribute_config"] = distribute_config; - init_options_["op_compiler_cache_mode"] = op_compiler_cache_mode; - init_options_["ge.op_compiler_cache_mode"] = op_compiler_cache_mode; - init_options_["op_compiler_cache_dir"] = op_compiler_cache_dir; - init_options_["ge.op_compiler_cache_dir"] = op_compiler_cache_dir; - init_options_["debug_dir"] = debug_dir; - init_options_["ge.debugDir"] = debug_dir; - init_options_["device_type"] = device_type; - init_options_["ge.deviceType"] = device_type; - init_options_["soc_config"] = soc_config; - if (!soc_config.empty()) { - init_options_["ge.socVersion"] = soc_config; + { + std::lock_guard lock(mutex_); + init_options_["precision_mode"] = precision_mode; + if (precision_mode.empty()) { + init_options_[ge::PRECISION_MODE] = "allow_fp32_to_fp16"; + } else { + init_options_[ge::PRECISION_MODE] = precision_mode; + } + init_options_["op_debug_level"] = std::to_string(op_debug_level); + init_options_[ge::OP_DEBUG_LEVEL] = std::to_string(op_debug_level); + init_options_["profiling_mode"] = std::to_string(static_cast(profiling_mode)); + init_options_[ge::OPTION_EXEC_PROFILING_MODE] = std::to_string(static_cast(profiling_mode)); + init_options_["profiling_options"] = profiling_options; + init_options_[ge::OPTION_EXEC_PROFILING_OPTIONS] = profiling_options; + init_options_["ge.autoTuneMode"] = auto_tune_mode; + init_options_["graph_run_mode"] = std::to_string(graph_run_mode); + init_options_[ge::OPTION_GRAPH_RUN_MODE] = std::to_string(graph_run_mode); + init_options_["enable_scope_fusion_passes"] = enable_scope_fusion_passes; + init_options_[ge::OPTION_EXEC_ENABLE_SCOPE_FUSION_PASSES] = enable_scope_fusion_passes; + init_options_["enable_exception_dump"] = std::to_string(enable_exception_dump); + init_options_["ge.exec.enable_exception_dump"] = std::to_string(enable_exception_dump); + init_options_["ge.deterministic"] = std::to_string(deterministic); + init_options_["aoe_mode"] = aoe_mode; + init_options_["ge.jobType"] = aoe_mode; + init_options_["work_path"] = work_path; + init_options_["ge.tuningPath"] = work_path; + init_options_["distribute_config"] = distribute_config; + init_options_["op_compiler_cache_mode"] = op_compiler_cache_mode; + init_options_["ge.op_compiler_cache_mode"] = op_compiler_cache_mode; + init_options_["op_compiler_cache_dir"] = op_compiler_cache_dir; + init_options_["ge.op_compiler_cache_dir"] = op_compiler_cache_dir; + init_options_["debug_dir"] = debug_dir; + init_options_["ge.debugDir"] = debug_dir; + init_options_["device_type"] = device_type; + init_options_["ge.deviceType"] = device_type; + init_options_["soc_config"] = soc_config; + if (!soc_config.empty()) { + init_options_["ge.socVersion"] = soc_config; + } + init_options_["op_wait_timeout"] = op_wait_timeout; + init_options_["ge.exec.opWaitTimeout"] = op_wait_timeout; + init_options_["op_execute_timeout"] = op_execute_timeout; + init_options_["ge.exec.opExecuteTimeout"] = op_execute_timeout; + init_options_["customize_dtypes"] = customize_dtypes; + init_options_["ge.customizeDtypes"] = customize_dtypes; + 
init_options_["op_debug_config"] = op_debug_config; + init_options_["ge.exec.opDebugConfig"] = op_debug_config; + init_options_["static_memory_policy"] = static_memory_policy; + // Commercial version has been released, temporarily used + init_options_["GE_USE_STATIC_MEMORY"] = static_memory_policy; + init_options_["ge.exec.staticMemoryPolicy"] = static_memory_policy; + + init_options_["ge.hcomMultiMode"] = std::to_string(hcom_multi_mode); + init_options_[ge::MODIFY_MIXLIST] = modify_mixlist; + init_options_["ge.fusionSwitchFile"] = fusion_switch_file; + init_options_[ge::OP_PRECISION_MODE] = op_precision_mode; + init_options_[ge::OP_SELECT_IMPL_MODE] = op_select_implmode; + init_options_[ge::OPTYPELIST_FOR_IMPLMODE] = optypelist_for_implmode; + init_options_["ge.exec.hcclExecuteTimeOut"] = hccl_timeout; + init_options_["HCCL_algorithm"] = HCCL_algorithm; + init_options_["graph_exec_timeout"] = std::to_string(graph_exec_timeout); + init_options_["ge.exec.graphExecTimeout"] = std::to_string(graph_exec_timeout); + init_options_["logical_device_cluster_deploy_mode"] = logical_device_cluster_deploy_mode; + init_options_["ge.exec.logicalDeviceClusterDeployMode"] = logical_device_cluster_deploy_mode; + init_options_["logical_device_id"] = logical_device_id; + init_options_["ge.exec.logicalDeviceId"] = logical_device_id; + init_options_["model_deploy_mode"] = model_deploy_mode; + init_options_["ge.exec.modelDeployMode"] = model_deploy_mode; + init_options_["model_deploy_devicelist"] = model_deploy_devicelist; + init_options_["ge.exec.modelDeployDevicelist"] = model_deploy_devicelist; + init_options_["dump_data"] = dump_data; + init_options_["ge.exec.dumpData"] = dump_data; + init_options_["aoe_config_file"] = aoe_config_file; + init_options_["ge.aoe_config_file"] = aoe_config_file; + init_options_["stream_sync_timeout"] = std::to_string(stream_sync_timeout); + init_options_["event_sync_timeout"] = std::to_string(event_sync_timeout); + for (const auto &option : init_options_) { + std::string attr_name = std::string("_") + option.first; + node->AddAttr(attr_name, option.second); + } } - init_options_["op_wait_timeout"] = op_wait_timeout; - init_options_["ge.exec.opWaitTimeout"] = op_wait_timeout; - init_options_["op_execute_timeout"] = op_execute_timeout; - init_options_["ge.exec.opExecuteTimeout"] = op_execute_timeout; - init_options_["customize_dtypes"] = customize_dtypes; - init_options_["ge.customizeDtypes"] = customize_dtypes; - init_options_["op_debug_config"] = op_debug_config; - init_options_["ge.exec.opDebugConfig"] = op_debug_config; - init_options_["static_memory_policy"] = static_memory_policy; - // Commercial version has been released, temporarily used - init_options_["GE_USE_STATIC_MEMORY"] = static_memory_policy; - init_options_["ge.exec.staticMemoryPolicy"] = static_memory_policy; - - init_options_["ge.hcomMultiMode"] = std::to_string(hcom_multi_mode); - init_options_[ge::MODIFY_MIXLIST] = modify_mixlist; - init_options_["ge.fusionSwitchFile"] = fusion_switch_file; - init_options_[ge::OP_PRECISION_MODE] = op_precision_mode; - init_options_[ge::OP_SELECT_IMPL_MODE] = op_select_implmode; - init_options_[ge::OPTYPELIST_FOR_IMPLMODE] = optypelist_for_implmode; - init_options_["ge.exec.hcclExecuteTimeOut"] = hccl_timeout; - init_options_["HCCL_algorithm"] = HCCL_algorithm; - init_options_["graph_exec_timeout"] = std::to_string(graph_exec_timeout); - init_options_["ge.exec.graphExecTimeout"] = std::to_string(graph_exec_timeout); - init_options_["logical_device_cluster_deploy_mode"] = 
logical_device_cluster_deploy_mode; - init_options_["ge.exec.logicalDeviceClusterDeployMode"] = logical_device_cluster_deploy_mode; - init_options_["logical_device_id"] = logical_device_id; - init_options_["ge.exec.logicalDeviceId"] = logical_device_id; - init_options_["model_deploy_mode"] = model_deploy_mode; - init_options_["ge.exec.modelDeployMode"] = model_deploy_mode; - init_options_["model_deploy_devicelist"] = model_deploy_devicelist; - init_options_["ge.exec.modelDeployDevicelist"] = model_deploy_devicelist; - init_options_["dump_data"] = dump_data; - init_options_["ge.exec.dumpData"] = dump_data; - init_options_["aoe_config_file"] = aoe_config_file; - init_options_["ge.aoe_config_file"] = aoe_config_file; - init_options_["stream_sync_timeout"] = std::to_string(stream_sync_timeout); - init_options_["event_sync_timeout"] = std::to_string(event_sync_timeout); pass_options["do_npu_optimizer"] = std::to_string(static_cast(do_npu_optimizer)); pass_options["enable_data_pre_proc"] = std::to_string(static_cast(enable_dp)); @@ -2211,22 +2225,12 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options pass_options["frozen_variable"] = std::to_string(static_cast(frozen_variable)); pass_options["variable_location"] = variable_location; - if (!node) { - ADP_LOG(ERROR) << "node is null."; - LOG(ERROR) << "node is null."; - return errors::Internal("node is null."); - } - std::string attr_name; for (const auto &option : sess_options) { - attr_name = std::string("_") + option.first; - node->AddAttr(attr_name, option.second); - } - for (const auto &option : init_options_) { - attr_name = std::string("_") + option.first; + std::string attr_name = std::string("_") + option.first; node->AddAttr(attr_name, option.second); } for (const auto &option : pass_options) { - attr_name = std::string("_") + option.first; + std::string attr_name = std::string("_") + option.first; node->AddAttr(attr_name, option.second); } node->AddAttr("_NpuOptimizer", "NpuOptimizer"); diff --git a/tf_adapter/util/npu_attrs.h b/tf_adapter/util/npu_attrs.h index 7e73c693a..2f0def1ec 100644 --- a/tf_adapter/util/npu_attrs.h +++ b/tf_adapter/util/npu_attrs.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include "ge/ge_api_types.h" #include "tensorflow/core/common_runtime/optimization_registry.h" @@ -93,6 +94,7 @@ class NpuAttrs { static std::map use_adp_info_; static std::map dataset_execute_info_; static std::map init_options_; + static std::mutex mutex_; }; } // namespace tensorflow -- Gitee From 023a9c15bd46ebbbd04dcb6e449e593f5259d7f2 Mon Sep 17 00:00:00 2001 From: xujiuxu Date: Thu, 30 Mar 2023 06:50:19 +0000 Subject: [PATCH 14/22] =?UTF-8?q?!2178=20=E5=91=8A=E8=AD=A6=E6=B8=85?= =?UTF-8?q?=E7=90=86=20Merge=20pull=20request=20!2178=20from=20xujiuxu/che?= =?UTF-8?q?rry-pick-1680148588?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tf_adapter/optimizers/frozen_variable_pass.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tf_adapter/optimizers/frozen_variable_pass.cc b/tf_adapter/optimizers/frozen_variable_pass.cc index 02582727e..951f66ab3 100644 --- a/tf_adapter/optimizers/frozen_variable_pass.cc +++ b/tf_adapter/optimizers/frozen_variable_pass.cc @@ -49,10 +49,10 @@ class FrozenVariablePass : public GraphOptimizationPass { private: bool IsAllOutputsIdentity(const Node * const node) const; bool IsAllOutputsReadOp(const Node * const node) const; - bool IsNeedBuildPartitionedCall(const Node * const node); + 
bool IsNeedBuildPartitionedCall(const Node * const node) const; std::map GetGraphConfigs(const Graph &graph) const; void RemoveDeadNodes(Graph* g) const; - Status DoConstantFolding(const GraphOptimizationPassOptions &options, const uint64_t index); + Status DoConstantFolding(const GraphOptimizationPassOptions &options, const uint64_t index) const; }; struct StableNodeCompartor { @@ -79,7 +79,7 @@ bool FrozenVariablePass::IsAllOutputsReadOp(const Node * const node) const { return true; } -bool FrozenVariablePass::IsNeedBuildPartitionedCall(const Node * const node) { +bool FrozenVariablePass::IsNeedBuildPartitionedCall(const Node * const node) const { return ((node->type_string() == "Variable" || node->type_string() == "VariableV2") && IsAllOutputsIdentity(node)) || (node->type_string() == "VarHandleOp" && IsAllOutputsReadOp(node)); } @@ -105,7 +105,7 @@ void FrozenVariablePass::RemoveDeadNodes(Graph* g) const { } Status FrozenVariablePass::DoConstantFolding(const GraphOptimizationPassOptions &options, - const uint64_t index) { + const uint64_t index) const { ADP_LOG(INFO) << "Before do const folding " << options.session_options->config.DebugString(); if (options.device_set == nullptr) { return errors::Internal("Failed to get device set to run constant folding"); -- Gitee From bd4c42c4f0e17358ba5ebf047f31be6a5281e65a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=B6=9B?= Date: Mon, 3 Apr 2023 06:58:13 +0000 Subject: [PATCH 15/22] =?UTF-8?q?!2186=20update=20owners=20Merge=20pull=20?= =?UTF-8?q?request=20!2186=20from=20=E7=8E=8B=E6=B6=9B/r1.12.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- OWNERS | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/OWNERS b/OWNERS index c122c07b1..850ab2fd0 100644 --- a/OWNERS +++ b/OWNERS @@ -1,14 +1,7 @@ approvers: -- wqtshg -- ji_chen +- startzgf168 +- andylhy - zhangfan_hq -- lipeiyang3699 -- wangtao43 -- changhaixun -- z00332957 -- wangxiaotian22 -- xiexianhu -- xiaozhedeng reviewers: - xchu42 - sheng-nan -- Gitee From 1a9d266f15b0fb0d31c768b38049c0fb11050027 Mon Sep 17 00:00:00 2001 From: yaolun Date: Tue, 4 Apr 2023 09:10:53 +0000 Subject: [PATCH 16/22] =?UTF-8?q?!2191=20[=E8=B4=A8=E9=87=8F=E6=8F=90?= =?UTF-8?q?=E5=8D=87]=E5=AE=89=E5=85=A8=E5=87=BD=E6=95=B0=E6=95=B4?= =?UTF-8?q?=E6=94=B9=20Merge=20pull=20request=20!2191=20from=20yaolun/flor?= =?UTF-8?q?ence=5Fc29?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tf_adapter/util/host_queue.cc | 23 +++++++++++++---- tf_adapter/util/util.cc | 26 +++++++++++++++++--- tf_adapter/util/util.h | 2 ++ tf_adapter_2.x/npu_device/core/npu_hdc.cpp | 18 ++++++++------ tf_adapter_2.x/npu_device/core/npu_utils.cpp | 9 ++++--- tf_adapter_2.x/npu_device/core/npu_utils.h | 2 +- 6 files changed, 61 insertions(+), 19 deletions(-) diff --git a/tf_adapter/util/host_queue.cc b/tf_adapter/util/host_queue.cc index f0fb9b833..7e082fc2a 100644 --- a/tf_adapter/util/host_queue.cc +++ b/tf_adapter/util/host_queue.cc @@ -216,19 +216,32 @@ Status SerializeDataItemInfo(std::vector &items, void *&buff, cons } size_t offset = 0UL; for (size_t i = 0UL; i < cnt; ++i) { - // can not use memcpy_s here, data size may over 2G - // total_size is calculate by item info, could not overflow here - (void)memcpy(ge::ValueToPtr(ge::PtrToValue(data) + offset), &items[i].ctrl_info, sizeof(ItemInfo)); + auto ret = memcpy_s(ge::ValueToPtr(ge::PtrToValue(data) + offset), sizeof(ItemInfo), + &items[i].ctrl_info, 
sizeof(ItemInfo)); + if (ret != EOK) { + (void)rtMbufFree(buff); + return errors::Internal("Copy item info failed, ret=", ret); + } offset += sizeof(ItemInfo); for (size_t j = 0UL; j < items[i].ctrl_info.dim_num; ++j) { - (void)memcpy(ge::ValueToPtr(ge::PtrToValue(data) + offset), &(items[i].dims[j]), sizeof(int64_t)); + ret = memcpy_s(ge::ValueToPtr(ge::PtrToValue(data) + offset), sizeof(int64_t), + &(items[i].dims[j]), sizeof(int64_t)); + if (ret != EOK) { + (void)rtMbufFree(buff); + return errors::Internal("Copy dim info failed, ret=", ret); + } offset += sizeof(int64_t); } if (items[i].ctrl_info.data_len == 0UL) { continue; } - (void)memcpy(ge::ValueToPtr(ge::PtrToValue(data) + offset), items[i].data_ptr, items[i].ctrl_info.data_len); + auto status = LoopCopy(static_cast(ge::ValueToPtr(ge::PtrToValue(data) + offset)), (total_size - offset), + static_cast(items[i].data_ptr), items[i].ctrl_info.data_len); + if (!status.ok()) { + (void)rtMbufFree(buff); + return status; + } offset += items[i].ctrl_info.data_len; } diff --git a/tf_adapter/util/util.cc b/tf_adapter/util/util.cc index 8cb0b1fc3..4fcc26321 100644 --- a/tf_adapter/util/util.cc +++ b/tf_adapter/util/util.cc @@ -46,9 +46,9 @@ Status GetDtStringTensorData(const Tensor &tensor, uint8_t *&data_ptr, uint64_t ge::StringHead *head = ge::PtrToPtr(base_ptr + i * sizeof(ge::StringHead)); head->addr = offset; head->len = tensor.flat()(i).size(); - // can not use memcpy_s here, data size may over 2G - // total_size is calculate by item info, could not overflow here - (void)memcpy(base_ptr + offset, tensor.flat()(i).data(), head->len); + auto status = LoopCopy(ge::PtrToPtr(base_ptr + offset), (buff_size - offset), + const_cast(tensor.flat()(i).data()), head->len); + if (!status.ok()) { return status; } offset += head->len; } data_ptr = buff_list.back().get(); @@ -97,6 +97,26 @@ Status MappingDtStringTensor2AclDataItem(const Tensor &tensor, acltdtDataItem *& return Status::OK(); } +Status LoopCopy(char *dst_ptr, size_t dst_size, char *src_ptr, size_t src_size) { + if (dst_size < src_size) { + return tensorflow::errors::Internal("Loop memory copy failed. dst_size:", dst_size, ", src_size:", src_size); + } + size_t copy_size = 0UL; + size_t org_src_size = src_size; + do { + size_t src_copy_size = (src_size > SECUREC_MEM_MAX_LEN) ? 
SECUREC_MEM_MAX_LEN : src_size; + if (memcpy_s(dst_ptr, src_copy_size, src_ptr, src_copy_size) != EOK) { + return tensorflow::errors::Internal("Loop memory copy failed , dst_size:", src_copy_size, + ", src_size:", src_copy_size); + } + copy_size += src_copy_size; + dst_ptr += src_copy_size; + src_ptr += src_copy_size; + src_size -= src_copy_size; + } while (copy_size < org_src_size); + return tensorflow::Status::OK(); +} + bool IsWithoutNpuScope(const NodeDef &node_def) { if (node_def.attr().count(ATTR_VALUE_SCOPE_NAME) > 0) { return node_def.attr().at(ATTR_VALUE_SCOPE_NAME).b(); } return false; diff --git a/tf_adapter/util/util.h b/tf_adapter/util/util.h index e24910144..0390d1ad1 100644 --- a/tf_adapter/util/util.h +++ b/tf_adapter/util/util.h @@ -33,6 +33,8 @@ Status MappingDTStringTensor2DataItem(const Tensor &tensor, tdt::DataItem &item, Status MappingDtStringTensor2AclDataItem(const Tensor &tensor, acltdtDataItem *&acl_data, std::vector> &buff_list); +Status LoopCopy(char *dst_ptr, size_t dst_size, char *src_ptr, size_t src_size); + bool IsWithoutNpuScope(const NodeDef &node_def); bool IsWithoutNpuScope(const Node *node); bool IsVariableOrResourceVariable(const Node * const node); diff --git a/tf_adapter_2.x/npu_device/core/npu_hdc.cpp b/tf_adapter_2.x/npu_device/core/npu_hdc.cpp index 3aaf5f38b..4c321ecbe 100644 --- a/tf_adapter_2.x/npu_device/core/npu_hdc.cpp +++ b/tf_adapter_2.x/npu_device/core/npu_hdc.cpp @@ -26,7 +26,7 @@ constexpr size_t kParallelMemCopyThreshold = 10 * 1024 * 1024UL; /** * @brief: parallel mem copy */ -tensorflow::Status Copy2ContinuousMem(void *dst_ptr, void *src_ptr, const size_t src_size) { +tensorflow::Status Copy2ContinuousMem(void *dst_ptr, const size_t dst_size, void *src_ptr, const size_t src_size) { if (dst_ptr == nullptr || src_ptr == nullptr) { return tensorflow::errors::Internal("dst_ptr or src_ptr is null before do parallel memory copy."); } @@ -44,19 +44,21 @@ tensorflow::Status Copy2ContinuousMem(void *dst_ptr, void *src_ptr, const size_t size_t block_size = src_size / npu::kDefaultThreadNum; size_t remained_size = src_size % npu::kDefaultThreadNum; std::vector copy_results(npu::kDefaultThreadNum); + size_t dst_remain_size = dst_size; for (size_t i = 0UL; i < npu::kDefaultThreadNum; i++) { if (i == npu::kDefaultThreadNum - 1U) { block_size += remained_size; } auto &ret = copy_results[i]; - std::function closure = [dst_ptr, src_ptr, block_size, &ret, ¶llel_cpy_count]() { - ret = npu::LoopCopy(static_cast(dst_ptr), static_cast(src_ptr), block_size); + std::function closure = [dst_ptr, dst_remain_size, src_ptr, block_size, &ret, ¶llel_cpy_count]() { + ret = npu::LoopCopy(static_cast(dst_ptr), dst_remain_size, static_cast(src_ptr), block_size); ++parallel_cpy_count; }; NPU_REQUIRES_OK(npu::NpuThreadPool::GetInstance().EnqueueTask(closure)); enqueue_count++; dst_ptr = reinterpret_cast(reinterpret_cast(dst_ptr) + block_size); src_ptr = reinterpret_cast(reinterpret_cast(src_ptr) + block_size); + dst_remain_size -= block_size; } while (parallel_cpy_count < enqueue_count) { } @@ -135,13 +137,14 @@ tensorflow::Status HdcChannel::AssembleAclTensor2Tensor(const acltdtDataItem *it tf_shape.AddDim(dim); } tensorflow::Tensor tensor = tensorflow::Tensor(tf_type, tf_shape); - auto tensor_data = tensor.data(); auto tensor_size = tensor.tensor_data().size(); if (tensor_size != acl_data_len) { return tensorflow::errors::Internal("Hdc channel receive size mismatch tensor size acl:", acl_data_len, " vs. 
tensorflow:", tensor_size); } - (void)memcpy(tensor_data, acl_data, tensor_size); + auto status = LoopCopy(static_cast(tensor.data()), tensor_size, + const_cast(acl_data), tensor_size); + if (!status.ok()) { return status; } tensors.emplace_back(std::move(tensor)); } else { return tensorflow::errors::InvalidArgument("Hdc channel receive un-copyable tensorflow data type", @@ -260,7 +263,8 @@ tensorflow::Status HdcChannel::AssembleTensors2AclDataset(acltdtTensorType acl_t for (auto &tensor : tensors) { total_size += tensor.TotalBytes(); } - tensors_buffer_.resize(std::max(tensors_buffer_.size(), total_size)); + size_t dst_size = std::max(tensors_buffer_.size(), total_size); + tensors_buffer_.resize(dst_size); bool npu_alloc = NpuAllocatorUtils::IsNpuAllocator(tensors[0]); @@ -275,7 +279,7 @@ tensorflow::Status HdcChannel::AssembleTensors2AclDataset(acltdtTensorType acl_t if (IsNeedContinuousMem() && !npu_alloc) { size_t src_size = tensor.TotalBytes(); tensor_data = tensors_buffer_.data() + offset; - NPU_REQUIRES_OK(Copy2ContinuousMem(tensor_data, tensor.data(), src_size)); + NPU_REQUIRES_OK(Copy2ContinuousMem(tensor_data, (dst_size - offset), tensor.data(), src_size)); offset += src_size; } acl_data = acltdtCreateDataItem(ACL_TENSOR_DATA_TENSOR, diff --git a/tf_adapter_2.x/npu_device/core/npu_utils.cpp b/tf_adapter_2.x/npu_device/core/npu_utils.cpp index 7b73b7ae3..37834619e 100644 --- a/tf_adapter_2.x/npu_device/core/npu_utils.cpp +++ b/tf_adapter_2.x/npu_device/core/npu_utils.cpp @@ -538,14 +538,17 @@ tensorflow::Status SeparateGraphDef(tensorflow::GraphDef *def, return tensorflow::Status::OK(); } -tensorflow::Status LoopCopy(char *dst_ptr, char *src_ptr, size_t src_size) { +tensorflow::Status LoopCopy(char *dst_ptr, size_t dst_size, char *src_ptr, size_t src_size) { + NPU_REQUIRES((dst_size >= src_size), + tensorflow::errors::Internal("Loop memory copy failed. dst_size:", dst_size, ", src_size:", src_size)); + size_t copy_size = 0UL; size_t org_src_size = src_size; do { size_t src_copy_size = (src_size > SECUREC_MEM_MAX_LEN) ? 
SECUREC_MEM_MAX_LEN : src_size; if (memcpy_s(dst_ptr, src_copy_size, src_ptr, src_copy_size) != EOK) { - return tensorflow::errors::Internal("loop memory copy failed , dst:", dst_ptr, ", dst_size:", src_copy_size, - ", src:", src_ptr, ", src_size:", src_copy_size); + return tensorflow::errors::Internal("loop memory copy failed , dst_size:", src_copy_size, + ", src_size:", src_copy_size); } copy_size += src_copy_size; dst_ptr += src_copy_size; diff --git a/tf_adapter_2.x/npu_device/core/npu_utils.h b/tf_adapter_2.x/npu_device/core/npu_utils.h index 57338d4dc..41945620c 100644 --- a/tf_adapter_2.x/npu_device/core/npu_utils.h +++ b/tf_adapter_2.x/npu_device/core/npu_utils.h @@ -157,7 +157,7 @@ class OptimizeStageGraphDumper { void NpuCustomizedOptimizeGraph(tensorflow::FunctionLibraryRuntime &lib, std::unique_ptr *g); -tensorflow::Status LoopCopy(char *dst_ptr, char *src_ptr, size_t src_size); +tensorflow::Status LoopCopy(char *dst_ptr, size_t dst_size, char *src_ptr, size_t src_size); int64_t CreateChannelCapacity(const npu::TensorPartialShapes &shapes, const npu::TensorDataTypes &types); -- Gitee From f058a9243afd9a9af8fe1472d658f464b7755596 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=B7=E6=AC=A2?= Date: Tue, 4 Apr 2023 12:27:56 +0000 Subject: [PATCH 17/22] =?UTF-8?q?!2194=20=E9=A2=84=E5=A4=84=E7=90=86H2D?= =?UTF-8?q?=E6=95=B0=E6=8D=AE=E5=8F=91=E9=80=81=E5=A4=B1=E8=B4=A5=E6=97=B6?= =?UTF-8?q?=E7=BB=93=E6=9D=9F=E8=AE=AD=E7=BB=83=20Merge=20pull=20request?= =?UTF-8?q?=20!2194=20from=20=E9=9B=B7=E6=AC=A2/r1.12.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- inc/external/acl/acl_base.h | 1 + .../tests/depends/ascendcl/src/ascendcl_stub.cc | 8 ++++---- .../tests/depends/ascendcl/src/ascendcl_stub.h | 2 +- ...queue_dats_set_st.cc => host_queue_dataset_st.cc} | 0 ...queue_dats_set_ut.cc => host_queue_dataset_ut.cc} | 0 tf_adapter/util/acl_channel.cc | 12 ++++++------ 6 files changed, 12 insertions(+), 11 deletions(-) rename tf_adapter/tests/st/kernels/testcase/dataset/{host_queue_dats_set_st.cc => host_queue_dataset_st.cc} (100%) rename tf_adapter/tests/ut/kernels/testcase/dataset/{host_queue_dats_set_ut.cc => host_queue_dataset_ut.cc} (100%) diff --git a/inc/external/acl/acl_base.h b/inc/external/acl/acl_base.h index 8f4da06ec..7a5be38c5 100644 --- a/inc/external/acl/acl_base.h +++ b/inc/external/acl/acl_base.h @@ -126,6 +126,7 @@ static const int ACL_ERROR_RT_FAILURE = 500003; static const int ACL_ERROR_DRV_FAILURE = 500004; static const int ACL_ERROR_PROFILING_FAILURE = 500005; + #define ACL_TENSOR_SHAPE_RANGE_NUM 2 #define ACL_UNKNOWN_RANK 0xFFFFFFFFFFFFFFFE diff --git a/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.cc b/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.cc index 8685e17d7..678259ccb 100644 --- a/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.cc +++ b/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.cc @@ -234,16 +234,16 @@ aclError acltdtAddDataItem(acltdtDataset *dataset, acltdtDataItem *dataItem) { return ACL_SUCCESS; } -bool gAclTdtSendTensorMock = false; +bool g_AclTdtSendTensorMock = false; void setAclTdtSendTensorMockStub(const bool isDriverSuccess) { - gAclTdtSendTensorMock = isDriverSuccess; + g_AclTdtSendTensorMock = isDriverSuccess; } aclError acltdtSendTensor(const acltdtChannelHandle *handle, const acltdtDataset *dataset, int32_t timeout) { - if (gAclTdtSendTensorMock) { - return ACL_ERROR_DRV_FAILURE; + if (g_AclTdtSendTensorMock) { + return ACL_ERROR_RT_QUEUE_FULL; } if (dataset == nullptr || 
handle == nullptr) { return ACL_ERROR_INVALID_PARAM; diff --git a/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.h b/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.h index a7df8034b..16c334f03 100644 --- a/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.h +++ b/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.h @@ -36,7 +36,7 @@ void SetTensorDescSize(uint32_t val); extern bool g_loadModelStatus; void SetAclLoadModelFlag(bool load_status); -extern bool gAclTdtSendTensorMock; +extern bool g_AclTdtSendTensorMock; void setAclTdtSendTensorMockStub(const bool isSuccess); struct acltdtDataItem { diff --git a/tf_adapter/tests/st/kernels/testcase/dataset/host_queue_dats_set_st.cc b/tf_adapter/tests/st/kernels/testcase/dataset/host_queue_dataset_st.cc similarity index 100% rename from tf_adapter/tests/st/kernels/testcase/dataset/host_queue_dats_set_st.cc rename to tf_adapter/tests/st/kernels/testcase/dataset/host_queue_dataset_st.cc diff --git a/tf_adapter/tests/ut/kernels/testcase/dataset/host_queue_dats_set_ut.cc b/tf_adapter/tests/ut/kernels/testcase/dataset/host_queue_dataset_ut.cc similarity index 100% rename from tf_adapter/tests/ut/kernels/testcase/dataset/host_queue_dats_set_ut.cc rename to tf_adapter/tests/ut/kernels/testcase/dataset/host_queue_dataset_ut.cc diff --git a/tf_adapter/util/acl_channel.cc b/tf_adapter/util/acl_channel.cc index dd42b9b1b..57973f90d 100644 --- a/tf_adapter/util/acl_channel.cc +++ b/tf_adapter/util/acl_channel.cc @@ -239,21 +239,21 @@ Status RecvTensorByAcl(const acltdtChannelHandle *acl_handle, std::vector &tensors, bool &is_need_resend) { - std::vector> buff_list; - acltdtDataset *acl_dataset = nullptr; is_need_resend = false; + acltdtDataset *acl_dataset = nullptr; + std::vector> buff_list; TF_RETURN_IF_ERROR(AssembleTensors2AclDataset(acl_type, tensors, &acl_dataset, buff_list)); - const int32_t kTimeOut = 3000; - auto acl_status = acltdtSendTensor(acl_handle, acl_dataset, kTimeOut); + const int32_t kTimeout = 3000; + auto acl_status = acltdtSendTensor(acl_handle, acl_dataset, kTimeout); TF_RETURN_IF_ERROR(DestroyAclDataset(acl_dataset)); if (acl_status == ACL_ERROR_RT_QUEUE_FULL) { is_need_resend = true; - ADP_LOG(INFO) << "Send data ret != 0 , need send data again."; + ADP_LOG(INFO) << "Queue is full , try to send data again."; return Status::OK(); } if (acl_status != ACL_ERROR_NONE) { std::string error_message = ge::GEGetErrorMsg(); - LOG(ERROR) << "Failed to send data by acl, error code : "<< acl_status << std::endl + LOG(FATAL) << "Failed to send data by acl, error code : "<< acl_status << std::endl << "Error Message is " << std::endl << error_message; return errors::Internal("Acl send data failed, acl status:", acl_status); -- Gitee From 8afc5bd1190419154fc01698c4939d5ab19b5108 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BE=AF=E8=B4=BA?= Date: Thu, 6 Apr 2023 08:56:24 +0000 Subject: [PATCH 18/22] =?UTF-8?q?!2195=20cm=5Fworker=5Fsize=E9=80=82?= =?UTF-8?q?=E9=85=8D=20Merge=20pull=20request=20!2195=20from=20=E4=BE=AF?= =?UTF-8?q?=E8=B4=BA/cherry-pick-1680592727?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../python/npu_bridge/estimator/npu/npu_common.py | 2 +- .../python/npu_bridge/estimator/npu/npu_optimizer.py | 12 ++++++------ .../python/npu_bridge/estimator/npu/npu_strategy.py | 3 ++- tf_adapter/python/npu_bridge/estimator/npu/util.py | 5 ++--- tf_adapter/python/npu_bridge/experimental/hccl.py | 3 ++- 5 files changed, 13 insertions(+), 12 deletions(-) diff --git 
a/tf_adapter/python/npu_bridge/estimator/npu/npu_common.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_common.py index e2a5b1e97..cbf621104 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_common.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_common.py @@ -208,7 +208,7 @@ class NPUBasics(object): checkpoint_dir = os.getenv('LOCAL_CHECKPOINT_DIR', "") # cann't get rank_size from env, set to default 1 - rank_size = os.getenv('RANK_SIZE', '1') + rank_size = util_lib.get_ranksize() if rank_size.isdigit() is False: print("set rank_size to default 1") rank_size = 1 diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_optimizer.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_optimizer.py index 36fb6485a..7448bcdfa 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_optimizer.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_optimizer.py @@ -303,7 +303,7 @@ class NPUDistributedOptimizer(tf.train.Optimizer): """ logging.debug("compute_gradients...") gradients = self._optimizer.compute_gradients(*args, **kwargs) - rank_size = os.getenv('RANK_SIZE') + rank_size = util.get_ranksize() if rank_size is None or int(rank_size) <= 1: return gradients @@ -322,7 +322,7 @@ class NPUDistributedOptimizer(tf.train.Optimizer): def apply_gradients(self, grads_and_vars, global_step=None, name=None): """Apply gradients on variables""" - rank_size = os.getenv('RANK_SIZE') + rank_size = util.get_ranksize() if rank_size is None or int(rank_size) <= 1: return self._optimizer.apply_gradients(grads_and_vars, global_step, name) @@ -405,7 +405,7 @@ class KerasDistributeOptimizer(optimizer_v2.OptimizerV2): def new_get_gradient(loss, params): grads = old_get_gradient(loss, params) - rank_size = os.getenv('RANK_SIZE', '1') + rank_size = util.get_ranksize() if rank_size is None or int(rank_size) <= 1: return grads averaged_grads = [] @@ -435,7 +435,7 @@ class KerasDistributeOptimizer(optimizer_v2.OptimizerV2): def _compute_gradients(self, loss, var_list, grad_loss=None): gradients = self._optimizer._compute_gradients(loss, var_list, grad_loss) - rank_size = os.getenv('RANK_SIZE', '1') + rank_size = util.get_ranksize() if rank_size is None or int(rank_size) <= 1: return gradients averaged_grads = [] @@ -453,7 +453,7 @@ def npu_distributed_optimizer_wrapper(optimizer): """ if isinstance(optimizer, str): optimizer = optimizers.get(optimizer) - rank_size = os.getenv('RANK_SIZE') + rank_size = util.get_ranksize() if hasattr(optimizer, "compute_gradients"): org_compute_gradients = optimizer.compute_gradients @@ -515,7 +515,7 @@ def _npu_allreduce(values, reduction="mean", fusion=1, fusion_id=-1, group="hccl reduction = "sum" reduced_values = [] - size = int(os.getenv("RANK_SIZE", "1")) + size = int(util.get_ranksize()) for value in values: if isinstance(value, tf.IndexedSlices): # For IndexedSlices, do two allgathers intead of an allreduce. 
diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_strategy.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_strategy.py index 062a1dec7..25936eba5 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_strategy.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_strategy.py @@ -20,6 +20,7 @@ import os from tensorflow.python.distribute import distribute_lib from tensorflow.python.distribute import one_device_strategy +from npu_bridge.estimator.npu import util as util_lib from hccl.manage.api import get_rank_size from hccl.manage.api import get_rank_id @@ -33,7 +34,7 @@ class NPUExtended(one_device_strategy.OneDeviceExtended): @property def _num_replicas_in_sync(self): - rank_size = os.getenv("RANK_SIZE", "1") + rank_size = util_lib.get_ranksize() return int(rank_size) def _experimental_distribute_dataset(self, dataset): diff --git a/tf_adapter/python/npu_bridge/estimator/npu/util.py b/tf_adapter/python/npu_bridge/estimator/npu/util.py index f48f1dc2f..b59752d7c 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/util.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/util.py @@ -232,10 +232,9 @@ def set_iteration_per_loop(sess, train_op, iterations_per_loop=1): def get_ranksize(): - if os.getenv("CM_WORK_SIZE") is not None and os.getenv("RANK_SIZE") is not None: + if os.getenv("CM_WORKER_SIZE") is not None and os.getenv("RANK_SIZE") is not None: raise ValueError("RANK_SIZE and CM_WORK_SIZE cannot be configured at the same time") - rank_size = os.getenv('RANK_SIZE') if os.getenv( - "RANK_SIZE") is not None else os.getenv('CM_WORK_SIZE', '1') + rank_size = os.getenv('RANK_SIZE') if os.getenv("RANK_SIZE") is not None else os.getenv('CM_WORKER_SIZE', '1') return rank_size diff --git a/tf_adapter/python/npu_bridge/experimental/hccl.py b/tf_adapter/python/npu_bridge/experimental/hccl.py index 9360b26c6..e33a55a11 100644 --- a/tf_adapter/python/npu_bridge/experimental/hccl.py +++ b/tf_adapter/python/npu_bridge/experimental/hccl.py @@ -19,6 +19,7 @@ import os import ctypes +from npu_bridge.estimator.npu import util as util_lib hccl_graph_adp_ctypes = ctypes.CDLL('libhcom_graph_adaptor.so') @@ -38,7 +39,7 @@ def get_actual_rank_size(group="hccl_world_group"): def get_user_rank_size(): - rank_size = int(os.getenv('RANK_SIZE')) + rank_size = int(util_lib.get_ranksize()) return rank_size -- Gitee From 22f0d4b75bac025ef10dd469c98b2bcd0fd273c3 Mon Sep 17 00:00:00 2001 From: yaolun Date: Sat, 8 Apr 2023 11:16:10 +0000 Subject: [PATCH 19/22] =?UTF-8?q?!2202=20HostQueue=E5=8F=91=E6=95=B0?= =?UTF-8?q?=E6=8D=AE=E5=BC=82=E5=B8=B8=E6=97=B6=E7=AD=89=E5=BE=85=E6=97=A5?= =?UTF-8?q?=E5=BF=97=E8=90=BD=E7=9B=98=20Merge=20pull=20request=20!2202=20?= =?UTF-8?q?from=20yaolun/florence=5Fc29?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tf_adapter/kernels/aicpu/host_queue_dataset_op.cc | 6 +++--- tf_adapter/util/acl_channel.cc | 11 ++++++++--- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/tf_adapter/kernels/aicpu/host_queue_dataset_op.cc b/tf_adapter/kernels/aicpu/host_queue_dataset_op.cc index 070404811..14a452d9c 100644 --- a/tf_adapter/kernels/aicpu/host_queue_dataset_op.cc +++ b/tf_adapter/kernels/aicpu/host_queue_dataset_op.cc @@ -628,7 +628,7 @@ class HostQueueDatasetOp : public DatasetOpKernel { Status SendDataByAclQueue(const vector &args, const acltdtTensorType &data_type, const uint64_t args_total_bytes) { Status status = Status::OK(); - bool is_need_resend = false; + bool need_resend = false; while (!finish_send_) { if 
(IsHoldDataTrans()) { @@ -640,9 +640,9 @@ class HostQueueDatasetOp : public DatasetOpKernel { continue; } auto start = std::chrono::steady_clock::now(); - status = SendTensorsByAcl(acl_handle_, data_type, args, is_need_resend); + status = SendTensorsByAcl(acl_handle_, data_type, args, need_resend); if (!status.ok()) { break; } - if (!is_need_resend) { + if (!need_resend) { auto end = std::chrono::steady_clock::now(); auto elapsed_time = std::chrono::duration(end - start).count(); RefreshDataThreadPerf(ThreadType::SEND, elapsed_time, args_total_bytes); diff --git a/tf_adapter/util/acl_channel.cc b/tf_adapter/util/acl_channel.cc index 57973f90d..faa01b8d7 100644 --- a/tf_adapter/util/acl_channel.cc +++ b/tf_adapter/util/acl_channel.cc @@ -24,6 +24,10 @@ #include "tf_adapter/util/util.h" #include "ge/ge_api.h" namespace tensorflow { +namespace { + const uint32_t kWaitingForLogRecord = 1U; +} + Status MappingTfDtypeToAcl(const tensorflow::DataType tf_type, aclDataType &acl_type) { const static std::map type_mapping = { {DT_FLOAT, ACL_FLOAT}, {DT_HALF, ACL_FLOAT16}, {DT_INT8, ACL_INT8}, @@ -238,8 +242,8 @@ Status RecvTensorByAcl(const acltdtChannelHandle *acl_handle, std::vector &tensors, bool &is_need_resend) { - is_need_resend = false; + const std::vector &tensors, bool &need_resend) { + need_resend = false; acltdtDataset *acl_dataset = nullptr; std::vector> buff_list; TF_RETURN_IF_ERROR(AssembleTensors2AclDataset(acl_type, tensors, &acl_dataset, buff_list)); @@ -247,11 +251,12 @@ Status SendTensorsByAcl(const acltdtChannelHandle *acl_handle, acltdtTensorType auto acl_status = acltdtSendTensor(acl_handle, acl_dataset, kTimeout); TF_RETURN_IF_ERROR(DestroyAclDataset(acl_dataset)); if (acl_status == ACL_ERROR_RT_QUEUE_FULL) { - is_need_resend = true; + need_resend = true; ADP_LOG(INFO) << "Queue is full , try to send data again."; return Status::OK(); } if (acl_status != ACL_ERROR_NONE) { + sleep(kWaitingForLogRecord); std::string error_message = ge::GEGetErrorMsg(); LOG(FATAL) << "Failed to send data by acl, error code : "<< acl_status << std::endl << "Error Message is " << std::endl -- Gitee From cf65d2f3370930e6e92995f40fb2fc8c7fd9320b Mon Sep 17 00:00:00 2001 From: yaolun Date: Mon, 10 Apr 2023 01:44:39 +0000 Subject: [PATCH 20/22] =?UTF-8?q?!2204=20=E5=91=8A=E8=AD=A6=E6=B8=85?= =?UTF-8?q?=E7=90=86=20Merge=20pull=20request=20!2204=20from=20yaolun/flor?= =?UTF-8?q?ence=5Fc29?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tf_adapter/util/acl_channel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tf_adapter/util/acl_channel.h b/tf_adapter/util/acl_channel.h index b1a1ac4d4..53b3ab126 100644 --- a/tf_adapter/util/acl_channel.h +++ b/tf_adapter/util/acl_channel.h @@ -42,7 +42,7 @@ Status DestroyAclDataset(acltdtDataset *acl_dataset, bool include_data_item = tr Status RecvTensorByAcl(const acltdtChannelHandle *acl_handle, std::vector &tensors); Status SendTensorsByAcl(const acltdtChannelHandle *acl_handle, acltdtTensorType acl_type, - const std::vector &tensors, bool &is_need_resend); + const std::vector &tensors, bool &need_resend); Status StopRecvTensorByAcl(acltdtChannelHandle **handle, const std::string &channel_name); -- Gitee From 1b11ff2a2b10d2d79f59270acdecafe4329210b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=B7=E6=AC=A2?= Date: Thu, 20 Apr 2023 00:47:12 +0000 Subject: [PATCH 21/22] =?UTF-8?q?!2224=20=E5=90=8C=E6=AD=A5DTS202304130460?= =?UTF-8?q?7=E4=BF=AE=E6=94=B9=E5=88=B0C29=E5=88=86=E6=94=AF=20Merge=20pul?= 
=?UTF-8?q?l=20request=20!2224=20from=20=E9=9B=B7=E6=AC=A2/r1.12.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tf_adapter/kernels/geop_npu.cc | 50 +++++++++++++++---- tf_adapter/kernels/geop_npu.h | 5 +- .../depends/ascendcl/src/ascendcl_stub.cc | 2 + 3 files changed, 47 insertions(+), 10 deletions(-) diff --git a/tf_adapter/kernels/geop_npu.cc b/tf_adapter/kernels/geop_npu.cc index fd5d3a630..19aa69124 100644 --- a/tf_adapter/kernels/geop_npu.cc +++ b/tf_adapter/kernels/geop_npu.cc @@ -291,7 +291,7 @@ GeOp::GeOp(OpKernelConstruction *ctx) compute_graph_empty_(false), is_input_convert_(false), data_format_(""), graph_id_(0), is_initialized_graph_(false), need_iteration_(false), tf_session_(""), ge_session_(nullptr), job_type_(""), is_host_graph_(false), handle_(nullptr), need_compile_graph_first_(false), tuned_flag_(ATOMIC_FLAG_INIT), - jit_compile_(false), is_getnext_dynamic_shape_(false), session_id_(0), aoe_initialize_(nullptr), + jit_compile_(""), is_dynamic_input_(false), session_id_(0), aoe_initialize_(nullptr), aoe_finalize_(nullptr), aoe_create_session_(nullptr), aoe_destroy_session_(nullptr), aoe_set_gesession_(nullptr), aoe_set_dependgraphs_(nullptr), aoe_set_tuninggraph_(nullptr), aoe_tuning_graph_(nullptr), aoe_set_depend_graphs_inputs_(nullptr), aoe_set_tuning_graph_input_(nullptr) { @@ -329,7 +329,7 @@ void GeOp::Initialize(OpKernelConstruction *ctx) { ctx->GetAttr("_dynamic_input", &dynamic_input_); if (!dynamic_input_.empty() && dynamic_input_ == "1") { jit_compile_ = true; - is_getnext_dynamic_shape_ = true; + is_dynamic_input_ = true; OP_REQUIRES_OK(ctx, ctx->GetAttr("_dynamic_graph_execute_mode", &dynamic_graph_execute_mode_)); ctx->GetAttr("_getnext_inputs_shape_range", &getnext_inputs_shape_range_); ctx->GetAttr("_data_inputs_shape_range", &data_inputs_shape_range_); @@ -341,7 +341,7 @@ void GeOp::Initialize(OpKernelConstruction *ctx) { ADP_LOG(INFO) << "[GEOP] dynamic_input: " << dynamic_input_ << ", dynamic_graph_execute_mode: " << dynamic_graph_execute_mode_ << ", jit_compile: " << jit_compile_ - << ", is_getnext_dynamic_shape_: " << is_getnext_dynamic_shape_ + << ", is_dynamic_input: " << is_dynamic_input_ << ", getnext_inputs_shape_range: " << getnext_inputs_shape_range_ << ", data_inputs_shape_range: " << data_inputs_shape_range_ << ", is_train_graph: " << is_train_graph_ << ", is_dynamic_getnext: " << is_dynamic_getnext_ << ", placeholder_index: " << placeholder_index_ @@ -1106,6 +1106,18 @@ void GeOp::BuildQueueDataAndGetNextFromQueue(Graph &graph, const Node &getnext_n get_next_node_def.mutable_attr()->insert({"op_def", get_next_attr}); } +bool GeOp::IsDynamicGetNext(const Node *node) { + if (is_dynamic_input_) { + return true; + } + auto it = is_getnext_dynamic_shape_.find(node->name()); + if (it == is_getnext_dynamic_shape_.end()) { + return false; + } else { + return it->second; + } +} + void GeOp::HandleDpOpAndGetNextNodes(Graph &graph) { std::vector remove_nodes; for (Node *node : graph.nodes()) { @@ -1143,14 +1155,14 @@ void GeOp::HandleDpOpAndGetNextNodes(Graph &graph) { remove_nodes.push_back(iterator_node); } } else if (NpuAttrs::IsDatasetExecuteInDevice(tf_session_ + iterator_name)) { - if (is_getnext_dynamic_shape_) { + if (IsDynamicGetNext(node)) { node_def.set_op("DynamicGetNext"); } } else { Node *aicpu_getnext = nullptr; std::string aicpu_getnext_name = "aicpu_getnext_" + node->name(); auto getnext_attrs = node->def().attr(); - std::string aicpu_getnext_type = is_getnext_dynamic_shape_ ? 
"DynamicGetNextV2" : "GetNext"; + std::string aicpu_getnext_type = IsDynamicGetNext(node) ? "DynamicGetNextV2" : "GetNext"; TF_CHECK_OK(NodeBuilder(aicpu_getnext_name, aicpu_getnext_type) .Device(node->def().device()) .Attr("channel_name", channel_name) @@ -1219,18 +1231,38 @@ Status GeOp::ProcessForDiffNodeTypes(Graph &graph, bool &is_initialize, bool &is } void GeOp::ProcessGetNextNode(const Node *node) { + bool is_dynamic_shape = false; + const char *kTypeAttrName = "output_types"; + const char *kShapeAttrName = "output_shapes"; + std::vector type_attrs; std::vector shape_attrs; - const char *kAttrName = "output_shapes"; - if (tensorflow::TryGetNodeAttr(node->attrs(), kAttrName, &shape_attrs)) { + if (tensorflow::TryGetNodeAttr(node->attrs(), kShapeAttrName, &shape_attrs)) { for (auto i = 0; i < node->num_outputs(); i++) { const TensorShapeProto &shape_proto = *shape_attrs[i]; tensorflow::PartialTensorShape shape(shape_proto); if (!shape.IsFullyDefined()) { - is_getnext_dynamic_shape_ = true; - ADP_LOG(INFO) << "[GEOP]node: " + node->name() + " is_getnext_dynamic_shape_ come true."; + jit_compile_ = "0"; + is_dynamic_shape = true; + ADP_LOG(INFO) << "[GEOP]node: " + node->name() + " is_dynamic_shape come true."; } } } + if (is_dynamic_shape == false && + tensorflow::TryGetNodeAttr(node->attrs(), kTypeAttrName, &type_attrs)) { + for (auto i = 0; i < node->num_outputs(); i++) { + if (DT_STRING == type_attrs[i]) { + jit_compile_ = "0"; + is_dynamic_shape = true; + ADP_LOG(INFO) << "[GEOP]node: " + node->name() + "'s output_types include DT_STRING."; + } + } + } + auto it = is_getnext_dynamic_shape_.find(node->name()); + if (it == is_getnext_dynamic_shape_.end()) { + (void)is_getnext_dynamic_shape_.insert(std::make_pair(node->name(), is_dynamic_shape)); + } else { + ADP_LOG(WARNING) << "[GEOP]node: " + node->name() + " has is_dynamic_shape[" << it->second << "]."; + } } void GeOp::UpdateInputsShapeDesc(Graph &graph) { diff --git a/tf_adapter/kernels/geop_npu.h b/tf_adapter/kernels/geop_npu.h index f2b970c1b..c714b2e45 100644 --- a/tf_adapter/kernels/geop_npu.h +++ b/tf_adapter/kernels/geop_npu.h @@ -126,6 +126,8 @@ private: void HandleDpOpAndGetNextNodes(Graph &graph); + bool IsDynamicGetNext(const Node *node); + void ChangeChannelNameAttr(NodeDef &node_def) const; bool IsDynamicConfig(); @@ -193,7 +195,8 @@ private: std::string recompute_mode_; std::vector> input_shapes_vec_; bool jit_compile_; - bool is_getnext_dynamic_shape_; + bool is_dynamic_input_; + std::map is_getnext_dynamic_shape_; SessionId session_id_; AoeInitializeFunc aoe_initialize_; AoeFinalizeFunc aoe_finalize_; diff --git a/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.cc b/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.cc index 678259ccb..63278135f 100644 --- a/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.cc +++ b/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.cc @@ -243,6 +243,8 @@ aclError acltdtSendTensor(const acltdtChannelHandle *handle, const acltdtDataset *dataset, int32_t timeout) { if (g_AclTdtSendTensorMock) { + // 这里保证ACL_ERROR_RT_QUEUE_FULL只返回一次,否则会导致日志持续刷屏 + g_AclTdtSendTensorMock = false; return ACL_ERROR_RT_QUEUE_FULL; } if (dataset == nullptr || handle == nullptr) { -- Gitee From 4e109f00f31ccb90d71e3a60c877fc0bf2c76a24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=B7=E6=AC=A2?= Date: Fri, 21 Apr 2023 15:48:07 +0800 Subject: [PATCH 22/22] =?UTF-8?q?=E5=9B=9E=E9=80=80=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
tf_adapter/kernels/geop_npu.cc | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/tf_adapter/kernels/geop_npu.cc b/tf_adapter/kernels/geop_npu.cc index 19aa69124..e9e9dba51 100644 --- a/tf_adapter/kernels/geop_npu.cc +++ b/tf_adapter/kernels/geop_npu.cc @@ -1247,16 +1247,6 @@ void GeOp::ProcessGetNextNode(const Node *node) { } } } - if (is_dynamic_shape == false && - tensorflow::TryGetNodeAttr(node->attrs(), kTypeAttrName, &type_attrs)) { - for (auto i = 0; i < node->num_outputs(); i++) { - if (DT_STRING == type_attrs[i]) { - jit_compile_ = "0"; - is_dynamic_shape = true; - ADP_LOG(INFO) << "[GEOP]node: " + node->name() + "'s output_types include DT_STRING."; - } - } - } auto it = is_getnext_dynamic_shape_.find(node->name()); if (it == is_getnext_dynamic_shape_.end()) { (void)is_getnext_dynamic_shape_.insert(std::make_pair(node->name(), is_dynamic_shape)); -- Gitee
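
Note on the size-capped copy loop that the secure-function rework (!2191, patch 16/22 above) adds as LoopCopy in tf_adapter/util/util.cc and tf_adapter_2.x/npu_device/core/npu_utils.cpp: memcpy_s from the securec library refuses a single copy larger than SECUREC_MEM_MAX_LEN, and the comments removed by that patch note that payloads can exceed 2G, so the data is copied in bounded chunks after an up-front check that the destination is large enough. The sketch below is a minimal standalone illustration of that pattern only; the ChunkedCopy name, the plain int return value and the <securec.h> include path are assumptions made for the example, not the adapter's actual interface.

#include <cstddef>
#include <securec.h>  // assumed location of memcpy_s, EOK and SECUREC_MEM_MAX_LEN

// Copy src_size bytes into dst, never asking memcpy_s for more than
// SECUREC_MEM_MAX_LEN bytes at a time (mirrors the LoopCopy pattern above).
static int ChunkedCopy(char *dst, size_t dst_size, const char *src, size_t src_size) {
  if (dst_size < src_size) {
    return -1;  // destination buffer too small, refuse to copy
  }
  size_t copied = 0UL;
  while (copied < src_size) {
    const size_t remaining = src_size - copied;
    const size_t chunk = (remaining > SECUREC_MEM_MAX_LEN) ? SECUREC_MEM_MAX_LEN : remaining;
    if (memcpy_s(dst + copied, chunk, src + copied, chunk) != EOK) {
      return -1;  // securec reported a copy failure
    }
    copied += chunk;
  }
  return 0;
}

Passing the chunk length as both destMax and count keeps every memcpy_s call under the securec limit; the single dst_size >= src_size check up front is what makes that per-chunk destMax safe.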