diff --git a/README.md b/README.md
index 09af0eeb0aec65e53a762b18e4676862219513ee..3b3a6306a55fde421afb9328e520afc3eefe45da 100755
--- a/README.md
+++ b/README.md
@@ -50,13 +50,15 @@
 chmod +x build.sh
 ./build.sh
 ```
 
-After the script runs successfully, the tfadapter.tar archive is generated in the output directory.
+After the build completes, the installation package is generated at
+```
+./build/tfadapter/dist/python/dist/npu_bridge-1.15.0-py3-none-any.whl
+```
 
 #### Install the plugin package
-Extract the tfadapter.tar file to obtain npu_bridge-1.15.0-py3-none-any.whl,
-then install the TF_Adapter plugin with pip.
+Install the Ascend Adapter to your preferred location with pip3.
 ```
-pip install npu_bridge-1.15.0-py3-none-any.whl
+pip3 install ./build/tfadapter/dist/python/dist/npu_bridge-1.15.0-py3-none-any.whl --upgrade
 ```
 Note that the installation path must match the search path of the python interpreter you specified at build time.
@@ -72,6 +74,13 @@
 https://gitee.com/ascend/tensorflow/wikis/Home?sort_id=3076366
 
 For Release Notes, see [RELEASE](RELEASE.md).
 
+## FAQ
+#### Running ./build.sh prompts for the swig path
+Install swig with the following command:
+```
+pip3 install swig
+```
+
 ## License
 [Apache License 2.0](LICENSE)
diff --git a/build.sh b/build.sh
index d28e661733f2ce7ae8f0bc4ed64ef578d068752f..02fac729ecf5380ed0d7e21de140b5236c972e61 100755
--- a/build.sh
+++ b/build.sh
@@ -25,7 +25,7 @@ RELEASE_TARGET="tfadapter.tar"
 # print usage message
 usage() {
   echo "Usage:"
-  echo "    bash build.sh [-h] [-j[n]] [-v] [-g] [-u]"
+  echo "    bash build.sh [-h] [-j[n]] [-v] [-g] [-u] [-s] [-c]"
   echo ""
   echo "Options:"
   echo "    -h Print usage"
@@ -34,6 +34,7 @@ usage() {
   echo "    -g GCC compiler prefix, used to specify the compiler toolchain"
   echo "    -u TF_adapter utest"
   echo "    -s TF_adapter stest"
+  echo "    -c TF_adapter ci build"
   echo "to be continued ..."
 }
 
@@ -48,8 +49,9 @@ checkopts() {
   GCC_PREFIX=""
   ENABLE_TFADAPTER_UT="off"
   ENABLE_TFADAPTER_ST="off"
+  ENABLE_CI_BUILD="off"
   # Process the options
-  while getopts 'hj:vusg:' opt
+  while getopts 'hj:vuscg:' opt
   do
     case "${opt}" in
       h) usage
@@ -59,6 +61,7 @@ checkopts() {
       g) GCC_PREFIX=$OPTARG ;;
       u) ENABLE_TFADAPTER_UT="on" ;;
       s) ENABLE_TFADAPTER_ST="on" ;;
+      c) ENABLE_CI_BUILD="on" ;;
      *) logging "Undefined option: ${opt}"
         usage
         exit 1 ;;
@@ -125,7 +128,7 @@ main() {
   ${GCC_PREFIX}g++ -v
   mk_dir "${RELEASE_PATH}"
   build_tfadapter
-  if [[ "X$ENABLE_TFADAPTER_UT" = "Xoff" ]] && [[ "X$ENABLE_TFADAPTER_ST" = "Xoff" ]]; then
+  if [[ "X$ENABLE_TFADAPTER_UT" = "Xoff" ]] && [[ "X$ENABLE_TFADAPTER_ST" = "Xoff" ]] && [[ "X$ENABLE_CI_BUILD" = "Xon" ]]; then
     release_tfadapter
   fi
   if [[ "X$ENABLE_TFADAPTER_UT" = "Xon" ]]; then
diff --git a/tf_adapter/interface_spec/api_npu_config.pyh b/tf_adapter/interface_spec/api_npu_config.pyh
index c89bc3c6bc63a016b3ccfa1f9abc353a98e12088..41ea39a3029ae5d39b78f0502d4db67c3a1c3a65 100644
--- a/tf_adapter/interface_spec/api_npu_config.pyh
+++ b/tf_adapter/interface_spec/api_npu_config.pyh
@@ -12,7 +12,7 @@ class NPURunConfig(run_config_lib.RunConfig):
                  enable_exception_dump=0, op_select_implmode=None, optypelist_for_implmode=None,
                  dynamic_input_config=None, aoe_mode=None, work_path=None, buffer_optimize="l2_optimize",
                  enable_small_channel=0, fusion_switch_file=None, enable_compress_weight=False, compress_weight_conf=None,
-                 op_compiler_cache_mode=None, op_compiler_cache_dir=None, debug_dir=None, hcom_multi_mode=False, dynamic_input=False,
+                 op_compiler_cache_mode=None, op_compiler_cache_dir=None, debug_dir=None, hcom_multi_mode=False, dynamic_input=None,
                  dynamic_graph_execute_mode="dynamic_execute", dynamic_inputs_shape_range=None, train_distribute=None,
                  eval_distribute=None, local_rank_id=None, local_device_list=None, session_device_id=None,
                  distribute_config=None, modify_mixlist=None, op_precision_mode=None, device_type="default_device_type",
diff --git a/tf_adapter/optimizers/om_partition_subgraphs_pass.cc b/tf_adapter/optimizers/om_partition_subgraphs_pass.cc
index 13588e8c9b048a3a7a49f2c3cfcb7ef87f11dd09..ec9919b9a8527dec4850f6d64e3b4f00222060c1 100644
--- a/tf_adapter/optimizers/om_partition_subgraphs_pass.cc
+++ b/tf_adapter/optimizers/om_partition_subgraphs_pass.cc
@@ -2263,7 +2263,9 @@ Status OMPartitionSubgraphsPass::ProcessGraph(std::unique_ptr<Graph> *graph, Fun
     return Status::OK();
   }
   if (mix_compile_mode) {
-    TF_RETURN_IF_ERROR(CopyVarsBetweenGeOp(graph_in));
+    if (pass_options["variable_location"] != "Host") {
+      TF_RETURN_IF_ERROR(CopyVarsBetweenGeOp(graph_in));
+    }
     TF_RETURN_IF_ERROR(CopyConstBetweenGeOp(graph_in));
   }
 
diff --git a/tf_adapter/python/npu_bridge/embedding/embedding_optimizer.py b/tf_adapter/python/npu_bridge/embedding/embedding_optimizer.py
index 0d596fed7eb104f104bd63e076d57531ad4d0bfc..4ae5f2df8e94fd429ee31c3dfdac0bcf254148f1 100644
--- a/tf_adapter/python/npu_bridge/embedding/embedding_optimizer.py
+++ b/tf_adapter/python/npu_bridge/embedding/embedding_optimizer.py
@@ -19,6 +19,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.eager import context
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.core.framework import attr_value_pb2
 from tensorflow.python.training import adam
 from tensorflow.python.training import adagrad
 from tensorflow.python.training import training_ops
@@ -38,6 +39,14 @@ class AdamOptimizer(adam.AdamOptimizer):
     def embedding_dims(self, val):
         self._embedding_dims = val
 
+    @property
+    def max_nums(self):
+        return self._max_nums
+
+    @max_nums.setter
+    def max_nums(self, val):
+        self._max_nums = val
+
     def _prepare(self):
         lr = self._call_if_callable(self._lr)
         epsilon = self._call_if_callable(self._epsilon)
@@ -56,15 +65,18 @@ class AdamOptimizer(adam.AdamOptimizer):
             self._beta2_t_list.append(self._beta2_t)
             beta1_power, beta2_power = self._get_beta_accumulators()
             self.table_idx += 1
-            return gen_npu_cpu_ops.embedding_apply_adam(var.handle, beta1_power, beta2_power,
-                                                        math_ops.cast(self._lr_t, grad.dtype),
-                                                        math_ops.cast(self._beta1_t, grad.dtype),
-                                                        math_ops.cast(self._beta2_t, grad.dtype),
-                                                        math_ops.cast(self._epsilon_t, grad.dtype),
-                                                        grad,
-                                                        indices,
-                                                        ops.convert_to_tensor(_GLOBAL_STEP_VALUE),
-                                                        self._embedding_dims)
+            result = gen_npu_cpu_ops.embedding_apply_adam(var.handle, beta1_power, beta2_power,
+                                                          math_ops.cast(self._lr_t, grad.dtype),
+                                                          math_ops.cast(self._beta1_t, grad.dtype),
+                                                          math_ops.cast(self._beta2_t, grad.dtype),
+                                                          math_ops.cast(self._epsilon_t, grad.dtype),
+                                                          grad,
+                                                          indices,
+                                                          ops.convert_to_tensor(_GLOBAL_STEP_VALUE),
+                                                          self._embedding_dims)
+            result.op._set_attr("_embedding_dim", attr_value_pb2.AttrValue(i=self._embedding_dims))
+            result.op._set_attr("_max_num", attr_value_pb2.AttrValue(i=self._max_nums))
+            return result
         else:
             return self._apply_sparse_shared(grad, var, indices, self._resource_scatter_add)
 
diff --git a/tf_adapter/python/npu_bridge/embedding/embedding_service.py b/tf_adapter/python/npu_bridge/embedding/embedding_service.py
index a661e11c99bb6ca81f36700cc1e5b9c270e2181b..b1214294d970c9f1db4956462545416517ec1f08 100644
--- a/tf_adapter/python/npu_bridge/embedding/embedding_service.py
+++ b/tf_adapter/python/npu_bridge/embedding/embedding_service.py
@@ -66,6 +66,10 @@ class ESWorker:
         self._init_embedding_hash_maps = {}
         self._init_partition_maps = {}
         self._table_to_embedding_dim = {}
+        self._table_to_max_num = {}
+        self._table_to_optimizer = {}
+        self._table_to_initializer = {}
+        self._table_to_slot_var_num = {}
         for each_ps in self._ps_ids_list:
             self._ps_ids.append(each_ps["id"])
         self._train_mode = True
@@ -117,7 +121,9 @@ class ESWorker:
         self._embedding_dim = embedding_dim
         self._max_num = max_batch_size
         self._table_to_embedding_dim[table_id] = embedding_dim
+        self._table_to_max_num[table_id] = max_batch_size
         self._initializer = initializer
+        self._table_to_initializer[table_id] = initializer
         self._table_has_init.append(table_id)
         bucket_size = math.ceil(vocabulary_size / self._ps_num)
         if optimizer is None:
@@ -135,8 +141,11 @@ class ESWorker:
                 raise ValueError("initializer must be random_uniform or truncated_normal.")
             self._optimizer = optimizer
             self._optimizer._embedding_dims = embedding_dim
+            self._optimizer._max_nums = max_batch_size
+            self._table_to_optimizer[table_id] = self._optimizer
             # adam include m and v, 2 slots; adagrad include accumulator, 1 slot
             self.slot_vars_num = 2 if isinstance(self._optimizer, embedding_optimizer.AdamOptimizer) else 1
+            self._table_to_slot_var_num[table_id] = self.slot_vars_num
         if (file_path is None) or (file_name is None) or (not tf.gfile.Exists(os.path.join(file_path, file_name))):
             if initializer is None:
                 raise ValueError("In new embedding training, initializer can not be None.")
@@ -180,18 +189,18 @@ class ESWorker:
                                                                    keys=input_ids,
                                                                    embedding_dim=
                                                                    self._table_to_embedding_dim.get(table_id),
-                                                                   random_alg=self._initializer,
+                                                                   random_alg=self._table_to_initializer.get(table_id),
                                                                    seed=seed1, seed2=seed2,
                                                                    value_total_len=
                                                                    self._table_to_embedding_dim.get(table_id) *
-                                                                   (self.slot_vars_num + 1)
+                                                                   (self._table_to_slot_var_num.get(table_id) + 1)
                                                                    )
         else:
             result = gen_npu_cpu_ops.embedding_table_find(table_id=ops.convert_to_tensor(table_id),
                                                           keys=input_ids,
                                                           embedding_dim=self._table_to_embedding_dim.get(table_id))
-        result.op._set_attr("_embedding_dim", attr_value_pb2.AttrValue(i=self._embedding_dim))
-        result.op._set_attr("_max_num", attr_value_pb2.AttrValue(i=self._max_num))
+        result.op._set_attr("_embedding_dim", attr_value_pb2.AttrValue(i=self._table_to_embedding_dim.get(table_id)))
+        result.op._set_attr("_max_num", attr_value_pb2.AttrValue(i=self._table_to_max_num.get(table_id)))
         result.op._set_attr("_deploy_inject_config",
                             attr_value_pb2.AttrValue(s=tf.compat.as_bytes(self._es_cluster_conf)))
         return result
@@ -222,12 +231,13 @@ class ESWorker:
                 or (len(table_ids) != len(input_ids_list)):
             raise ValueError("The length of params, table_ids, input_ids_list should be equal.")
         embedding_grads = tf.gradients(loss, params)
-        params_grads = []
-        for i in range(len(embedding_grads)):
-            params_grads.append(tf.IndexedSlices(embedding_grads[i], input_ids_list[i], dense_shape=params[i].shape))
+        update_op = []
         with specified_ps_engine_scope():
-            var_refs = [NpuEmbeddingResource(table_id) for table_id in table_ids]
-            update_op = self._optimizer.apply_gradients(list(zip(params_grads, var_refs)))
+            for i in range(len(table_ids)):
+                params_grads = [tf.IndexedSlices(embedding_grads[i], input_ids_list[i], dense_shape=params[i].shape)]
+                var_refs = [NpuEmbeddingResource(table_ids[i])]
+                update_op.append(
+                    self._table_to_optimizer.get(table_ids[i]).apply_gradients(list(zip(params_grads, var_refs))))
             return update_op
 
     # Provide the ability to save trained embedding values
@@ -264,7 +274,8 @@ class ESWorker:
         with specified_ps_engine_scope():
             embedding_dim = self._table_to_embedding_dim.get(table_id)
             return gen_npu_cpu_ops.embedding_table_export(file_path, file_name, ops.convert_to_tensor(-1), table_id,
-                                                          embedding_dim, embedding_dim * (self.slot_vars_num + 1),
+                                                          embedding_dim, embedding_dim *
+                                                          (self._table_to_slot_var_num.get(table_id) + 1),
                                                           False, mode)
 
     def data_parallel_embedding(self, max_vocabulary_size, embedding_dim, multihot_lens, allow_merge=True):
diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py
index 6004fecb7b0090fedc69afca15d3e3463ccd4fdb..63c1188340b7c0283ed2312a44fa721f55dad4a5 100644
--- a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py
+++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py
@@ -78,7 +78,7 @@ class NPURunConfig(run_config_lib.RunConfig):
                  op_compiler_cache_dir=None,
                  debug_dir=None,
                  hcom_multi_mode=False,
-                 dynamic_input=False,
+                 dynamic_input=None,
                  dynamic_graph_execute_mode="dynamic_execute",
                  dynamic_inputs_shape_range=None,
                  train_distribute=None,
diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py
index c3199d8fbe5b92d51223c9e631ab329b7de27353..51191212cab66c9a94dea2a313c5bda02190bdcd 100644
--- a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py
+++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py
@@ -740,7 +740,8 @@ class NPUEstimator(estimator_lib.Estimator):
         if config._debug_dir is not None:
             custom_op.parameter_map["debug_dir"].s = tf.compat.as_bytes(config._debug_dir)
         custom_op.parameter_map["hcom_multi_mode"].b = config._hcom_multi_mode
-        custom_op.parameter_map["dynamic_input"].b = config._dynamic_input
+        if config._dynamic_input is not None:
+            custom_op.parameter_map["dynamic_input"].b = config._dynamic_input
         custom_op.parameter_map["dynamic_graph_execute_mode"].s = tf.compat.as_bytes(config._dynamic_graph_execute_mode)
         if config._dynamic_inputs_shape_range is not None:
             custom_op.parameter_map["dynamic_inputs_shape_range"].s = tf.compat.as_bytes(
diff --git a/tf_adapter/util/util.cc b/tf_adapter/util/util.cc
index f7a1e8c81a99221fcda2005c248bd7b786f60a62..8cb0b1fc33b35d654ef03828514c4db51655d44c 100644
--- a/tf_adapter/util/util.cc
+++ b/tf_adapter/util/util.cc
@@ -115,7 +115,6 @@ bool IsVariableOrResourceVariable(const Node * const node) {
 
 bool IsVariableExecuteOnHost(const Node * const node, const std::string &variable_location) {
   if (variable_location == "Host" && IsVariableOrResourceVariable(node)) {
-    ADP_LOG(INFO) << "Node : " << node->name() << " op name : " << node->type_string() << "is execute on host";
     return true;
   }
   return false;
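
The README.md and build.sh hunks above describe the updated build-and-install flow: a plain `./build.sh` produces the wheel under `./build/tfadapter/dist/python/dist/`, while packaging `tfadapter.tar` into the release path now requires the new `-c` (CI build) switch with `-u`/`-s` left unset. A minimal sketch of that flow as a shell session, assuming a standard checkout of this repository; the `-j8` parallelism value is only illustrative:
```
# Build the adapter; this compiles and produces the npu_bridge wheel.
chmod +x build.sh
./build.sh -j8

# Optional CI-style build: also packages tfadapter.tar, since
# release_tfadapter now runs only when -c is set and -u/-s are not.
./build.sh -j8 -c

# Install the generated wheel with pip3, matching the python
# interpreter used at build time.
pip3 install ./build/tfadapter/dist/python/dist/npu_bridge-1.15.0-py3-none-any.whl --upgrade
```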