From b535766c50472ae8016861c47510ae9813fb79a1 Mon Sep 17 00:00:00 2001 From: lzl Date: Thu, 25 Nov 2021 14:56:58 +0800 Subject: [PATCH] test --- tf_adapter/kernels/geop_npu.cc | 41 +++++++++++++------ tf_adapter/kernels/geop_npu.h | 1 + .../npu_bridge/estimator/npu/npu_config.py | 8 +++- .../npu_bridge/estimator/npu/npu_estimator.py | 4 ++ tf_adapter/util/npu_attrs.cc | 8 +++- 5 files changed, 48 insertions(+), 14 deletions(-) diff --git a/tf_adapter/kernels/geop_npu.cc b/tf_adapter/kernels/geop_npu.cc index 687a1f5b5..694cc7037 100644 --- a/tf_adapter/kernels/geop_npu.cc +++ b/tf_adapter/kernels/geop_npu.cc @@ -66,6 +66,7 @@ #include "graph/compute_graph.h" #include "graph/ge_attr_value.h" #include "graph/model.h" +#include "aoe_types.h" namespace tensorflow { Status FunctionalizeControlFlow(Graph *graph, FunctionLibraryDefinition *library); @@ -278,9 +279,9 @@ void GeOp::Initialize(OpKernelConstruction *ctx) { handle_ = mmDlopen("libaoe_tuning.so", MMPA_RTLD_NOW); OP_REQUIRES(ctx, handle_ != nullptr, errors::InvalidArgument("libaoe_tuning.so dlopen failed, ", mmDlerror())); - aoe_tuning_ = (AoeTuningFunc)mmDlsym(handle_, const_cast("AoeOnlineTuning")); - aoe_init_ = (AoeInitFunc)mmDlsym(handle_, const_cast("AoeOnlineInitialize")); - aoe_finalize_ = (AoeFinalizeFunc)mmDlsym(handle_, const_cast("AoeOnlineFinalize")); + // aoe_tuning_ = (AoeTuningFunc)mmDlsym(handle_, const_cast("AoeOnlineTuning")); + // aoe_init_ = (AoeInitFunc)mmDlsym(handle_, const_cast("AoeOnlineInitialize")); + // aoe_finalize_ = (AoeFinalizeFunc)mmDlsym(handle_, const_cast("AoeOnlineFinalize")); OP_REQUIRES(ctx, aoe_tuning_ != nullptr && aoe_init_ != nullptr && aoe_finalize_ != nullptr, errors::InvalidArgument("dlsym Aoe API failed, ", mmDlerror())); } @@ -320,7 +321,8 @@ void GeOp::Finalize() { if (!GePlugin::GetInstance()->IsGlobal()) { if (!init_options_["ge.jobType"].empty() && !init_options_["ge.tuningPath"].empty() && aoe_finalize_ != nullptr) { - AoeStatus tune_ret = (*aoe_finalize_)(); + // AoeStatus tune_ret = (*aoe_finalize_)(); + AoeStatus final_ret = AoeFinalize(); if (tune_ret != AOE_SUCCESS) { ADP_LOG(ERROR) << "[GEOP] exec aoe finalize func failed."; LOG(ERROR) << "[GEOP] exec aoe finalize func failed."; @@ -501,7 +503,18 @@ void GeOp::ComputeAsync(OpKernelContext *ctx, DoneCallback done) { tune_options_.insert(sess_options_.begin(), sess_options_.end()); tune_options_.insert({"work_path", init_options_["ge.tuningPath"]}); tune_options_.insert({"job_type", init_options_["ge.jobType"]}); - AoeStatus tune_ret = (*aoe_init_)(ge_session_, tune_options_); + // aoe ini + // AoeStatus tune_ret = (*aoe_init_)(ge_session_, tune_options_); + std::map globalOptions; + globalOptions.insert({AscendString("work_path".c_str()), AscendString(init_options_["ge.tuningPath"].c_str())}); + globalOptions.insert({AscendString("devices".c_str()), AscendString(std::to_string(device_id).c_str())}); + AoeStatus ini_ret = AoeInitialize(globalOptions); + // aoe create session + session_id_ = 0; + std::map sessionOptions; + AoeStatus session_create_ret = AoeCreateSession(session_id_, sessionOptions); + // aoe set ge session + AoeStatus ge_session_ret = AoeSetGeSession(session_id_, ge_session_); OP_REQUIRES_ASYNC(ctx, tune_ret == AOE_SUCCESS, errors::Internal("[GEOP] exec aoe init func failed."), done); } ADP_LOG(INFO) << "[GEOP] tf session: " << tf_session_ << " get ge session success."; @@ -699,8 +712,10 @@ void GeOp::ComputeAsync(OpKernelContext *ctx, DoneCallback done) { if (is_tuning) { if (is_train_graph_ != "1" && init_options_["ge.jobType"] != "2" && init_options_["ge.jobType"] != "1") { ADP_LOG(INFO) << "[GEOP] in tune mode, nontraining graphs should be cache."; - OP_REQUIRES_ASYNC(ctx, SessionManager::GetInstance().CacheGeGraphs(ge_session_, ge_graph), - errors::Internal("[GEOP] cache ge session failed."), done); + /**OP_REQUIRES_ASYNC(ctx, SessionManager::GetInstance().CacheGeGraphs(ge_session_, ge_graph), + errors::Internal("[GEOP] cache ge session failed."), done);**/ + std::vector dependGraph; + AoeStatus depend_ret = AoeSetDependGraphs(session_id_, dependGraph); build_flag_ = true; BuildOutTensorInfo(ctx); done(); @@ -708,12 +723,14 @@ void GeOp::ComputeAsync(OpKernelContext *ctx, DoneCallback done) { } else { ADP_LOG(INFO) << "[GEOP] in tune mode, training graph handled by tools."; std::vector ge_graphs; - OP_REQUIRES_ASYNC(ctx, SessionManager::GetInstance().GetGeGraphs(ge_session_, ge_graphs), - errors::Internal("[GEOP] ge ge session nontraining graphs failed."), done); + /**OP_REQUIRES_ASYNC(ctx, SessionManager::GetInstance().GetGeGraphs(ge_session_, ge_graphs), + errors::Internal("[GEOP] ge ge session nontraining graphs failed."), done);**/ tune_options_.insert(graph_options_.begin(), graph_options_.end()); - AoeStatus tune_ret = (*aoe_tuning_)(ge_graph, ge_graphs, ge_session_, tune_options_); - OP_REQUIRES_ASYNC(ctx, (tune_ret == AOE_SUCCESS) || (tune_ret == AOE_ERROR_NO_AICORE_GRAPH), - errors::Internal("[GEOP] exec aoe tuning func failed[", tune_ret, "]."), done); + // AoeStatus tune_ret = (*aoe_tuning_)(ge_graph, ge_graphs, ge_session_, tune_options_); + AoeStatus tune_ret = AoeSetTuningGraphs(session_id_, ge_graph); + AoeStatus aoe_tune_ret = AoeTuningGraphs(session_id_, tuingOptions); + //OP_REQUIRES_ASYNC(ctx, (tune_ret == AOE_SUCCESS) || (tune_ret == AOE_ERROR_NO_AICORE_GRAPH), + // errors::Internal("[GEOP] exec aoe tuning func failed[", tune_ret, "]."), done); ADP_LOG(INFO) << "[GEOP] aoe success[" << tune_ret << "]."; build_flag_ = true; BuildOutTensorInfo(ctx); diff --git a/tf_adapter/kernels/geop_npu.h b/tf_adapter/kernels/geop_npu.h index f6c1d5d75..71a0acf0f 100644 --- a/tf_adapter/kernels/geop_npu.h +++ b/tf_adapter/kernels/geop_npu.h @@ -145,6 +145,7 @@ class GeOp : public AsyncOpKernel { std::map tune_options_; std::string is_dynamic_getnext_; std::string placeholder_index_; + SessionId session_id_; }; } // namespace tensorflow #endif // TENSORFLOW_KERNELS_GEOP_NPU_H_ diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py index 08cf8c869..043fe6ee6 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py @@ -90,7 +90,10 @@ class NPURunConfig(run_config_lib.RunConfig): distribute_config=None, modify_mixlist=None, op_precision_mode=None, - device_type="default_device_type" + device_type="default_device_type", + # test + hccl_timeout=0, + operator_timeout=0, ): """ Constructs a NPUConfig. @@ -216,6 +219,9 @@ class NPURunConfig(run_config_lib.RunConfig): self._modify_mixlist = modify_mixlist self._op_precision_mode = op_precision_mode self._device_type = device_type + # test + self.hccl_timeout=hccl_timeout + self.operator_timeout=operator_timeout super(NPURunConfig, self).__init__( model_dir=model_dir, diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py index 051efe76c..7e3fc1008 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py @@ -668,6 +668,10 @@ class NPUEstimator(estimator_lib.Estimator): custom_op.parameter_map["is_tailing_optimization"].b = config.is_tailing_optimization custom_op.parameter_map["min_group_size"].b = 1 custom_op.parameter_map["hcom_parallel"].b = config._hcom_parallel + # test + custom_op.parameter_map["hccl_timeout"].b = config.hccl_timeout + custom_op.parameter_map["operator_timeout"].b = config.operator_timeout + if config._graph_memory_max_size is not None: custom_op.parameter_map["graph_memory_max_size"].s = tf.compat.as_bytes(str(config._graph_memory_max_size)) if config._variable_memory_max_size is not None: diff --git a/tf_adapter/util/npu_attrs.cc b/tf_adapter/util/npu_attrs.cc index 23b93672b..67b63a636 100644 --- a/tf_adapter/util/npu_attrs.cc +++ b/tf_adapter/util/npu_attrs.cc @@ -583,6 +583,10 @@ std::map NpuAttrs::GetPassOptions(const GraphOptimizat mutable_rewrite_options()->set_remapping(RewriterConfig::OFF); } } + // test + if (params.count("hccl_timeout")) { hccl_timeout = params.at("in_out_pair").b(); } + if (params.count("operator_timeout")) { operator_timeout = params.at("in_out_pair").b(); } + // pass options pass_options["do_npu_optimizer"] = std::to_string(do_npu_optimizer); pass_options["enable_dp"] = std::to_string(enable_dp); @@ -599,7 +603,9 @@ std::map NpuAttrs::GetPassOptions(const GraphOptimizat pass_options["local_device_list"] = local_device_list; pass_options["in_out_pair_flag"] = std::to_string(in_out_pair_flag); pass_options["in_out_pair"] = in_out_pair; - + // test + pass_options["hccl_timeout"] = std::to_string(hccl_timeout); + pass_options["operator_timeout"] = std::to_string(operator_timeout); return pass_options; } -- Gitee