diff --git a/tf_adapter/kernels/geop_npu.cc b/tf_adapter/kernels/geop_npu.cc
index e5c5af658266b12f993e8b4e3e56b36603d63112..5b0c030fd59851863b4f3c8161d2a19e39037be1 100644
--- a/tf_adapter/kernels/geop_npu.cc
+++ b/tf_adapter/kernels/geop_npu.cc
@@ -66,6 +66,7 @@
 #include "graph/compute_graph.h"
 #include "graph/ge_attr_value.h"
 #include "graph/model.h"
+#include "aoe_types.h"
 
 namespace tensorflow {
 Status FunctionalizeControlFlow(Graph *graph, FunctionLibraryDefinition *library);
@@ -216,7 +217,9 @@ GeOp::GeOp(OpKernelConstruction *ctx)
     : is_initialized_graph_(false), need_iteration_(false), tf_session_(""), ge_session_(nullptr),
       job_type_(""), is_host_graph_(false), handle_(nullptr), aoe_tuning_(nullptr),
       need_compile_graph_first_(false), aoe_init_(nullptr), aoe_finalize_(nullptr),
-      tuned_flag_(ATOMIC_FLAG_INIT) {
+      aoe_initialize_(nullptr), aoe_create_session_(nullptr), aoe_set_gesession_(nullptr),
+      aoe_set_dependgraphs_(nullptr), aoe_set_tuninggraph_(nullptr), aoe_tuning_graph_(nullptr),
+      session_id_(0), tuned_flag_(ATOMIC_FLAG_INIT) {
   Initialize(ctx);
 }
@@ -279,9 +282,18 @@ void GeOp::Initialize(OpKernelConstruction *ctx) {
   handle_ = mmDlopen("libaoe_tuning.so", MMPA_RTLD_NOW);
   OP_REQUIRES(ctx, handle_ != nullptr, errors::InvalidArgument("libaoe_tuning.so dlopen failed, ", mmDlerror()));
-  aoe_tuning_ = (AoeTuningFunc)mmDlsym(handle_, const_cast<char *>("AoeOnlineTuning"));
-  aoe_init_ = (AoeInitFunc)mmDlsym(handle_, const_cast<char *>("AoeOnlineInitialize"));
-  aoe_finalize_ = (AoeFinalizeFunc)mmDlsym(handle_, const_cast<char *>("AoeOnlineFinalize"));
+  // aoe_tuning_ = (AoeTuningFunc)mmDlsym(handle_, const_cast<char *>("AoeOnlineTuning"));
+  // aoe_init_ = (AoeInitFunc)mmDlsym(handle_, const_cast<char *>("AoeOnlineInitialize"));
+  // aoe_finalize_ = (AoeFinalizeFunc)mmDlsym(handle_, const_cast<char *>("AoeOnlineFinalize"));
+  // test: switch to the session-based Aoe API
+  aoe_initialize_ = (AoeInitializeFunc)mmDlsym(handle_, const_cast<char *>("AoeInitialize"));
+  aoe_finalize_ = (AoeFinalizeFunc)mmDlsym(handle_, const_cast<char *>("AoeFinalize"));
+  aoe_create_session_ = (AoeCreateSessionFunc)mmDlsym(handle_, const_cast<char *>("AoeCreateSession"));
+  aoe_set_gesession_ = (AoeSetGeSessionFunc)mmDlsym(handle_, const_cast<char *>("AoeSetGeSession"));
+  aoe_set_dependgraphs_ = (AoeSetDependGraphFunc)mmDlsym(handle_, const_cast<char *>("AoeSetDependGraphs"));
+  aoe_set_tuninggraph_ = (AoeSetTuningGraphFunc)mmDlsym(handle_, const_cast<char *>("AoeSetTuningGraph"));
+  aoe_tuning_graph_ = (AoeTuningGraphFunc)mmDlsym(handle_, const_cast<char *>("AoeTuningGraph"));
-  OP_REQUIRES(ctx, aoe_tuning_ != nullptr && aoe_init_ != nullptr && aoe_finalize_ != nullptr,
-              errors::InvalidArgument("dlsym Aoe API failed, ", mmDlerror()));
+  // validate the newly resolved entry points; the old aoe_tuning_/aoe_init_ pointers are no longer set
+  OP_REQUIRES(ctx,
+              aoe_initialize_ != nullptr && aoe_finalize_ != nullptr && aoe_create_session_ != nullptr &&
+                  aoe_set_gesession_ != nullptr && aoe_set_dependgraphs_ != nullptr &&
+                  aoe_set_tuninggraph_ != nullptr && aoe_tuning_graph_ != nullptr,
+              errors::InvalidArgument("dlsym Aoe API failed, ", mmDlerror()));
 }
@@ -504,8 +516,19 @@ void GeOp::ComputeAsync(OpKernelContext *ctx, DoneCallback done) {
       tune_options_.insert(sess_options_.begin(), sess_options_.end());
       tune_options_.insert({"work_path", init_options_["ge.tuningPath"]});
       tune_options_.insert({"job_type", init_options_["ge.jobType"]});
-      AoeStatus tune_ret = (*aoe_init_)(ge_session_, tune_options_);
-      OP_REQUIRES_ASYNC(ctx, tune_ret == AOE_SUCCESS, errors::Internal("[GEOP] exec aoe init func failed."), done);
+      // aoe init
+      // AoeStatus tune_ret = (*aoe_init_)(ge_session_, tune_options_);
+      std::map<ge::AscendString, ge::AscendString> globalOptions;
+      globalOptions.insert({ge::AscendString("work_path"), ge::AscendString(init_options_["ge.tuningPath"].c_str())});
+      globalOptions.insert({ge::AscendString("devices"), ge::AscendString(std::to_string(device_id).c_str())});
+      AoeStatus init_ret = (*aoe_initialize_)(globalOptions);
+      // aoe create session; job_type belongs to the session options, not the already-consumed global ones
+      std::map<ge::AscendString, ge::AscendString> sessionOptions;
+      sessionOptions.insert({ge::AscendString("job_type"), ge::AscendString(init_options_["ge.jobType"].c_str())});
+      AoeStatus session_create_ret = (*aoe_create_session_)(sessionOptions, session_id_);
+      // aoe set ge session
+      AoeStatus ge_session_ret = (*aoe_set_gesession_)(session_id_, ge_session_);
+      OP_REQUIRES_ASYNC(ctx, init_ret == AOE_SUCCESS && session_create_ret == AOE_SUCCESS && ge_session_ret == AOE_SUCCESS,
+                        errors::Internal("[GEOP] exec aoe init func failed."), done);
     }
     ADP_LOG(INFO) << "[GEOP] tf session: " << tf_session_ << " get ge session success.";
     sess_init_flag_ = true;
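Note for reviewers: the hunk above replaces the single AoeOnlineInitialize call with a three-step bring-up (process-level initialize, create a tuning session, bind the GE session). The sketch below shows that ordering and its error propagation in isolation. Everything in it is stubbed: AoeStatus and AOE_SUCCESS stand in for the aoe_types.h definitions, GeSession for ge::Session, plain std::string maps for the ge::AscendString option maps, and BringUpAoe is a hypothetical wrapper, not adapter code.

```cpp
#include <iostream>
#include <map>
#include <string>

using AoeStatus = int;                // stub: real type comes from aoe_types.h
constexpr AoeStatus AOE_SUCCESS = 0;  // stub: assumed success code
using SessionId = unsigned long;
struct GeSession {};                  // stub standing in for ge::Session
using Options = std::map<std::string, std::string>;  // stands in for the AscendString maps

// Stub bodies so the sketch links; the real entry points are resolved from
// libaoe_tuning.so at runtime.
AoeStatus AoeInitialize(const Options &) { return AOE_SUCCESS; }
AoeStatus AoeCreateSession(const Options &, SessionId &id) { id = 1; return AOE_SUCCESS; }
AoeStatus AoeSetGeSession(SessionId, GeSession *) { return AOE_SUCCESS; }

// Hypothetical wrapper showing the intended ordering: initialize once per
// process, create a tuning session (which fills session_id as an out-param),
// then bind the GE session to it. Each step short-circuits on failure.
AoeStatus BringUpAoe(const Options &global, const Options &session,
                     GeSession *ge_session, SessionId &session_id) {
  AoeStatus ret = AoeInitialize(global);
  if (ret != AOE_SUCCESS) return ret;
  ret = AoeCreateSession(session, session_id);
  if (ret != AOE_SUCCESS) return ret;
  return AoeSetGeSession(session_id, ge_session);
}

int main() {
  GeSession ge_session;
  SessionId id = 0;
  const Options global{{"work_path", "/tmp/aoe"}, {"devices", "0"}};
  const Options session{{"job_type", "2"}};
  std::cout << "bring-up status: " << BringUpAoe(global, session, &ge_session, id) << "\n";
  return 0;
}
```

The out-parameter on AoeCreateSession is why the typedef in geop_npu.h takes SessionId by reference: session_id_ is written by the call, not read.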
@@ -649,6 +672,46 @@ void GeOp::ComputeAsync(OpKernelContext *ctx, DoneCallback done) {
       graph_options_["ge.exec.placement"] = "HOST";
     }
     graph_options_["ge.shape_generalized_build_mode"] = "shape_precise";
+    if (dynamic_input_ == "1") {
+      graph_options_["ge.exec.dynamicInput"] = dynamic_input_;
+      graph_options_["ge.exec.dynamicGraphExecuteMode"] = dynamic_graph_execute_mode_;
+      graph_options_["ge.exec.dataInputsShapeRange"] = data_inputs_shape_range_;
+      if (dynamic_graph_execute_mode_ == "dynamic_execute" && data_inputs_shape_range_.empty()) {
+        graph_options_["ge.shape_generalized_build_mode"] = "shape_generalized";
+      }
+    }
+    if (is_tuning) {
+      if (is_train_graph_ != "1" && init_options_["ge.jobType"] != "2" && init_options_["ge.jobType"] != "1") {
+        ADP_LOG(INFO) << "[GEOP] in tune mode, non-training graphs should be cached.";
+        /**OP_REQUIRES_ASYNC(ctx, SessionManager::GetInstance().CacheGeGraphs(ge_session_, ge_graph),
+                             errors::Internal("[GEOP] cache ge session failed."), done);**/
+        std::vector<ge::Graph> dependGraph;
+        dependGraph.emplace_back(ge_graph);  // the non-training graph becomes a depend graph for tuning
+        AoeStatus depend_ret = (*aoe_set_dependgraphs_)(session_id_, dependGraph);
+        OP_REQUIRES_ASYNC(ctx, depend_ret == AOE_SUCCESS,
+                          errors::Internal("[GEOP] exec aoe set depend graphs func failed[", depend_ret, "]."), done);
+        build_flag_ = true;
+        // BuildOutTensorInfo(ctx);
+        done();
+        return;
+      } else {
+        ADP_LOG(INFO) << "[GEOP] in tune mode, training graph handled by tools.";
+        // std::vector<ge::Graph> ge_graphs;
+        std::map<ge::AscendString, ge::AscendString> tuningOptions;
+        /**OP_REQUIRES_ASYNC(ctx, SessionManager::GetInstance().GetGeGraphs(ge_session_, ge_graphs),
+                             errors::Internal("[GEOP] get ge session non-training graphs failed."), done);**/
+        tune_options_.insert(graph_options_.begin(), graph_options_.end());
+        // AoeStatus tune_ret = (*aoe_tuning_)(ge_graph, ge_graphs, ge_session_, tune_options_);
+        AoeStatus set_graph_ret = (*aoe_set_tuninggraph_)(session_id_, ge_graph);
+        OP_REQUIRES_ASYNC(ctx, set_graph_ret == AOE_SUCCESS,
+                          errors::Internal("[GEOP] exec aoe set tuning graph func failed[", set_graph_ret, "]."), done);
+        tuningOptions.insert({ge::AscendString("ge.exec.dataInputsShapeRange"),
+                              ge::AscendString(data_inputs_shape_range_.c_str())});
+        AoeStatus tune_ret = (*aoe_tuning_graph_)(session_id_, tuningOptions);
+        OP_REQUIRES_ASYNC(ctx, (tune_ret == AOE_SUCCESS) || (tune_ret == AOE_ERROR_NO_AICORE_GRAPH),
+                          errors::Internal("[GEOP] exec aoe tuning func failed[", tune_ret, "]."), done);
+        ADP_LOG(INFO) << "[GEOP] aoe success[" << tune_ret << "].";
+        build_flag_ = true;
+        // BuildOutTensorInfo(ctx);
+        done();
+        return;
+      }
+    }
     SetDynamicInput();
 
     // call ge session addGraph api
diff --git a/tf_adapter/kernels/geop_npu.h b/tf_adapter/kernels/geop_npu.h
index 08c2350c8d8bb88a88874a39f537ae410dc5f112..a9ed6b197695a636467f5f36aa3a837c56185c8e 100644
--- a/tf_adapter/kernels/geop_npu.h
+++ b/tf_adapter/kernels/geop_npu.h
@@ -32,10 +32,18 @@ #include
 
 namespace tensorflow {
+using SessionId = unsigned long;
 using AoeTuningFunc = AoeStatus (*)(ge::Graph &, std::vector<ge::Graph> &, ge::Session *,
                                     const std::map<std::string, std::string> &);
 using AoeInitFunc = AoeStatus (*)(ge::Session *, const std::map<std::string, std::string> &);
 using AoeFinalizeFunc = AoeStatus (*)();
+// TEST: session-based Aoe API (see geop_npu.cc)
+using AoeInitializeFunc = AoeStatus (*)(const std::map<ge::AscendString, ge::AscendString> &);
+using AoeCreateSessionFunc = AoeStatus (*)(const std::map<ge::AscendString, ge::AscendString> &, SessionId &);
+using AoeSetGeSessionFunc = AoeStatus (*)(SessionId, ge::Session *);
+using AoeSetDependGraphFunc = AoeStatus (*)(SessionId, std::vector<ge::Graph> &);
+using AoeSetTuningGraphFunc = AoeStatus (*)(SessionId, ge::Graph &);
+using AoeTuningGraphFunc = AoeStatus (*)(SessionId, const std::map<ge::AscendString, ge::AscendString> &);
 
 class GeOp : public AsyncOpKernel {
  public:
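Note for reviewers: the typedefs above are only as good as the symbols actually exported by libaoe_tuning.so. Below is a self-contained presence check for the seven entry points; it deliberately uses POSIX dlfcn instead of the adapter's MMPA wrappers (mmDlopen/mmDlsym) so it builds standalone, and it verifies existence only, not signatures.

```cpp
#include <dlfcn.h>
#include <cstdio>

int main() {
  void *handle = dlopen("libaoe_tuning.so", RTLD_NOW);
  if (handle == nullptr) {
    std::fprintf(stderr, "dlopen failed: %s\n", dlerror());
    return 1;
  }
  // Same symbol names GeOp::Initialize resolves via mmDlsym.
  const char *symbols[] = {"AoeInitialize",   "AoeFinalize",        "AoeCreateSession",
                           "AoeSetGeSession", "AoeSetDependGraphs", "AoeSetTuningGraph",
                           "AoeTuningGraph"};
  int missing = 0;
  for (const char *name : symbols) {
    if (dlsym(handle, name) == nullptr) {
      std::fprintf(stderr, "dlsym(%s) failed: %s\n", name, dlerror());
      ++missing;
    }
  }
  dlclose(handle);
  return missing == 0 ? 0 : 1;
}
```

Build with `g++ -o check_aoe check_aoe.cc -ldl` and run it on the target host; a missing symbol there means the installed library predates this API migration, and GeOp::Initialize would fail its OP_REQUIRES check for the same reason.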
@@ -144,12 +152,21 @@ class GeOp : public AsyncOpKernel {
   std::string getnext_inputs_shape_range_;
   bool need_compile_graph_first_;
   AoeInitFunc aoe_init_;
-  AoeFinalizeFunc aoe_finalize_;
+  // AoeFinalizeFunc aoe_finalize_;
   std::map<std::string, std::string> tune_options_;
   std::string is_dynamic_getnext_;
   std::string placeholder_index_;
   std::atomic_flag tuned_flag_;
+  // TEST
+  SessionId session_id_;
+  AoeInitializeFunc aoe_initialize_;
+  AoeFinalizeFunc aoe_finalize_;
+  AoeCreateSessionFunc aoe_create_session_;
+  AoeSetGeSessionFunc aoe_set_gesession_;
+  AoeSetDependGraphFunc aoe_set_dependgraphs_;
+  AoeSetTuningGraphFunc aoe_set_tuninggraph_;
+  AoeTuningGraphFunc aoe_tuning_graph_;
 };
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_KERNELS_GEOP_NPU_H_
diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py
index 08cf8c8696f565cf645055dda2753b595a4ec826..043fe6ee63e30b6d9e3233428946e195167bbaca 100644
--- a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py
+++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py
@@ -90,7 +90,10 @@ class NPURunConfig(run_config_lib.RunConfig):
                  distribute_config=None,
                  modify_mixlist=None,
                  op_precision_mode=None,
-                 device_type="default_device_type"
+                 device_type="default_device_type",
+                 # test
+                 hccl_timeout=0,
+                 operator_timeout=0,
                  ):
         """
         Constructs a NPUConfig.
@@ -216,6 +219,9 @@ class NPURunConfig(run_config_lib.RunConfig):
         self._modify_mixlist = modify_mixlist
         self._op_precision_mode = op_precision_mode
         self._device_type = device_type
+        # test
+        self.hccl_timeout = hccl_timeout
+        self.operator_timeout = operator_timeout
 
         super(NPURunConfig, self).__init__(
             model_dir=model_dir,
diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py
index 0d6c58db260d54f21ca84180df74def33fe1b86e..562abb4b7670ab92fd6ee7d658716802dd5d10b1 100644
--- a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py
+++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py
@@ -668,6 +668,10 @@ class NPUEstimator(estimator_lib.Estimator):
         custom_op.parameter_map["is_tailing_optimization"].b = config.is_tailing_optimization
         custom_op.parameter_map["min_group_size"].b = 1
         custom_op.parameter_map["hcom_parallel"].b = config._hcom_parallel
+        # test: the timeouts are integers, so use the AttrValue .i field (GetPassOptions reads .i())
+        custom_op.parameter_map["hccl_timeout"].i = config.hccl_timeout
+        custom_op.parameter_map["operator_timeout"].i = config.operator_timeout
+
         if config._graph_memory_max_size is not None:
             custom_op.parameter_map["graph_memory_max_size"].s = tf.compat.as_bytes(str(config._graph_memory_max_size))
         if config._variable_memory_max_size is not None:
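Note for reviewers: the estimator writes hccl_timeout/operator_timeout as integer AttrValues, and GetPassOptions in the next file reads them back with .i(). A minimal sketch of that producer/consumer contract follows; ReadIntOption and the plain std::map used in place of the protobuf parameter map are illustrative only, not adapter code.

```cpp
#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include "tensorflow/core/framework/attr_value.pb.h"

// Read an integer option from a custom-optimizer parameter map, falling back
// to a default when the key is absent. AttrValue.i() returns 0 if the bool
// field was set instead, which is why the producer must write .i, not .b.
int64_t ReadIntOption(const std::map<std::string, tensorflow::AttrValue> &params,
                      const std::string &key, int64_t fallback) {
  const auto it = params.find(key);
  return it == params.end() ? fallback : it->second.i();
}

int main() {
  std::map<std::string, tensorflow::AttrValue> params;
  params["hccl_timeout"].set_i(600);
  std::cout << ReadIntOption(params, "hccl_timeout", 0) << "\n";      // 600
  std::cout << ReadIntOption(params, "operator_timeout", 0) << "\n";  // 0 (key absent)
  return 0;
}
```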
"NpuOptimizer") { @@ -574,6 +577,9 @@ std::map NpuAttrs::GetPassOptions(const GraphOptimizat } if (params.count("in_out_pair_flag")) { in_out_pair_flag = params.at("in_out_pair_flag").b(); } if (params.count("in_out_pair")) { in_out_pair = params.at("in_out_pair").s(); } + // test + if (params.count("hccl_timeout")) { hccl_timeout = params.at("in_out_pair").i(); } + if (params.count("operator_timeout")) { operator_timeout = params.at("in_out_pair").i(); } } } if (!do_npu_optimizer) { @@ -583,6 +589,7 @@ std::map NpuAttrs::GetPassOptions(const GraphOptimizat mutable_rewrite_options()->set_remapping(RewriterConfig::OFF); } } + // pass options pass_options["do_npu_optimizer"] = std::to_string(do_npu_optimizer); pass_options["enable_dp"] = std::to_string(enable_dp); @@ -599,7 +606,9 @@ std::map NpuAttrs::GetPassOptions(const GraphOptimizat pass_options["local_device_list"] = local_device_list; pass_options["in_out_pair_flag"] = std::to_string(in_out_pair_flag); pass_options["in_out_pair"] = in_out_pair; - + // test + pass_options["hccl_timeout"] = std::to_string(hccl_timeout); + pass_options["operator_timeout"] = std::to_string(operator_timeout); return pass_options; }