diff --git a/tf_adapter/kernels/geop_npu.cc b/tf_adapter/kernels/geop_npu.cc
index 82c27e49f9a25c5c8b6da8ec6d02dec0f8cfaa36..bd66ca6a263804b61c6f92b127eeaba5a6f45d60 100644
--- a/tf_adapter/kernels/geop_npu.cc
+++ b/tf_adapter/kernels/geop_npu.cc
@@ -2125,8 +2125,11 @@ int GeOp::RunTuning(std::vector<Tensor> &input_vec, std::vector<ge::Tensor> &inp
   SetDynamicInput();
   graph_options_["ge.exec.overflow"] = "1";
   graph_options_["ge.graphLevelSat"] = (mix_compile_mode_ == "0") ? "1" : "0";
 
-  // run aoe tuning
+  return ExecuteAoeTuning(ge_graph, is_allreduce, inputs);
+}
+
+int GeOp::ExecuteAoeTuning(ge::Graph &ge_graph, bool is_allreduce, std::vector<ge::Tensor> &inputs) {
   if ((init_options_["ge.jobType"] == "1") || (init_options_["ge.jobType"] == "2") ||
       ((init_options_["ge.jobType"] == "4") && is_allreduce)) {
     std::function<void()> callback = [this]() {
diff --git a/tf_adapter/kernels/geop_npu.h b/tf_adapter/kernels/geop_npu.h
index 7fc55199ccf494e682c92d4f1c4dd112212c25b7..c0a0b8028174e20d32c6ec29079a291be91ed3b9 100644
--- a/tf_adapter/kernels/geop_npu.h
+++ b/tf_adapter/kernels/geop_npu.h
@@ -170,6 +170,8 @@ public:
   int RunTuning(std::vector<Tensor> &input_vec, std::vector<ge::Tensor> &inputs,
                 const OpKernelContext *const ctx);
 
+  int ExecuteAoeTuning(ge::Graph &ge_graph, bool is_allreduce, std::vector<ge::Tensor> &inputs);
+
   std::string BuildSubGraph(FunctionLibraryDefinition *flib_def, const std::string &graph);
 
   void SetDynamicInput();
diff --git a/tf_adapter/util/npu_attrs.cc b/tf_adapter/util/npu_attrs.cc
index 781677d43805a8bec4853802e97888fad451e551..6ee28bafe327e9d92f0c28be527ee9f4766fdb79 100644
--- a/tf_adapter/util/npu_attrs.cc
+++ b/tf_adapter/util/npu_attrs.cc
@@ -516,7 +516,8 @@ std::map<std::string, std::string> NpuAttrs::GetSessOptions(const OpKernelConstr
   std::string aicore_num;
   std::string all_tensor_not_empty;
   std::string auto_multistream_parallel_mode;
-  if (ctx != nullptr && ctx->GetAttr("_NpuOptimizer", &npuOptimizer) == Status::OK()) {
+  const bool is_npu_optimizer_valid = (ctx != nullptr && ctx->GetAttr("_NpuOptimizer", &npuOptimizer) == Status::OK());
+  if (is_npu_optimizer_valid) {
     (void) ctx->GetAttr("_variable_format_optimize", &variable_format_optimize);
     (void) ctx->GetAttr("_hcom_parallel", &hcom_parallel);
     (void) ctx->GetAttr("_graph_memory_max_size", &graph_memory_max_size);
@@ -524,12 +525,13 @@ std::map<std::string, std::string> NpuAttrs::GetSessOptions(const OpKernelConstr
     (void) ctx->GetAttr("_enable_dump", &enable_dump);
     (void) ctx->GetAttr("_enable_dump_debug", &enable_dump_debug);
     (void) ctx->GetAttr("_input_fusion_size", &input_fusion_size);
-
-    if (enable_dump != "0" || enable_dump_debug != "0") {
+    const bool need_dump_path = enable_dump != "0" || enable_dump_debug != "0";
+    if (need_dump_path) {
       (void) ctx->GetAttr("_dump_path", &dump_path);
     }
     if (enable_dump != "0") {
-      if (ctx->GetAttr("_dump_step", &dump_step) == Status::OK() && !dump_step.empty()) {
+      const bool is_valid_dump_step = ctx->GetAttr("_dump_step", &dump_step) == Status::OK() && !dump_step.empty();
+      if (is_valid_dump_step) {
         Status s = checkDumpStep(dump_step);
         if (!s.ok()) {
           ADP_LOG(FATAL) << s.error_message();
@@ -622,8 +624,9 @@ std::map<std::string, std::string> NpuAttrs::GetSessOptions(const OpKernelConstr
   sess_options[ge::OPTYPELIST_FOR_IMPLMODE] = optypelist_for_implmode;
   sess_options[ge::GRAPH_MAX_PARALLEL_MODEL_NUM] = graph_max_parallel_model_num;
   // If compile_dynamic_mode is 0 and jit_compile != 1, or compile_dynamic_mode is 0 with jit_compile = 1 and shape_generalization_mode != STRICT, set ge.compile_dynamic_mode to 1
-  if ((compile_dynamic_mode == "0" && jit_compile != "1") ||
-      (compile_dynamic_mode == "0" && jit_compile == "1" && shape_generalization_mode != "STRICT")) {
+  const bool need_set_1 = (compile_dynamic_mode == "0" && jit_compile != "1") ||
+                          (compile_dynamic_mode == "0" && jit_compile == "1" && shape_generalization_mode != "STRICT");
+  if (need_set_1) {
     sess_options["ge.compile_dynamic_mode"] = "1";
   } else {
     sess_options["ge.compile_dynamic_mode"] = compile_dynamic_mode;
diff --git a/tf_adapter_2.x/npu_device/core/optimizers/runtime/node_placer.cpp b/tf_adapter_2.x/npu_device/core/optimizers/runtime/node_placer.cpp
index 805f86960848bc0fd18993dc1c35ec2ff685d71e..a63aa3e842922edabf2ca5dcc6e1cceb22fb5d69 100644
--- a/tf_adapter_2.x/npu_device/core/optimizers/runtime/node_placer.cpp
+++ b/tf_adapter_2.x/npu_device/core/optimizers/runtime/node_placer.cpp
@@ -438,10 +438,12 @@ tensorflow::Status NodePlacer::BuildConcreteCluster() {
     if (IsNodePlacedOn(node, Placement::CPU)) {
       continue;
     }
-    if (std::any_of(node->out_edges().begin(), node->out_edges().end(),
-                    [this](const tensorflow::Edge *edge) { return !IsSupportedNpuBound(*edge); }) &&
-        std::all_of(node->in_edges().begin(), node->in_edges().end(),
-                    [this](const tensorflow::Edge *edge) { return IsSupportedNpuBound(*edge); })) {
+    const bool is_start_node =
+        std::any_of(node->out_edges().begin(), node->out_edges().end(),
+                    [this](const tensorflow::Edge *edge) { return !IsSupportedNpuBound(*edge); }) &&
+        std::all_of(node->in_edges().begin(), node->in_edges().end(),
+                    [this](const tensorflow::Edge *edge) { return IsSupportedNpuBound(*edge); });
+    if (is_start_node) {
       DLOG() << "Need concrete for start node " << node->name();
       starts.push_back(node);
     }
@@ -468,7 +470,8 @@ tensorflow::Status NodePlacer::BuildConcreteCluster() {
     std::queue<std::shared_ptr<Cluster>> q;
     for (auto &node : cluster->nodes) {
       auto iter = concrete_clusters_.find(node);
-      if ((iter != concrete_clusters_.end()) && (iter->second != cluster)) {
+      const bool need_push = (iter != concrete_clusters_.end()) && (iter->second != cluster);
+      if (need_push) {
         q.push(iter->second);
       }
     }
@@ -483,7 +486,8 @@ tensorflow::Status NodePlacer::BuildConcreteCluster() {
       for (auto &node : path_cluster->nodes) {
         (void)cluster->Merge(node);
         auto iter = concrete_clusters_.find(node);
-        if (iter != concrete_clusters_.end() && iter->second != path_cluster) {
+        const bool need_push = iter != concrete_clusters_.end() && iter->second != path_cluster;
+        if (need_push) {
          q.push(iter->second);
        }
      }
diff --git a/tf_adapter_2.x/npu_device/core/optimizers/runtime/npu_trans_resource_input_to_node_optimizer.cpp b/tf_adapter_2.x/npu_device/core/optimizers/runtime/npu_trans_resource_input_to_node_optimizer.cpp
index 6cb0a2bc50730936039a863ad82f596033690ea1..6b6c6eaa2a5e6f7044170c3d08664c3c65755e61 100644
--- a/tf_adapter_2.x/npu_device/core/optimizers/runtime/npu_trans_resource_input_to_node_optimizer.cpp
+++ b/tf_adapter_2.x/npu_device/core/optimizers/runtime/npu_trans_resource_input_to_node_optimizer.cpp
@@ -289,6 +289,26 @@ tensorflow::Status TransResourceInput2Node(TFE_Context *context, tensorflow::Gra
   return tensorflow::Status::OK();
 }
 
+
+void GetRetvalNodes(tensorflow::Graph *mutable_graph, std::map<int, tensorflow::Node *> &indexed_retvals) {
+  for (auto node : mutable_graph->op_nodes()) {
+    if (!node->IsRetval()) {
+      continue;
+    }
+    indexed_retvals[node->attrs().Find("index")->i()] = node;
+  }
+}
+
+void GetConsumeInAndProduceOut(npu::NpuMutableConcreteGraph *graph, std::set<int> &consumed_inputs,
+                               std::set<int> &produced_outputs) {
+  for (auto node : graph->Graph()->nodes()) {
+    if (node->IsArg()) {
+      (void)consumed_inputs.insert(node->attrs().Find("index")->i());
+    } else if (node->IsRetval()) {
+      (void)produced_outputs.insert(node->attrs().Find("index")->i());
+    }
+  }
+}
 }  // namespace
 
 namespace npu {
@@ -308,12 +328,7 @@ tensorflow::Status TransResourceInput2NodeOptimize(TFE_Context *context, NpuMuta
   std::map<int, int> bypass_outputs;
   std::map<int, tensorflow::Node *> indexed_retvals;
 
-  for (auto node : mutable_graph->op_nodes()) {
-    if (!node->IsRetval()) {
-      continue;
-    }
-    indexed_retvals[node->attrs().Find("index")->i()] = node;
-  }
+  GetRetvalNodes(mutable_graph, indexed_retvals);
 
   for (auto item : indexed_retvals) {
     const tensorflow::Edge *edge;
@@ -394,13 +409,7 @@ tensorflow::Status TransResourceInput2NodeOptimize(TFE_Context *context, NpuMuta
 
   std::set<int> consumed_inputs;
   std::set<int> produced_outputs;
-  for (auto node : graph->Graph()->nodes()) {
-    if (node->IsArg()) {
-      (void)consumed_inputs.insert(node->attrs().Find("index")->i());
-    } else if (node->IsRetval()) {
-      (void)produced_outputs.insert(node->attrs().Find("index")->i());
-    }
-  }
+  GetConsumeInAndProduceOut(graph, consumed_inputs, produced_outputs);
 
   FixGraphArgRetvalIndex(*mutable_graph);
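
Note: every hunk above applies one of two mechanical refactorings: hoisting a compound condition into a named const bool (is_npu_optimizer_valid, need_dump_path, is_valid_dump_step, need_set_1, is_start_node, need_push), or extracting a repeated loop into a named helper (ExecuteAoeTuning, GetRetvalNodes, GetConsumeInAndProduceOut). As a standalone sketch of the first pattern, the ge.compile_dynamic_mode rule from npu_attrs.cc can be packaged and checked in isolation; ResolveCompileDynamicMode below is a hypothetical wrapper written for illustration, not part of the patch:

#include <cassert>
#include <string>

// Hypothetical helper (not in the patch): returns the value the patch would
// write to sess_options["ge.compile_dynamic_mode"] for the given attributes.
std::string ResolveCompileDynamicMode(const std::string &compile_dynamic_mode,
                                      const std::string &jit_compile,
                                      const std::string &shape_generalization_mode) {
  // Same named-condition shape as the patch: force "1" when
  // compile_dynamic_mode is "0" and either jit_compile is not "1", or
  // jit_compile is "1" but shape generalization is not STRICT.
  const bool need_set_1 =
      (compile_dynamic_mode == "0" && jit_compile != "1") ||
      (compile_dynamic_mode == "0" && jit_compile == "1" && shape_generalization_mode != "STRICT");
  return need_set_1 ? "1" : compile_dynamic_mode;
}

int main() {
  assert(ResolveCompileDynamicMode("0", "0", "STRICT") == "1");  // jit_compile off: forced to "1"
  assert(ResolveCompileDynamicMode("0", "1", "FULL") == "1");    // non-STRICT generalization: forced to "1"
  assert(ResolveCompileDynamicMode("0", "1", "STRICT") == "0");  // STRICT: keep original "0"
  assert(ResolveCompileDynamicMode("1", "1", "STRICT") == "1");  // already "1": passed through
  return 0;
}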