From 6f86a95e5afdc93c8132209569207c1be0683c86 Mon Sep 17 00:00:00 2001 From: gengchao Date: Tue, 24 Dec 2024 15:21:44 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E8=87=AA=E5=8A=A8=E8=9E=8D?= =?UTF-8?q?=E5=90=88=E9=80=89=E9=A1=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tf_adapter/interface_spec/api_npu_config.pyh | 2 +- tf_adapter/kernels/geop_npu.cc | 5 ++++- tf_adapter/kernels/geop_npu.h | 1 + .../npu_bridge/estimator/npu/npu_config.py | 4 +++- .../npu_bridge/estimator/npu/npu_estimator.py | 1 + tf_adapter/util/ge_plugin.cc | 1 + tf_adapter/util/npu_attrs.cc | 19 +++++++++++++++++++ .../npu_device/core/npu_wrapper.cpp | 3 ++- .../python/npu_device/configs/npu_config.py | 2 +- 9 files changed, 33 insertions(+), 5 deletions(-) diff --git a/tf_adapter/interface_spec/api_npu_config.pyh b/tf_adapter/interface_spec/api_npu_config.pyh index cafdb3f24..bf687c385 100644 --- a/tf_adapter/interface_spec/api_npu_config.pyh +++ b/tf_adapter/interface_spec/api_npu_config.pyh @@ -22,7 +22,7 @@ class NPURunConfig(run_config_lib.RunConfig): event_sync_timeout=-1, external_weight=False, es_cluster_config=None, deterministic=0, frozen_variable=False, variable_placement="Device", jit_compile="auto", precision_mode_v2=None, ac_parallel_enable=None, quant_dumpable=None, input_fusion_size=131072, compile_dynamic_mode=None, - execute_times=-1, graph_max_parallel_model_num=1, export_compile_stat=1): + execute_times=-1, graph_max_parallel_model_num=1, export_compile_stat=1, ge_autofuse=False): class ProfilingConfig(): def __init__(self, enable_profiling=False, profiling_options=None): diff --git a/tf_adapter/kernels/geop_npu.cc b/tf_adapter/kernels/geop_npu.cc index 80fa15a3b..bcca9cefe 100644 --- a/tf_adapter/kernels/geop_npu.cc +++ b/tf_adapter/kernels/geop_npu.cc @@ -414,6 +414,7 @@ void GeOp::Initialize(OpKernelConstruction *ctx) { (void) ctx->GetAttr("_embedding_flags", &embedding_flags_); (void) 
ctx->GetAttr("_dynamic_input", &dynamic_input_); (void) ctx->GetAttr("_jit_compile", &jit_compile_); + (void) ctx->GetAttr("_ge_autofuse", &ge_autofuse_); if (!dynamic_input_.empty() && dynamic_input_ == "1") { jit_compile_ = "1"; is_dynamic_input_ = true; @@ -435,7 +436,8 @@ void GeOp::Initialize(OpKernelConstruction *ctx) { << ", is_var_init_graph: " << is_var_init_graph_ << ", use_counter_filter: " << use_counter_filter_ << ", max_key_num: " << max_key_num_ << ", embedding_dim: " << embedding_dim_ << ", padding_key: " << padding_key_ << ", embedding_flags: " << embedding_flags_ - << ", compile_dynamic_mode: " << compile_dynamic_mode_; + << ", compile_dynamic_mode: " << compile_dynamic_mode_ + << ", ge_autofuse: " << ge_autofuse_; // global environment Initialize, invoke once for each process std::string sess_config = ""; @@ -1239,6 +1241,7 @@ Status GeOp::SetGraphOptions(OpKernelContext *ctx) { graph_options_["ge.jit_compile"] = jit_compile_; graph_options_["ge.exec.overflow"] = "1"; graph_options_["ge.graphLevelSat"] = (mix_compile_mode_ == "0") ? "1" : "0"; + graph_options_["ge.autofuse"] = ge_autofuse_ ? 
"true" : "false"; return DoAccelerateTrain(); } diff --git a/tf_adapter/kernels/geop_npu.h b/tf_adapter/kernels/geop_npu.h index fc44c4022..d0e7c906f 100644 --- a/tf_adapter/kernels/geop_npu.h +++ b/tf_adapter/kernels/geop_npu.h @@ -279,6 +279,7 @@ public: AccelerateInfo accelerate_info_; GraphHandler graph_handler_; bool need_compile_graph_first_; + bool ge_autofuse_ = false; }; } // namespace tensorflow #endif // TENSORFLOW_KERNELS_GEOP_NPU_H_ diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py index a1789fe25..50000b41d 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py @@ -117,7 +117,8 @@ class NPURunConfig(run_config_lib.RunConfig): compile_dynamic_mode=None, execute_times=-1, graph_max_parallel_model_num=1, - export_compile_stat=1 + export_compile_stat=1, + ge_autofuse=False ): """ Constructs a NPUConfig. @@ -286,6 +287,7 @@ class NPURunConfig(run_config_lib.RunConfig): self._graph_max_parallel_model_num = graph_max_parallel_model_num self.execute_times = execute_times self._export_compile_stat = export_compile_stat + self._ge_autofuse = ge_autofuse super(NPURunConfig, self).__init__( model_dir=model_dir, tf_random_seed=tf_random_seed, diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py index 7b372f621..fe7e462d7 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py @@ -821,6 +821,7 @@ class NPUEstimator(estimator_lib.Estimator): custom_op.parameter_map["frozen_variable"].b = config._frozen_variable custom_op.parameter_map["variable_placement"].s = tf.compat.as_bytes(config._variable_placement) custom_op.parameter_map["execute_times"].i = config.execute_times + custom_op.parameter_map["ge_autofuse"].b = config._ge_autofuse 
self.__load_session_device_id(config, custom_op) self.__load_modify_mixlist(config, custom_op) diff --git a/tf_adapter/util/ge_plugin.cc b/tf_adapter/util/ge_plugin.cc index 443743f51..4eb14fce7 100644 --- a/tf_adapter/util/ge_plugin.cc +++ b/tf_adapter/util/ge_plugin.cc @@ -127,6 +127,7 @@ void SetOptionNameMap(json &option_name_map) { option_name_map.emplace("ge.executeTimes", "execute_times"); option_name_map.emplace(ge::OPTION_EXEC_DYNAMIC_EXECUTE_MODE, "dynamic_graph_execute_mode"); option_name_map.emplace(ge::OPTION_EXEC_DYNAMIC_INPUT, "dynamic_input"); + option_name_map.emplace("ge.autofuse", "ge_autofuse"); } } // namespace diff --git a/tf_adapter/util/npu_attrs.cc b/tf_adapter/util/npu_attrs.cc index d89e01ebc..765277a87 100644 --- a/tf_adapter/util/npu_attrs.cc +++ b/tf_adapter/util/npu_attrs.cc @@ -689,6 +689,7 @@ std::map NpuAttrs::GetInitOptions(const OpKernelConstr std::string es_cluster_config; std::string execute_times = "-1"; std::string export_compile_stat; + std::string ge_autofuse = "0"; if (ctx != nullptr && ctx->GetAttr("_NpuOptimizer", &npuOptimizer) == Status::OK()) { (void) ctx->GetAttr("_precision_mode", &precision_mode); @@ -731,6 +732,7 @@ std::map NpuAttrs::GetInitOptions(const OpKernelConstr (void) ctx->GetAttr("_es_cluster_config", &es_cluster_config); (void) ctx->GetAttr("_execute_times", &execute_times); (void) ctx->GetAttr("_export_compile_stat", &export_compile_stat); + (void) ctx->GetAttr("_ge_autofuse", &ge_autofuse); } std::lock_guard lock(mutex_); @@ -785,6 +787,8 @@ std::map NpuAttrs::GetInitOptions(const OpKernelConstr init_options_["ge.esClusterConfig"] = es_cluster_config; init_options_["execute_times"] = execute_times; init_options_["ge.executeTimes"] = execute_times; + init_options_["ge.autofuse"] = ge_autofuse; + init_options_["ge_autofuse"] = ge_autofuse; if (!export_compile_stat.empty()) { init_options_["export_compile_stat"] = export_compile_stat; init_options_["ge.exportCompileStat"] = export_compile_stat; @@ 
-1213,6 +1217,7 @@ std::map NpuAttrs::GetAllAttrOptions(const AttrSlice & std::string compile_dynamic_mode; std::string execute_times = "-1"; std::string export_compile_stat; + std::string ge_autofuse="0"; auto NpuOptimizer_value = attrs.Find("_NpuOptimizer"); auto enable_data_pre_proc_value = attrs.Find("_enable_data_pre_proc"); @@ -1310,6 +1315,7 @@ std::map NpuAttrs::GetAllAttrOptions(const AttrSlice & auto compile_dynamic_mode_value = attrs.Find("_compile_dynamic_mode"); auto execute_times_value = attrs.Find("_execute_times"); auto export_compile_stat_value = attrs.Find("_export_compile_stat"); + auto ge_autofuse_value = attrs.Find("_ge_autofuse"); if (NpuOptimizer_value != nullptr) { do_npu_optimizer = "1"; if (enable_data_pre_proc_value != nullptr) { @@ -1622,6 +1628,9 @@ std::map NpuAttrs::GetAllAttrOptions(const AttrSlice & if (export_compile_stat_value != nullptr) { export_compile_stat = export_compile_stat_value->s(); } + if (ge_autofuse_value != nullptr) { + ge_autofuse = ge_autofuse_value->s(); + } } all_options["variable_format_optimize"] = variable_format_optimize; @@ -1734,6 +1743,8 @@ std::map NpuAttrs::GetAllAttrOptions(const AttrSlice & all_options["compile_dynamic_mode"] = compile_dynamic_mode; all_options["execute_times"] = execute_times; all_options["ge.executeTimes"] = execute_times; + all_options["ge.autofuse"] = ge_autofuse; + all_options["ge_autofuse"] = ge_autofuse; if (!export_compile_stat.empty()) { all_options["export_compile_stat"] = export_compile_stat; all_options["ge.exportCompileStat"] = export_compile_stat; @@ -1862,6 +1873,7 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options std::string accelerate_train_mode; int32_t execute_times = -1; int32_t export_compile_stat = 1; + bool ge_autofuse = false; const RewriterConfig &rewrite_options = options.session_options->config.graph_options().rewrite_options(); for (const auto &custom_optimizer : rewrite_options.custom_optimizers()) { @@ -2404,6 +2416,10 @@ 
Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options if (params.count("execute_times") > 0) { execute_times = params.at("execute_times").i(); } + + if (params.count("ge_autofuse") > 0) { + ge_autofuse = params.at("ge_autofuse").b(); + } if (params.count("frozen_variable") > 0) { frozen_variable = params.at("frozen_variable").b(); } @@ -2563,6 +2579,9 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options init_options_["ge.esClusterConfig"] = es_cluster_config; init_options_["execute_times"] = std::to_string(execute_times); init_options_["ge.executeTimes"] = std::to_string(execute_times); + + init_options_["ge_autofuse"] = std::to_string(ge_autofuse); + init_options_["ge.autofuse"] = std::to_string(ge_autofuse); for (const auto &option : init_options_) { std::string attr_name = std::string("_") + option.first; node->AddAttr(attr_name, option.second); diff --git a/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp index 73dec5535..567179770 100644 --- a/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp +++ b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp @@ -102,7 +102,8 @@ const std::map kGlobalConfigOptions = { {"_distribute.cm_chief_port", ge::OPTION_EXEC_CM_CHIEF_PORT}, {"_distribute.cm_chief_worker_device", ge::OPTION_EXEC_CM_CHIEF_DEVICE}, {"_distribute.cm_worker_ip", ge::OPTION_EXEC_CM_WORKER_IP}, - {"_distribute.cm_worker_size", ge::OPTION_EXEC_CM_WORKER_SIZE} + {"_distribute.cm_worker_size", ge::OPTION_EXEC_CM_WORKER_SIZE}, + {"ge_autofuse", "ge.autofuse"} }; const std::map kSessionConfigOptions = { diff --git a/tf_adapter_2.x/python/npu_device/configs/npu_config.py b/tf_adapter_2.x/python/npu_device/configs/npu_config.py index 21abadeca..eb853a8dd 100644 --- a/tf_adapter_2.x/python/npu_device/configs/npu_config.py +++ b/tf_adapter_2.x/python/npu_device/configs/npu_config.py @@ -80,5 +80,5 @@ class NpuConfig(NpuBaseConfig): ['fp16', 'origin', 
'cube_fp16in_fp32out', 'mixed_float16', 'mixed_bfloat16', 'cube_hif8', 'mixed_hif8']) self.export_compile_stat = OptionValue(1, [0, 1, 2]) - + self.ge_autofuse = OptionValue(False, [True, False]) super(NpuConfig, self).__init__() -- Gitee