diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py
index 3cd4f55b2ff752d00c53ec64ad1be0ba040aa14b..50fde173b18ba97689607e7ad4f1eef358012aff 100644
--- a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py
+++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py
@@ -110,7 +110,8 @@ class NPURunConfig(run_config_lib.RunConfig):
                  frozen_variable=False,
                  variable_placement="Device",
                  jit_compile="auto",
-                 precision_mode_v2=None
+                 precision_mode_v2=None,
+                 ac_parallel_enable=None
                  ):
         """
         Constructs a NPUConfig.
@@ -144,6 +145,7 @@ class NPURunConfig(run_config_lib.RunConfig):
         dump_config: The dump configuration.
         stream_max_parallel_num: Specify the degree of parallelism of the AICPU / AICORE engine
                                  to achieve parallel execution between AICPU / AICORE operators.
+        ac_parallel_enable: Enable engines such as AICPU to run in parallel with other engines in dynamic shape graphs.
         op_select_implmode: Selecting whether the operator is implemented with high_precision
                             or high_performance or high_precision_for_all or high_performance_for_all.
         optypelist_for_implmode: Operator list.
@@ -211,6 +213,7 @@ class NPURunConfig(run_config_lib.RunConfig):
 
         self._dump_config = self._get_dump_config(dump_config)
         self._stream_max_parallel_num = stream_max_parallel_num
+        self._ac_parallel_enable = ac_parallel_enable
         self.horovod_mode = self._get_horovod_mode(horovod_mode)
 
         util.check_nonnegative_integer(graph_run_mode, "graph_run_mode")
diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py
index f66fd572212d3cd2348548758a61b9aa5b3001d3..c32e72bfaf40b1d8e67eac067a024238003b73dd 100644
--- a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py
+++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py
@@ -631,6 +631,15 @@ class NPUEstimator(estimator_lib.Estimator):
         if config._stream_max_parallel_num is not None:
             custom_op.parameter_map["stream_max_parallel_num"].s = tf.compat.as_bytes(config._stream_max_parallel_num)
 
+    def __load_ac_parallel_enable_config(self, config, custom_op):
+        """Load ac_parallel_enable config and add it to custom_optimizers
+        Args:
+            config: NPURunConfig.
+            custom_op: Custom optimizers.
+ """ + if config._ac_parallel_enable is not None: + custom_op.parameter_map["ac_parallel_enable"].s = tf.compat.as_bytes(config._ac_parallel_enable) + def __load_ps_mode_config(self, custom_op): """Load stream_max_parallel_num config ,and add to custom_optimizers Args: @@ -809,6 +818,8 @@ class NPUEstimator(estimator_lib.Estimator): # add stream_max_parallel to custom_op self.__load_stream_max_config(config, custom_op) + self.__load_ac_parallel_enable_config(config, custom_op) + self.__load_ps_mode_config(custom_op) self._load_op_performance_config(config, custom_op) diff --git a/tf_adapter/tests/st/optimizers/pbtxt/om_test_build_geop.pbtxt b/tf_adapter/tests/st/optimizers/pbtxt/om_test_build_geop.pbtxt index 3fe48905e8cc7190c2c3dc1ddbc2cb2327ebb604..3af7fa793b547a6b466dfc67e80fc918b2d0fcbb 100644 --- a/tf_adapter/tests/st/optimizers/pbtxt/om_test_build_geop.pbtxt +++ b/tf_adapter/tests/st/optimizers/pbtxt/om_test_build_geop.pbtxt @@ -128,6 +128,12 @@ node { s: "1" } } + attr { + key: "_ac_parallel_enable" + value { + s: "0" + } + } attr { key: "_is_tailing_optimization" value { diff --git a/tf_adapter/tests/ut/optimizers/pbtxt/om_test_build_geop.pbtxt b/tf_adapter/tests/ut/optimizers/pbtxt/om_test_build_geop.pbtxt index 3fe48905e8cc7190c2c3dc1ddbc2cb2327ebb604..3af7fa793b547a6b466dfc67e80fc918b2d0fcbb 100644 --- a/tf_adapter/tests/ut/optimizers/pbtxt/om_test_build_geop.pbtxt +++ b/tf_adapter/tests/ut/optimizers/pbtxt/om_test_build_geop.pbtxt @@ -128,6 +128,12 @@ node { s: "1" } } + attr { + key: "_ac_parallel_enable" + value { + s: "0" + } + } attr { key: "_is_tailing_optimization" value { diff --git a/tf_adapter/util/ge_plugin.cc b/tf_adapter/util/ge_plugin.cc index 75389659fbf951bb9ebc031a5aedf91695f5dd07..eeb2eefd0ba2b81dcc98c53ead0e799f07052013 100644 --- a/tf_adapter/util/ge_plugin.cc +++ b/tf_adapter/util/ge_plugin.cc @@ -76,6 +76,7 @@ void SetOptionNameMap(json &option_name_map) { option_name_map.emplace(ge::OP_COMPILER_CACHE_MODE, "op_compiler_cache_mode"); option_name_map.emplace(ge::OP_COMPILER_CACHE_DIR, "op_compiler_cache_dir"); option_name_map.emplace(ge::STREAM_MAX_PARALLEL_NUM, "stream_max_parallel_num"); + option_name_map.emplace(ge::AC_PARALLEL_ENABLE, "ac_parallel_enable"); option_name_map.emplace(ge::HCOM_PARALLEL, "hcom_parallel"); option_name_map.emplace(ge::HCOM_MULTI_MODE, "hcom_multi_mode"); option_name_map.emplace(ge::OPTION_EXEC_ENABLE_TAILING_OPTIMIZATION, "is_tailing_optimization"); diff --git a/tf_adapter/util/npu_attrs.cc b/tf_adapter/util/npu_attrs.cc index fe9f1c3b480f44b87748151fa8c304bd874879ec..72dba474beb308bb0092353b29d936ec1326589d 100644 --- a/tf_adapter/util/npu_attrs.cc +++ b/tf_adapter/util/npu_attrs.cc @@ -436,6 +436,7 @@ std::map NpuAttrs::GetSessOptions(const OpKernelConstr std::string dump_debug_mode = "all"; std::string dump_layer; std::string stream_max_parallel_num; + std::string ac_parallel_enable; std::string npuOptimizer; std::string is_tailing_optimization = "0"; std::string op_select_implmode; @@ -501,6 +502,7 @@ std::map NpuAttrs::GetSessOptions(const OpKernelConstr } } (void) ctx->GetAttr("_stream_max_parallel_num", &stream_max_parallel_num); + (void) ctx->GetAttr("_ac_parallel_enable", &ac_parallel_enable); (void) ctx->GetAttr("_is_tailing_optimization", &is_tailing_optimization); (void) ctx->GetAttr("_op_select_implmode", &op_select_implmode); (void) ctx->GetAttr("_optypelist_for_implmode", &optypelist_for_implmode); @@ -542,6 +544,7 @@ std::map NpuAttrs::GetSessOptions(const OpKernelConstr sess_options["ge.exec.variable_acc"] = 
variable_format_optimize; sess_options[ge::HCOM_PARALLEL] = hcom_parallel; sess_options[ge::STREAM_MAX_PARALLEL_NUM] = stream_max_parallel_num; + sess_options[ge::AC_PARALLEL_ENABLE] = ac_parallel_enable; if (!graph_memory_max_size.empty()) { sess_options[ge::GRAPH_MEMORY_MAX_SIZE] = graph_memory_max_size; } @@ -1098,6 +1101,7 @@ std::map NpuAttrs::GetAllAttrOptions(const AttrSlice & std::string dump_data = "tensor"; std::string dump_layer; std::string stream_max_parallel_num; + std::string ac_parallel_enable; std::string soc_config; std::string is_tailing_optimization = "0"; @@ -1187,6 +1191,7 @@ std::map NpuAttrs::GetAllAttrOptions(const AttrSlice & auto dump_layer_value = attrs.Find("_dump_layer"); auto dump_debug_mode_value = attrs.Find("_dump_debug_mode"); auto stream_max_parallel_num_value = attrs.Find("_stream_max_parallel_num"); + auto ac_parallel_enable_value = attrs.Find("_ac_parallel_enable"); auto soc_config_value = attrs.Find("_soc_config"); auto graph_slice_value = attrs.Find("_graph_slice"); @@ -1348,6 +1353,9 @@ std::map NpuAttrs::GetAllAttrOptions(const AttrSlice & if (stream_max_parallel_num_value != nullptr) { stream_max_parallel_num = stream_max_parallel_num_value->s(); } + if (ac_parallel_enable_value != nullptr) { + ac_parallel_enable = ac_parallel_enable_value->s(); + } if (graph_slice_value != nullptr) { graph_slice_mode = graph_slice_value->s(); } @@ -1544,6 +1552,7 @@ std::map NpuAttrs::GetAllAttrOptions(const AttrSlice & all_options["variable_format_optimize"] = variable_format_optimize; all_options["hcom_parallel"] = hcom_parallel; all_options["stream_max_parallel_num"] = stream_max_parallel_num; + all_options["ac_parallel_enable"] = ac_parallel_enable; if (!graph_memory_max_size.empty()) { all_options["graph_memory_max_size"] = graph_memory_max_size; } @@ -1683,6 +1692,7 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options std::string dump_data = "tensor"; std::string dump_layer; std::string stream_max_parallel_num; + std::string ac_parallel_enable; std::string soc_config; std::string hccl_timeout; std::string HCCL_algorithm; @@ -1832,6 +1842,9 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options if (params.count("stream_max_parallel_num") > 0) { stream_max_parallel_num = params.at("stream_max_parallel_num").s(); } + if (params.count("ac_parallel_enable") > 0) { + ac_parallel_enable = params.at("ac_parallel_enable").s(); + } if (params.count("is_tailing_optimization") > 0) { is_tailing_optimization = params.at("is_tailing_optimization").b(); @@ -2287,6 +2300,7 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options sess_options["graph_slice"] = graph_slice_mode; sess_options["hcom_parallel"] = std::to_string(static_cast(hcom_parallel)); sess_options["stream_max_parallel_num"] = stream_max_parallel_num; + sess_options["ac_parallel_enable"] = ac_parallel_enable; if (!graph_memory_max_size.empty()) { sess_options["graph_memory_max_size"] = graph_memory_max_size; } diff --git a/tf_adapter/util/session_manager.cc b/tf_adapter/util/session_manager.cc index a9fb0481f4da09e83bf4ef3176a8805703277eee..b5b28037731c7a1dcb391d65b5764caf428f4125 100644 --- a/tf_adapter/util/session_manager.cc +++ b/tf_adapter/util/session_manager.cc @@ -107,6 +107,8 @@ void SessionManager::PrintGeSessionOptions(std::map &s // stream max parallel num ADP_LOG(INFO) << "[GEOP] stream_max_parallel_num :" << sess_options[ge::STREAM_MAX_PARALLEL_NUM]; + // ac parallel enable + ADP_LOG(INFO) << "[GEOP] 
ac_parallel_enable :" << sess_options[ge::AC_PARALLEL_ENABLE]; // graph memory configuration if (!sess_options[ge::GRAPH_MEMORY_MAX_SIZE].empty()) { diff --git a/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp index ce930125ed1425140725d9fbf229470dba22496c..3aa311d512a96afef7d75df7d2ec12a89de14e3d 100644 --- a/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp +++ b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp @@ -73,6 +73,7 @@ const std::map kConfigurableOptions = { {"op_compiler_cache_mode", ge::OP_COMPILER_CACHE_MODE}, {"op_compiler_cache_dir", ge::OP_COMPILER_CACHE_DIR}, {"stream_max_parallel_num", ge::STREAM_MAX_PARALLEL_NUM}, + {"ac_parallel_enable", ge::AC_PARALLEL_ENABLE}, {"hcom_parallel", ge::HCOM_PARALLEL}, {"hcom_multi_mode", ge::HCOM_MULTI_MODE}, {"is_tailing_optimization", ge::OPTION_EXEC_ENABLE_TAILING_OPTIMIZATION}, diff --git a/tf_adapter_2.x/python/npu_device/configs/npu_config.py b/tf_adapter_2.x/python/npu_device/configs/npu_config.py index 2015bab1de4ee2748f6990ef9bd9f5b9d9d79e5b..bb2df2e1b09e64020a8ebab8541f014285a8ce1a 100644 --- a/tf_adapter_2.x/python/npu_device/configs/npu_config.py +++ b/tf_adapter_2.x/python/npu_device/configs/npu_config.py @@ -45,6 +45,7 @@ class NpuConfig(NpuBaseConfig): self.op_compiler_cache_mode = OptionValue('enable', ['enable', 'disable', 'force']) self.op_compiler_cache_dir = OptionValue(None, None) self.stream_max_parallel_num = OptionValue(None, None) + self.ac_parallel_enable = OptionValue(None, ['0', '1']) self.hcom_parallel = OptionValue(True, [True, False]) self.hcom_multi_mode = OptionValue(None, None) self.is_tailing_optimization = OptionValue(False, [True, False]) diff --git a/tf_adapter_2.x/tests/stub/include/stub/defines.h b/tf_adapter_2.x/tests/stub/include/stub/defines.h index 1e35a910185186b67e7e441e63aa7a0e8c2ac75f..b5fbeb0e0b42f4060c49c3a5ccaa5b2177c1eac8 100644 --- a/tf_adapter_2.x/tests/stub/include/stub/defines.h +++ b/tf_adapter_2.x/tests/stub/include/stub/defines.h @@ -168,6 +168,10 @@ const std::string GE_FE_FLAG = "ge.feFlag"; // this option is to obtain stream max parallel num const std::string STREAM_MAX_PARALLEL_NUM = "ge.streamMaxParallelNum"; +// Configure engines such as Aicpu to compute parallelly with other engines in dynamic shape graphs. +// its value should be "0" or "1", default value is "0" +const std::string AC_PARALLEL_ENABLE = "ac_parallel_enable"; + // congigure outputDatatype to setting net output type const std::string OUTPUT_DATATYPE = "ge.outputDatatype";
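
Usage note (illustrative, not part of the patch): ac_parallel_enable is carried as a string option whose accepted values are "0" (off, the default) and "1" (on), per the OptionValue(None, ['0', '1']) declaration above. A minimal sketch of how a user could set it on both adapter generations, assuming the standard NPURunConfig/NPUEstimator and npu_device entry points:

    # TF 1.x estimator flow (tf_adapter)
    from npu_bridge.estimator.npu.npu_config import NPURunConfig
    # "1" lets engines such as AICPU run in parallel with other engines in dynamic shape graphs
    run_config = NPURunConfig(ac_parallel_enable="1")

    # TF 2.x eager flow (tf_adapter_2.x)
    import npu_device
    npu_device.global_options().ac_parallel_enable = "1"
    npu_device.open().as_default()

In the TF 1.x path the value lands in custom_op.parameter_map["ac_parallel_enable"] and is read back by GetSessOptions/SetNpuOptimizerAttr; in the TF 2.x path kConfigurableOptions maps it directly. Either way it is forwarded to GE under ge::AC_PARALLEL_ENABLE ("ac_parallel_enable").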