From b6b50da342279006b47610da4dc60724c8bbcb8b Mon Sep 17 00:00:00 2001
From: yangyongqiang5033
Date: Fri, 8 Nov 2024 15:13:16 +0800
Subject: [PATCH] add option export_compile_stat

---
 tf_adapter/interface_spec/api_npu_config.pyh  |  2 +-
 .../npu_bridge/estimator/npu/npu_config.py    |  6 ++-
 .../npu_bridge/estimator/npu/npu_estimator.py | 11 ++++++
 .../npu_bridge/estimator/npu/npu_plugin.py    |  5 ++-
 .../tests/st/util/testcase/ge_plugin_test.cc  |  8 ++++
 .../tests/st/util/testcase/npu_attrs_test.cc  | 32 ++++++++++++++++
 .../tests/ut/util/testcase/ge_plugin_test.cc  |  7 ++++
 .../tests/ut/util/testcase/npu_attrs_test.cc  | 37 +++++++++++++++++++
 tf_adapter/util/ge_plugin.cc                  |  3 ++
 tf_adapter/util/npu_attrs.cc                  | 23 ++++++++++++
 .../npu_device/core/npu_wrapper.cpp           |  1 +
 .../python/npu_device/configs/npu_config.py   |  1 +
 .../tests/stub/include/stub/defines.h         |  2 +
 13 files changed, 135 insertions(+), 3 deletions(-)

diff --git a/tf_adapter/interface_spec/api_npu_config.pyh b/tf_adapter/interface_spec/api_npu_config.pyh
index eab27fc3e..911e3df08 100644
--- a/tf_adapter/interface_spec/api_npu_config.pyh
+++ b/tf_adapter/interface_spec/api_npu_config.pyh
@@ -22,7 +22,7 @@ class NPURunConfig(run_config_lib.RunConfig):
                event_sync_timeout=-1, external_weight=False, es_cluster_config=None, deterministic=0,
                frozen_variable=False, variable_placement="Device", jit_compile="auto", precision_mode_v2=None,
                ac_parallel_enable=None, quant_dumpable=None, input_fusion_size=131072, compile_dynamic_mode=None,
-               execute_times=-1, graph_max_parallel_model_num=1):
+               execute_times=-1, graph_max_parallel_model_num=1, export_compile_stat=None):
 
 class ProfilingConfig():
   def __init__(self, enable_profiling=False, profiling_options=None):
diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py
index 9ffaf237f..ead7a73a2 100644
--- a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py
+++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py
@@ -116,7 +116,8 @@ class NPURunConfig(run_config_lib.RunConfig):
                  input_fusion_size=131072,
                  compile_dynamic_mode=None,
                  execute_times=-1,
-                 graph_max_parallel_model_num=1
+                 graph_max_parallel_model_num=1,
+                 export_compile_stat=None
                  ):
         """
         Constructs a NPUConfig.
@@ -184,6 +185,8 @@ class NPURunConfig(run_config_lib.RunConfig):
             jit_compile: Whether enable jit compile
             input_fusion_size: Merge input memory less than input_fusion_size, defualt 25600B, max size: 32M, min size: 0M
             precision_mode_v2: default is: ''.
+            export_compile_stat: Controls when the graph compiler generates its compile statistics.
+                0: not generated; 1: generated when the program exits (default); 2: generated when graph compilation completes.
         """
 
         # Check iterations_per_loop.
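A minimal TF1 usage sketch of the new option (illustrative only: my_model_fn stands in for a user-supplied model function, and actually running this requires an Ascend NPU environment with this patch applied):

    from npu_bridge.estimator.npu.npu_config import NPURunConfig
    from npu_bridge.estimator.npu.npu_estimator import NPUEstimator

    # "0": never generate compile statistics; "1" (default): generate them when
    # the process exits; "2": generate them as each graph finishes compiling.
    run_config = NPURunConfig(export_compile_stat="2")
    estimator = NPUEstimator(model_fn=my_model_fn, config=run_config)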
@@ -282,6 +285,7 @@ class NPURunConfig(run_config_lib.RunConfig):
         self._compile_dynamic_mode = compile_dynamic_mode
         self._graph_max_parallel_model_num = graph_max_parallel_model_num
         self.execute_times = execute_times
+        self._export_compile_stat = export_compile_stat
         super(NPURunConfig, self).__init__(
             model_dir=model_dir,
             tf_random_seed=tf_random_seed,
diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py
index 0ea251de4..d18410b3e 100644
--- a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py
+++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py
@@ -711,6 +711,15 @@ class NPUEstimator(estimator_lib.Estimator):
         if config.aoe_config_file is not None:
             custom_op.parameter_map["aoe_config_file"].s = tf.compat.as_bytes(config.aoe_config_file)
 
+    def __load_export_compile_stat(self, config, custom_op):
+        """Load export_compile_stat config and add it to custom_optimizers.
+        Args:
+            config: NPURunConfig.
+            custom_op: Custom optimizer.
+        """
+        if config._export_compile_stat is not None:
+            custom_op.parameter_map["export_compile_stat"].s = tf.compat.as_bytes(config._export_compile_stat)
+
     def __load_graph_optimizers(self, config):
         """
         Change the session config and load the graph optimizers:
@@ -849,6 +858,8 @@ class NPUEstimator(estimator_lib.Estimator):
         # add experimental config to custom_op
         self.__load_experimental_config(config, custom_op)
 
+        self.__load_export_compile_stat(config, custom_op)
+
         return config
 
     def __load_job_info(self, job_start_file):
diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_plugin.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_plugin.py
index 60055b205..c7770ae98 100644
--- a/tf_adapter/python/npu_bridge/estimator/npu/npu_plugin.py
+++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_plugin.py
@@ -72,7 +72,8 @@ def npu_resource_init(graph_run_mode=1,
                       hcom_multi_mode=False,
                       distribute_config=None,
                       aoe_config_file=None,
-                      precision_mode_v2=None):
+                      precision_mode_v2=None,
+                      export_compile_stat=None):
     """Initialize NPU resource"""
     util.check_nonnegative_integer(graph_run_mode, "graph_run_mode")
     check_graph_run_mode(graph_run_mode)
@@ -118,6 +119,8 @@ def npu_resource_init(graph_run_mode=1,
     hcom_multi_mode = util.convert_bool_to_int(hcom_multi_mode)
     init["ge.hcomMultiMode"] = str(hcom_multi_mode)
     init["ge.aoe_config_file"] = str(aoe_config_file)
+    if export_compile_stat is not None:
+        init["ge.exportCompileStat"] = str(export_compile_stat)
     init_options = tf_adapter.map_string_string(init)
     tf_adapter.PluginInit(init_options)
 
diff --git a/tf_adapter/tests/st/util/testcase/ge_plugin_test.cc b/tf_adapter/tests/st/util/testcase/ge_plugin_test.cc
index c1828be93..aa1c7355d 100644
--- a/tf_adapter/tests/st/util/testcase/ge_plugin_test.cc
+++ b/tf_adapter/tests/st/util/testcase/ge_plugin_test.cc
@@ -193,5 +193,13 @@ TEST_F(GePluginTest, RdmaInitAndRegisterOKTest) {
   int32_t ret = RdmaInitAndRegister(var_info, size);
   EXPECT_EQ(ret, 0);
 }
+
+TEST_F(GePluginTest, PluginInitTest_export_compile_stat) {
+  std::map<std::string, std::string> init_options;
+  init_options["ge.exportCompileStat"] = "1";
+  PluginInit(init_options);
+  ASSERT_FALSE(GePlugin::GetInstance()->GetInitOptions().empty());
+  NpuClose();
+}
 }
 }  // end tensorflow
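A corresponding sketch for the session-resource path. The forwarding of the argument to init["ge.exportCompileStat"] is exactly the npu_plugin.py hunk above; the npu_resource_init/npu_close pairing is assumed from the module's existing init/teardown API, and an Ascend device is required to run it:

    from npu_bridge.estimator.npu.npu_plugin import npu_resource_init, npu_close

    # Non-None values are stringified and passed to GE as "ge.exportCompileStat".
    npu_resource_init(graph_run_mode=1, export_compile_stat=1)
    # ... build and run sessions ...
    npu_close()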
diff --git a/tf_adapter/tests/st/util/testcase/npu_attrs_test.cc b/tf_adapter/tests/st/util/testcase/npu_attrs_test.cc
index 1bd8d66d4..435d81602 100644
--- a/tf_adapter/tests/st/util/testcase/npu_attrs_test.cc
+++ b/tf_adapter/tests/st/util/testcase/npu_attrs_test.cc
@@ -339,5 +339,37 @@ TEST_F(NpuAttrTest, GetNpuOptimizerAttrCheckDumpStep) {
   const auto &all_options2 = NpuAttrs::GetAllAttrOptions(attrs2);
   EXPECT_NE(all_options2.find("dump_step"), all_options2.cend());
 }
+
+TEST_F(NpuAttrTest, SetNpuOptimizerAttr_export_compile_stat) {
+  GraphOptimizationPassOptions options;
+  SessionOptions session_options;
+  session_options.config.mutable_graph_options()->mutable_optimizer_options()->set_do_function_inlining(true);
+  auto *custom_config =
+      session_options.config.mutable_graph_options()->mutable_rewrite_options()->add_custom_optimizers();
+  custom_config->set_name("NpuOptimizer");
+  options.session_options = &session_options;
+
+  AttrValue export_compile_stat = AttrValue();
+  export_compile_stat.set_s("3");  // outside the allowed {"0", "1", "2"}, so the pass must reject it
+  (*custom_config->mutable_parameter_map())["export_compile_stat"] = export_compile_stat;
+  Status s = NpuAttrs::SetNpuOptimizerAttr(options, reinterpret_cast<Node *>(1));
+  EXPECT_EQ(s.ok(), false);
+}
+
+TEST_F(NpuAttrTest, GetAllAttrOptions_export_compile_stat) {
+  AttrValueMap attr_map;
+
+  AttrValue npu_optimizer = AttrValue();
+  npu_optimizer.set_s("NpuOptimizer");
+  attr_map["_NpuOptimizer"] = npu_optimizer;
+
+  AttrValue export_compile_stat = AttrValue();
+  export_compile_stat.set_s("0");
+  attr_map["_export_compile_stat"] = export_compile_stat;
+
+  AttrSlice attrs(&attr_map);
+  const auto &all_options = NpuAttrs::GetAllAttrOptions(attrs);
+  EXPECT_NE(all_options.find("export_compile_stat"), all_options.cend());
+}
 }
 }  // end tensorflow
diff --git a/tf_adapter/tests/ut/util/testcase/ge_plugin_test.cc b/tf_adapter/tests/ut/util/testcase/ge_plugin_test.cc
index 4f41877d6..7bc331fde 100644
--- a/tf_adapter/tests/ut/util/testcase/ge_plugin_test.cc
+++ b/tf_adapter/tests/ut/util/testcase/ge_plugin_test.cc
@@ -187,5 +187,12 @@ TEST_F(GePluginTest, RdmaInitAndRegisterOKTest) {
   EXPECT_EQ(ret, 0);
 }
 
+TEST_F(GePluginTest, PluginInitTest_export_compile_stat) {
+  std::map<std::string, std::string> init_options;
+  init_options["ge.exportCompileStat"] = "1";
+  PluginInit(init_options);
+  ASSERT_FALSE(GePlugin::GetInstance()->GetInitOptions().empty());
+  NpuClose();
+}
 }
 }  // end tensorflow
diff --git a/tf_adapter/tests/ut/util/testcase/npu_attrs_test.cc b/tf_adapter/tests/ut/util/testcase/npu_attrs_test.cc
index 0ab666c7d..506936122 100644
--- a/tf_adapter/tests/ut/util/testcase/npu_attrs_test.cc
+++ b/tf_adapter/tests/ut/util/testcase/npu_attrs_test.cc
@@ -474,5 +474,42 @@ TEST_F(NpuAttrTest, GetNpuOptimizerAttrCheckDumpStep) {
   const auto &all_options2 = NpuAttrs::GetAllAttrOptions(attrs2);
   ASSERT_TRUE(all_options2.find("dump_step") != all_options2.cend());
 }
+
+TEST_F(NpuAttrTest, SetNpuOptimizerAttr_export_compile_stat) {
+  GraphOptimizationPassOptions options;
+  SessionOptions session_options;
+  session_options.config.mutable_graph_options()->mutable_optimizer_options()->set_do_function_inlining(true);
+  auto *custom_config =
+      session_options.config.mutable_graph_options()->mutable_rewrite_options()->add_custom_optimizers();
+  custom_config->set_name("NpuOptimizer");
+  options.session_options = &session_options;
+
+  AttrValue export_compile_stat = AttrValue();
+  export_compile_stat.set_s("0");
+  (*custom_config->mutable_parameter_map())["export_compile_stat"] = export_compile_stat;
+
+  AttrValue jit_compile = AttrValue();
+  jit_compile.set_s("2");  // invalid jit_compile value, so the pass is expected to fail
+  (*custom_config->mutable_parameter_map())["jit_compile"] = jit_compile;
+
+  Status s = NpuAttrs::SetNpuOptimizerAttr(options, reinterpret_cast<Node *>(1));
+  EXPECT_EQ(s.ok(), false);
+}
+
+TEST_F(NpuAttrTest, GetAllAttrOptions_export_compile_stat) {
+  AttrValueMap attr_map;
+
+  AttrValue npu_optimizer = AttrValue();
+  npu_optimizer.set_s("NpuOptimizer");
+  attr_map["_NpuOptimizer"] = npu_optimizer;
+
+  AttrValue export_compile_stat = AttrValue();
+  export_compile_stat.set_s("0");
+  attr_map["_export_compile_stat"] = export_compile_stat;
+
+  AttrSlice attrs(&attr_map);
+  const auto &all_options = NpuAttrs::GetAllAttrOptions(attrs);
+  EXPECT_NE(all_options.find("export_compile_stat"), all_options.cend());
+}
 }
 }  // end tensorflow
diff --git a/tf_adapter/util/ge_plugin.cc b/tf_adapter/util/ge_plugin.cc
index d5dcfc4fc..85d4c3e64 100644
--- a/tf_adapter/util/ge_plugin.cc
+++ b/tf_adapter/util/ge_plugin.cc
@@ -292,6 +292,9 @@ void GePlugin::Init(std::map<std::string, std::string> &init_options, const bool
   ADP_LOG(INFO) << "[GePlugin] optypelist_for_implmode :" << init_options[ge::OPTYPELIST_FOR_IMPLMODE];
 
+  if (init_options.find("ge.exportCompileStat") != init_options.end()) {
+    ADP_LOG(INFO) << "[GePlugin] export_compile_stat : " << init_options["ge.exportCompileStat"];
+  }
   bool tdt_uninit_env = false;
   (void) ReadBoolFromEnvVar("ASCEND_TDT_UNINIT", false, &tdt_uninit_env);
   if (!kIsHeterogeneous && !tdt_uninit_env) {
diff --git a/tf_adapter/util/npu_attrs.cc b/tf_adapter/util/npu_attrs.cc
index 1d619cd59..3fd27a4de 100644
--- a/tf_adapter/util/npu_attrs.cc
+++ b/tf_adapter/util/npu_attrs.cc
@@ -688,6 +688,7 @@ std::map<std::string, std::string> NpuAttrs::GetInitOptions(const OpKernelConstr
   std::string event_sync_timeout = "-1";
   std::string es_cluster_config;
   std::string execute_times = "-1";
+  std::string export_compile_stat;
 
   if (ctx != nullptr && ctx->GetAttr("_NpuOptimizer", &npuOptimizer) == Status::OK()) {
     (void) ctx->GetAttr("_precision_mode", &precision_mode);
@@ -729,6 +730,7 @@ std::map<std::string, std::string> NpuAttrs::GetInitOptions(const OpKernelConstr
     (void) ctx->GetAttr("_event_sync_timeout", &event_sync_timeout);
     (void) ctx->GetAttr("_es_cluster_config", &es_cluster_config);
     (void) ctx->GetAttr("_execute_times", &execute_times);
+    (void) ctx->GetAttr("_export_compile_stat", &export_compile_stat);
   }
 
   std::lock_guard<std::mutex> lock(mutex_);
@@ -783,6 +785,10 @@ std::map<std::string, std::string> NpuAttrs::GetInitOptions(const OpKernelConstr
   init_options_["ge.esClusterConfig"] = es_cluster_config;
   init_options_["execute_times"] = execute_times;
   init_options_["ge.executeTimes"] = execute_times;
+  if (!export_compile_stat.empty()) {
+    init_options_["export_compile_stat"] = export_compile_stat;
+    init_options_["ge.exportCompileStat"] = export_compile_stat;
+  }
   return init_options_;
 }
 
@@ -1206,6 +1212,7 @@ std::map<std::string, std::string> NpuAttrs::GetAllAttrOptions(const AttrSlice &
   std::string input_fusion_size;
   std::string compile_dynamic_mode;
   std::string execute_times = "-1";
+  std::string export_compile_stat;
 
   auto NpuOptimizer_value = attrs.Find("_NpuOptimizer");
   auto enable_data_pre_proc_value = attrs.Find("_enable_data_pre_proc");
@@ -1302,6 +1309,7 @@ std::map<std::string, std::string> NpuAttrs::GetAllAttrOptions(const AttrSlice &
   auto input_fusion_size_value = attrs.Find("_input_fusion_size");
   auto compile_dynamic_mode_value = attrs.Find("_compile_dynamic_mode");
   auto execute_times_value = attrs.Find("_execute_times");
+  auto export_compile_stat_value = attrs.Find("_export_compile_stat");
   if (NpuOptimizer_value != nullptr) {
     do_npu_optimizer = "1";
     if (enable_data_pre_proc_value != nullptr) {
@@ -1611,6 +1619,9 @@ std::map<std::string, std::string> NpuAttrs::GetAllAttrOptions(const AttrSlice &
     if (execute_times_value != nullptr) {
       execute_times = execute_times_value->s();
     }
+    if (export_compile_stat_value != nullptr) {
+      export_compile_stat = export_compile_stat_value->s();
+    }
   }
 
   all_options["variable_format_optimize"] = variable_format_optimize;
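For orientation while reviewing the attrs plumbing: on the TF1 side the option travels in the NpuOptimizer custom-optimizer parameter map, which a session config can also populate by hand. A minimal sketch using the standard TF1 proto API (only the export_compile_stat key is specific to this patch; its value is validated in SetNpuOptimizerAttr, shown in the hunk below):

    import tensorflow as tf

    config = tf.ConfigProto()
    custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
    custom_op.name = "NpuOptimizer"
    # SetNpuOptimizerAttr accepts only "0", "1" or "2"; anything else fails the pass.
    custom_op.parameter_map["export_compile_stat"].s = tf.compat.as_bytes("1")
    # sess = tf.Session(config=config)  # requires the NPU TF1 runtime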
@@ -1723,6 +1734,10 @@ std::map<std::string, std::string> NpuAttrs::GetAllAttrOptions(const AttrSlice &
   all_options["compile_dynamic_mode"] = compile_dynamic_mode;
   all_options["execute_times"] = execute_times;
   all_options["ge.executeTimes"] = execute_times;
+  if (!export_compile_stat.empty()) {
+    all_options["export_compile_stat"] = export_compile_stat;
+    all_options["ge.exportCompileStat"] = export_compile_stat;
+  }
   return all_options;
 }
 
@@ -1846,6 +1861,7 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options
   int64_t input_fusion_size = 131072L;  // default 128KB
   std::string accelerate_train_mode;
   int32_t execute_times = -1;
+  std::string export_compile_stat;
 
   const RewriterConfig &rewrite_options = options.session_options->config.graph_options().rewrite_options();
   for (const auto &custom_optimizer : rewrite_options.custom_optimizers()) {
@@ -2394,6 +2410,13 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options
       if (params.count("variable_placement") > 0) {
         variable_location = params.at("variable_placement").s();
       }
+      if (params.count("export_compile_stat") > 0) {
+        export_compile_stat = params.at("export_compile_stat").s();
+        const static std::vector<std::string> kExportCompileStatList = {"0", "1", "2"};
+        NPU_REQUIRES_OK(CheckValueAllowed(export_compile_stat, kExportCompileStatList));
+        init_options_["export_compile_stat"] = export_compile_stat;
+        init_options_["ge.exportCompileStat"] = export_compile_stat;
+      }
       if (params.count("jit_compile") > 0) {
         const static std::vector<std::string> kJitCompileList = {"true",
                                                                  "false",
diff --git a/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp
index c22fa20bc..73dec5535 100644
--- a/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp
+++ b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp
@@ -93,6 +93,7 @@ const std::map<std::string, std::string> kGlobalConfigOptions = {
   {"stream_sync_timeout", "stream_sync_timeout"},
   {"event_sync_timeout", "event_sync_timeout"},
   {"execute_times", "execute_times"},
+  {"export_compile_stat", "ge.exportCompileStat"},
   // private options
   {"_distribute.rank_id", ge::OPTION_EXEC_RANK_ID},
   {"_distribute.rank_table", ge::OPTION_EXEC_RANK_TABLE_FILE},
diff --git a/tf_adapter_2.x/python/npu_device/configs/npu_config.py b/tf_adapter_2.x/python/npu_device/configs/npu_config.py
index 1371ec002..92dc28d58 100644
--- a/tf_adapter_2.x/python/npu_device/configs/npu_config.py
+++ b/tf_adapter_2.x/python/npu_device/configs/npu_config.py
@@ -79,5 +79,6 @@ class NpuConfig(NpuBaseConfig):
         self.precision_mode_v2 = OptionValue(None, ['fp16', 'origin', 'cube_fp16in_fp32out',
                                                     'mixed_float16', 'mixed_bfloat16'])
+        self.export_compile_stat = OptionValue(None, ['0', '1', '2'])
 
         super(NpuConfig, self).__init__()
diff --git a/tf_adapter_2.x/tests/stub/include/stub/defines.h b/tf_adapter_2.x/tests/stub/include/stub/defines.h
index 047e4295e..2cbb5507f 100644
--- a/tf_adapter_2.x/tests/stub/include/stub/defines.h
+++ b/tf_adapter_2.x/tests/stub/include/stub/defines.h
@@ -308,6 +308,8 @@ const std::string MODIFY_MIXLIST = "ge.exec.modify_mixlist";
 const std::string OP_PRECISION_MODE = "ge.exec.op_precision_mode";
 
+const char *const OPTION_EXPORT_COMPILE_STAT = "ge.exportCompileStat";
+
 // Graph run mode
 enum GraphRunMode { PREDICTION = 0, TRAIN };
 // Topo sorting mode
--
Gitee
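Finally, a TF2 usage sketch for the tf_adapter_2.x path. This assumes the adapter's usual npu_device.global_options()/npu_device.open() workflow and an Ascend environment with this patch installed:

    import npu_device

    npu_config = npu_device.global_options()
    # OptionValue restricts the value to '0', '1' or '2' (see npu_config.py above);
    # npu_wrapper.cpp maps the key onto the GE option "ge.exportCompileStat".
    npu_config.export_compile_stat = '2'
    npu_device.open().as_default()  # from here on, graphs are placed on the NPU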