From d942d1b8b87fa941b116b32ad387f70b5e904c25 Mon Sep 17 00:00:00 2001 From: ZhouChen Date: Wed, 30 Apr 2025 09:21:43 +0000 Subject: [PATCH 1/2] !2959 provide the switch to enable batch mem cpy Merge pull request !2959 from ZhouChen/batch_h2d --- tf_adapter/interface_spec/api_npu_config.pyh | 2 +- tf_adapter/interface_spec/api_npu_plugin.pyh | 3 +- .../npu_bridge/estimator/npu/npu_config.py | 5 ++- .../npu_bridge/estimator/npu/npu_estimator.py | 11 ++++++ .../npu_bridge/estimator/npu/npu_plugin.py | 7 +++- .../tests/st/util/testcase/ge_plugin_test.cc | 8 ++++ .../tests/st/util/testcase/npu_attrs_test.cc | 37 +++++++++++++++++++ .../tests/ut/util/testcase/ge_plugin_test.cc | 8 ++++ .../tests/ut/util/testcase/npu_attrs_test.cc | 36 ++++++++++++++++++ tf_adapter/util/ge_plugin.cc | 5 +++ tf_adapter/util/npu_attrs.cc | 27 +++++++++++++- .../npu_device/core/npu_wrapper.cpp | 1 + .../python/npu_device/configs/npu_config.py | 1 + 13 files changed, 146 insertions(+), 5 deletions(-) diff --git a/tf_adapter/interface_spec/api_npu_config.pyh b/tf_adapter/interface_spec/api_npu_config.pyh index 21be81cae..f8dbb9460 100644 --- a/tf_adapter/interface_spec/api_npu_config.pyh +++ b/tf_adapter/interface_spec/api_npu_config.pyh @@ -23,7 +23,7 @@ class NPURunConfig(run_config_lib.RunConfig): frozen_variable=False, variable_placement="Device", jit_compile="auto", precision_mode_v2=None, ac_parallel_enable=None, quant_dumpable=None, input_fusion_size=131072, compile_dynamic_mode=None, execute_times=-1, graph_max_parallel_model_num=1, export_compile_stat=1, aicore_num=None, - oo_constant_folding=True): + oo_constant_folding=True, input_batch_cpy=False): class ProfilingConfig(): def __init__(self, enable_profiling=False, profiling_options=None): diff --git a/tf_adapter/interface_spec/api_npu_plugin.pyh b/tf_adapter/interface_spec/api_npu_plugin.pyh index a65a3272e..2cd0a1608 100644 --- a/tf_adapter/interface_spec/api_npu_plugin.pyh +++ b/tf_adapter/interface_spec/api_npu_plugin.pyh @@ 
-6,6 +6,7 @@ def npu_resource_init(graph_run_mode=1, op_debug_level=0, enable_profiling=False enable_exception_dump=2, aoe_mode=None, work_path=None, op_compiler_cache_mode=None, op_compiler_cache_dir=None, debug_dir=None, hcom_multi_mode=False, distribute_config=None, aoe_config_file=None, - precision_mode_v2=None, export_compile_stat=1, aicore_num=None, oo_constant_folding=True): + precision_mode_v2=None, export_compile_stat=1, aicore_num=None, oo_constant_folding=True, + input_batch_cpy=False): def npu_resource_shutdown(): diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py index 04b63e1ab..6068bc4d5 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py @@ -119,7 +119,8 @@ class NPURunConfig(run_config_lib.RunConfig): graph_max_parallel_model_num=1, export_compile_stat=1, aicore_num=None, - oo_constant_folding=True + oo_constant_folding=True, + input_batch_cpy=False ): """ Constructs a NPUConfig. @@ -191,6 +192,7 @@ class NPURunConfig(run_config_lib.RunConfig): aicore_num: default is: ''. exits (default); 2: Generated when graph compilation complete. oo_constant_folding: The switch of constant folding, False: disable; True(default): enable. + input_batch_cpy: The switch of batch mem copy, False(default): disable; True: enable. """ # Check iterations_per_loop. 
@@ -292,6 +294,7 @@ class NPURunConfig(run_config_lib.RunConfig): self._export_compile_stat = export_compile_stat self._aicore_num = aicore_num self._oo_constant_folding = oo_constant_folding + self._input_batch_cpy = input_batch_cpy super(NPURunConfig, self).__init__( model_dir=model_dir, tf_random_seed=tf_random_seed, diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py index 737f97c03..03031bde9 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py @@ -741,6 +741,15 @@ class NPUEstimator(estimator_lib.Estimator): if config._oo_constant_folding is not None: custom_op.parameter_map["oo_constant_folding"].b = config._oo_constant_folding + def __load_input_batch_cpy(self, config, custom_op): + """Load input_batch_cpy config, and add to custom_optimizers + Args: + config: NPURunConfig. + custom_op: Customer optimizers. + """ + if config._input_batch_cpy is not None: + custom_op.parameter_map["input_batch_cpy"].b = config._input_batch_cpy + def __load_graph_optimizers(self, config): """ Change the session config and load the graph optimizers: @@ -885,6 +894,8 @@ class NPUEstimator(estimator_lib.Estimator): self.__oo_constant_folding(config, custom_op) + self.__load_input_batch_cpy(config, custom_op) + return config def __load_job_info(self, job_start_file): diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_plugin.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_plugin.py index bba68219b..97571455e 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_plugin.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_plugin.py @@ -75,7 +75,8 @@ def npu_resource_init(graph_run_mode=1, precision_mode_v2=None, export_compile_stat=1, aicore_num=None, - oo_constant_folding=True): + oo_constant_folding=True, + input_batch_cpy=False): """Initialize NPU resource""" 
util.check_nonnegative_integer(graph_run_mode, "graph_run_mode") check_graph_run_mode(graph_run_mode) @@ -128,6 +129,10 @@ def npu_resource_init(graph_run_mode=1, if oo_constant_folding is not None: util.check_bool_type(oo_constant_folding, "oo_constant_folding") init["ge.oo.constantFolding"] = "true" if oo_constant_folding is True else "false" + # input_batch_cpy + if input_batch_cpy is not None: + util.check_bool_type(input_batch_cpy, "input_batch_cpy") + init["ge.inputBatchCpy"] = "true" if input_batch_cpy is True else "false" init_options = tf_adapter.map_string_string(init) tf_adapter.PluginInit(init_options) diff --git a/tf_adapter/tests/st/util/testcase/ge_plugin_test.cc b/tf_adapter/tests/st/util/testcase/ge_plugin_test.cc index 144e0b9d8..fbb165193 100644 --- a/tf_adapter/tests/st/util/testcase/ge_plugin_test.cc +++ b/tf_adapter/tests/st/util/testcase/ge_plugin_test.cc @@ -221,5 +221,13 @@ TEST_F(GePluginTest, PluginInitTest_oo_constant_folding) { ASSERT_FALSE(GePlugin::GetInstance()->GetInitOptions().empty()); NpuClose(); } + +TEST_F(GePluginTest, PluginInitTest_input_batch_cpy) { + std::map init_options; + init_options["ge.inputBatchCpy"] = "true"; + PluginInit(init_options); + ASSERT_FALSE(GePlugin::GetInstance()->GetInitOptions().empty()); + NpuClose(); +} } } // end tensorflow diff --git a/tf_adapter/tests/st/util/testcase/npu_attrs_test.cc b/tf_adapter/tests/st/util/testcase/npu_attrs_test.cc index 032af0610..4f20143f4 100644 --- a/tf_adapter/tests/st/util/testcase/npu_attrs_test.cc +++ b/tf_adapter/tests/st/util/testcase/npu_attrs_test.cc @@ -460,5 +460,42 @@ TEST_F(NpuAttrTest, GetAllAttrOptions_oo_constant_folding) { const auto &all_options = NpuAttrs::GetAllAttrOptions(attrs); EXPECT_NE(all_options.find("oo_constant_folding"), all_options.cend()); } + +TEST_F(NpuAttrTest, SetNpuOptimizerAttr_input_batch_cpy) { + GraphOptimizationPassOptions options; + SessionOptions session_options; + 
session_options.config.mutable_graph_options()->mutable_optimizer_options()->set_do_function_inlining(true); + auto *custom_config = + session_options.config.mutable_graph_options()->mutable_rewrite_options()->add_custom_optimizers(); + custom_config->set_name("NpuOptimizer"); + options.session_options = &session_options; + + AttrValue input_batch_cpy = AttrValue(); + input_batch_cpy.set_b(true); + (*custom_config->mutable_parameter_map())["input_batch_cpy"] = input_batch_cpy; + + AttrValue jit_compile = AttrValue(); + jit_compile.set_s("2"); + (*custom_config->mutable_parameter_map())["jit_compile"] = jit_compile; + Status s = NpuAttrs::SetNpuOptimizerAttr(options, reinterpret_cast(1)); + EXPECT_EQ(s.ok(), false); +} + +TEST_F(NpuAttrTest, GetAllAttrOptions_input_batch_cpy) { + AttrValueMap attr_map; + + AttrValue npu_optimizer = AttrValue(); + npu_optimizer.set_s("NpuOptimizer"); + attr_map["_NpuOptimizer"] = npu_optimizer; + + AttrValue input_batch_cpy = AttrValue(); + input_batch_cpy.set_s("true"); + attr_map["_input_batch_cpy"] = input_batch_cpy; + + AttrSlice attrs(&attr_map); + const auto &all_options = NpuAttrs::GetAllAttrOptions(attrs); + EXPECT_NE(all_options.find("ge.inputBatchCpy"), all_options.cend()); +} + } } // end tensorflow diff --git a/tf_adapter/tests/ut/util/testcase/ge_plugin_test.cc b/tf_adapter/tests/ut/util/testcase/ge_plugin_test.cc index 2a54c6e05..fac80fc57 100644 --- a/tf_adapter/tests/ut/util/testcase/ge_plugin_test.cc +++ b/tf_adapter/tests/ut/util/testcase/ge_plugin_test.cc @@ -211,5 +211,13 @@ TEST_F(GePluginTest, PluginInitTest_oo_constant_folding) { ASSERT_FALSE(GePlugin::GetInstance()->GetInitOptions().empty()); NpuClose(); } + +TEST_F(GePluginTest, PluginInitTest_input_batch_cpy) { + std::map init_options; + init_options["ge.inputBatchCpy"] = "true"; + PluginInit(init_options); + ASSERT_FALSE(GePlugin::GetInstance()->GetInitOptions().empty()); + NpuClose(); +} } } // end tensorflow diff --git 
a/tf_adapter/tests/ut/util/testcase/npu_attrs_test.cc b/tf_adapter/tests/ut/util/testcase/npu_attrs_test.cc index 0f4001651..85ba473ef 100644 --- a/tf_adapter/tests/ut/util/testcase/npu_attrs_test.cc +++ b/tf_adapter/tests/ut/util/testcase/npu_attrs_test.cc @@ -599,5 +599,41 @@ TEST_F(NpuAttrTest, GetAllAttrOptions_oo_constant_folding) { const auto &all_options = NpuAttrs::GetAllAttrOptions(attrs); EXPECT_NE(all_options.find("oo_constant_folding"), all_options.cend()); } + +TEST_F(NpuAttrTest, SetNpuOptimizerAttr_input_batch_cpy) { + GraphOptimizationPassOptions options; + SessionOptions session_options; + session_options.config.mutable_graph_options()->mutable_optimizer_options()->set_do_function_inlining(true); + auto *custom_config = + session_options.config.mutable_graph_options()->mutable_rewrite_options()->add_custom_optimizers(); + custom_config->set_name("NpuOptimizer"); + options.session_options = &session_options; + + AttrValue input_batch_cpy = AttrValue(); + input_batch_cpy.set_b(true); + (*custom_config->mutable_parameter_map())["input_batch_cpy"] = input_batch_cpy; + + AttrValue jit_compile = AttrValue(); + jit_compile.set_s("2"); + (*custom_config->mutable_parameter_map())["jit_compile"] = jit_compile; + Status s = NpuAttrs::SetNpuOptimizerAttr(options, reinterpret_cast(1)); + EXPECT_EQ(s.ok(), false); +} + +TEST_F(NpuAttrTest, GetAllAttrOptions_input_batch_cpy) { + AttrValueMap attr_map; + + AttrValue npu_optimizer = AttrValue(); + npu_optimizer.set_s("NpuOptimizer"); + attr_map["_NpuOptimizer"] = npu_optimizer; + + AttrValue input_batch_cpy = AttrValue(); + input_batch_cpy.set_s("true"); + attr_map["_input_batch_cpy"] = input_batch_cpy; + + AttrSlice attrs(&attr_map); + const auto &all_options = NpuAttrs::GetAllAttrOptions(attrs); + EXPECT_NE(all_options.find("ge.inputBatchCpy"), all_options.cend()); +} } } // end tensorflow diff --git a/tf_adapter/util/ge_plugin.cc b/tf_adapter/util/ge_plugin.cc index 3f94a98f5..cea5f60cd 100644 --- 
a/tf_adapter/util/ge_plugin.cc +++ b/tf_adapter/util/ge_plugin.cc @@ -129,6 +129,7 @@ void SetOptionNameMap(json &option_name_map) { option_name_map.emplace(ge::OPTION_EXEC_DYNAMIC_EXECUTE_MODE, "dynamic_graph_execute_mode"); option_name_map.emplace(ge::OPTION_EXEC_DYNAMIC_INPUT, "dynamic_input"); option_name_map.emplace(ge::AICORE_NUM, "aicore_num"); + option_name_map.emplace("ge.inputBatchCpy", "input_batch_cpy"); } } // namespace @@ -305,6 +306,10 @@ void GePlugin::Init(std::map &init_options, const bool ADP_LOG(INFO) << "[GePlugin] oo_constant_folding : " << init_options["ge.oo.constantFolding"]; } + if (init_options.find("ge.inputBatchCpy") != init_options.end()) { + ADP_LOG(INFO) << "[GePlugin] input_batch_cpy : " << init_options["ge.inputBatchCpy"]; + } + bool tdt_uninit_env = false; (void) ReadBoolFromEnvVar("ASCEND_TDT_UNINIT", false, &tdt_uninit_env); if (!kIsHeterogeneous && !tdt_uninit_env) { diff --git a/tf_adapter/util/npu_attrs.cc b/tf_adapter/util/npu_attrs.cc index c6cdc0eaf..e91b5446c 100644 --- a/tf_adapter/util/npu_attrs.cc +++ b/tf_adapter/util/npu_attrs.cc @@ -492,6 +492,7 @@ std::map NpuAttrs::GetSessOptions(const OpKernelConstr std::string input_fusion_size = "131072"; std::string compile_dynamic_mode; std::string graph_max_parallel_model_num = "1"; + std::string input_batch_cpy; if (ctx != nullptr && ctx->GetAttr("_NpuOptimizer", &npuOptimizer) == Status::OK()) { (void) ctx->GetAttr("_variable_format_optimize", &variable_format_optimize); (void) ctx->GetAttr("_hcom_parallel", &hcom_parallel); @@ -568,6 +569,7 @@ std::map NpuAttrs::GetSessOptions(const OpKernelConstr sess_options["ge.jit_compile"] = jit_compile; } (void) ctx->GetAttr("_graph_compiler_cache_dir", &graph_compiler_cache_dir); + (void) ctx->GetAttr("_input_batch_cpy", &input_batch_cpy); } // session options @@ -631,6 +633,8 @@ std::map NpuAttrs::GetSessOptions(const OpKernelConstr if (!graph_compiler_cache_dir.empty()) { sess_options["ge.graph_compiler_cache_dir"] = 
graph_compiler_cache_dir; } + sess_options["ge.inputBatchCpy"] = input_batch_cpy; + sess_options["input_batch_cpy"] = input_batch_cpy; return sess_options; } @@ -696,6 +700,7 @@ std::map NpuAttrs::GetInitOptions(const OpKernelConstr std::string export_compile_stat; std::string aicore_num; std::string oo_constant_folding; + std::string input_batch_cpy; if (ctx != nullptr && ctx->GetAttr("_NpuOptimizer", &npuOptimizer) == Status::OK()) { (void) ctx->GetAttr("_precision_mode", &precision_mode); @@ -741,6 +746,7 @@ std::map NpuAttrs::GetInitOptions(const OpKernelConstr (void) ctx->GetAttr("_export_compile_stat", &export_compile_stat); (void) ctx->GetAttr("_aicore_num", &aicore_num); (void) ctx->GetAttr("_oo_constant_folding", &oo_constant_folding); + (void) ctx->GetAttr("_input_batch_cpy", &input_batch_cpy); } std::lock_guard lock(mutex_); @@ -809,6 +815,8 @@ std::map NpuAttrs::GetInitOptions(const OpKernelConstr init_options_["oo_constant_folding"] = oo_constant_folding; init_options_["ge.oo.constantFolding"] = oo_constant_folding; } + init_options_["input_batch_cpy"] = input_batch_cpy; + init_options_["ge.inputBatchCpy"] = input_batch_cpy; return init_options_; } @@ -1236,6 +1244,7 @@ std::map NpuAttrs::GetAllAttrOptions(const AttrSlice & std::string export_compile_stat; std::string aicore_num; std::string oo_constant_folding; + std::string input_batch_cpy; auto NpuOptimizer_value = attrs.Find("_NpuOptimizer"); auto enable_data_pre_proc_value = attrs.Find("_enable_data_pre_proc"); @@ -1336,6 +1345,7 @@ std::map NpuAttrs::GetAllAttrOptions(const AttrSlice & auto export_compile_stat_value = attrs.Find("_export_compile_stat"); auto aicore_num_value = attrs.Find("_aicore_num"); auto oo_constant_folding_value = attrs.Find("_oo_constant_folding"); + auto input_batch_cpy_value = attrs.Find("_input_batch_cpy"); if (NpuOptimizer_value != nullptr) { do_npu_optimizer = "1"; if (enable_data_pre_proc_value != nullptr) { @@ -1657,6 +1667,9 @@ std::map 
NpuAttrs::GetAllAttrOptions(const AttrSlice & if (oo_constant_folding_value != nullptr) { oo_constant_folding = oo_constant_folding_value->s(); } + if (input_batch_cpy_value != nullptr) { + input_batch_cpy = input_batch_cpy_value->s(); + } } all_options["variable_format_optimize"] = variable_format_optimize; @@ -1780,6 +1793,9 @@ std::map NpuAttrs::GetAllAttrOptions(const AttrSlice & all_options["oo_constant_folding"] = oo_constant_folding; all_options["ge.oo.constantFolding"] = oo_constant_folding; } + // input_batch_cpy + all_options["input_batch_cpy"] = input_batch_cpy; + all_options["ge.inputBatchCpy"] = input_batch_cpy; return all_options; } @@ -1907,7 +1923,7 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options int32_t export_compile_stat = 1; std::string aicore_num; bool oo_constant_folding = true; - + bool input_batch_cpy = false; const RewriterConfig &rewrite_options = options.session_options->config.graph_options().rewrite_options(); for (const auto &custom_optimizer : rewrite_options.custom_optimizers()) { if (custom_optimizer.name() == "NpuOptimizer") { @@ -2482,6 +2498,13 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options init_options_["aicore_num"] = aicore_num; init_options_["ge.aicoreNum"] = aicore_num; } + // input_batch_cpy + if (params.count("input_batch_cpy") > 0) { + input_batch_cpy = params.at("input_batch_cpy").b(); + const auto input_batch_cpy_str = input_batch_cpy ? 
"true" : "false"; + init_options_["input_batch_cpy"] = input_batch_cpy_str; + init_options_["ge.inputBatchCpy"] = input_batch_cpy_str; + } if (params.count("jit_compile") > 0) { const static std::vector kJitCompileList = {"true", "false", @@ -2559,6 +2582,8 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options sess_options["jit_compile"] = jit_compile; sess_options["ge.jit_compile"] = jit_compile; sess_options["input_fusion_size"] = std::to_string(input_fusion_size); + sess_options["input_batch_cpy"] = std::to_string(input_batch_cpy); + sess_options["ge.inputBatchCpy"] = std::to_string(input_batch_cpy); init_options_["profiling_mode"] = std::to_string(static_cast(profiling_mode)); init_options_[ge::OPTION_EXEC_PROFILING_MODE] = std::to_string(static_cast(profiling_mode)); diff --git a/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp index c652f2751..18fb0e09b 100644 --- a/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp +++ b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp @@ -96,6 +96,7 @@ const std::map kGlobalConfigOptions = { {"export_compile_stat", "ge.exportCompileStat"}, {"aicore_num", "ge.aicoreNum"}, {"oo_constant_folding", "ge.oo.constantFolding"}, + {"input_batch_cpy", "ge.inputBatchCpy"}, // private options {"_distribute.rank_id", ge::OPTION_EXEC_RANK_ID}, {"_distribute.rank_table", ge::OPTION_EXEC_RANK_TABLE_FILE}, diff --git a/tf_adapter_2.x/python/npu_device/configs/npu_config.py b/tf_adapter_2.x/python/npu_device/configs/npu_config.py index 7e3b7fa7b..9e448c78a 100644 --- a/tf_adapter_2.x/python/npu_device/configs/npu_config.py +++ b/tf_adapter_2.x/python/npu_device/configs/npu_config.py @@ -82,5 +82,6 @@ class NpuConfig(NpuBaseConfig): self.export_compile_stat = OptionValue(1, [0, 1, 2]) self.aicore_num = OptionValue(None, None) self.oo_constant_folding = OptionValue(True, [True, False]) + self.input_batch_cpy = OptionValue(False, [True, False]) super(NpuConfig, 
self).__init__() -- Gitee From 779334d93b10bf3d31695d339c48a644aa27c572 Mon Sep 17 00:00:00 2001 From: wangxiaotian22 Date: Tue, 6 May 2025 11:34:31 +0800 Subject: [PATCH 2/2] fix code check --- tf_adapter/util/host_queue.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tf_adapter/util/host_queue.cc b/tf_adapter/util/host_queue.cc index 7a9db2563..085f491e2 100644 --- a/tf_adapter/util/host_queue.cc +++ b/tf_adapter/util/host_queue.cc @@ -98,6 +98,7 @@ Status GetDataTypeByTensorType(acltdtTensorType tensor_type, int32_t &data_type) Status AddDataItemInfo(acltdtTensorType tdt_data_type, int32_t tensor_type, const int64_t *dims, size_t dim_size, void *data_ptr, uint64_t data_len, std::vector &items) { + CHECK_NOTNULL(dims); DataItemInfo item = {}; int32_t data_type = 0; TF_RETURN_IF_ERROR(GetDataTypeByTensorType(tdt_data_type, data_type)); -- Gitee