diff --git a/tf_adapter/interface_spec/api_npu_config.pyh b/tf_adapter/interface_spec/api_npu_config.pyh
index 3d36a2245917975cbbf36ea6e05c3a8c427e2f18..4bb2268e23d97b02b9d9673debd40382ab8dd0fe 100644
--- a/tf_adapter/interface_spec/api_npu_config.pyh
+++ b/tf_adapter/interface_spec/api_npu_config.pyh
@@ -23,7 +23,8 @@ class NPURunConfig(run_config_lib.RunConfig):
                  frozen_variable=False, variable_placement="Device", jit_compile="auto", precision_mode_v2=None,
                  ac_parallel_enable=None, quant_dumpable=None, input_fusion_size=131072, compile_dynamic_mode=None,
                  graph_max_parallel_model_num=1, export_compile_stat=1, aicore_num=None,
-                 oo_constant_folding=True, input_batch_cpy=False, shape_generalization_mode="STRICT", all_tensor_not_empty=False):
+                 oo_constant_folding=True, input_batch_cpy=False, shape_generalization_mode="STRICT", all_tensor_not_empty=False,
+                 auto_multistream_parallel_mode=None):

 class ProfilingConfig():
     def __init__(self, enable_profiling=False, profiling_options=None):
diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py
index a38214b557b8e8e8d3c91e480729456ef3757b0f..4523153abe5a8ca45cf8e24149dac28510b2ae03 100644
--- a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py
+++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py
@@ -120,7 +120,8 @@ class NPURunConfig(run_config_lib.RunConfig):
                  oo_constant_folding=True,
                  input_batch_cpy=False,
                  shape_generalization_mode="STRICT",
-                 all_tensor_not_empty=False
+                 all_tensor_not_empty=False,
+                 auto_multistream_parallel_mode=None
                  ):
         """
         Constructs a NPUConfig.
@@ -197,6 +198,7 @@ class NPURunConfig(run_config_lib.RunConfig):
             FULL: full generalization;
             ADAPTIVE: generalizes the varying axes.
         all_tensor_not_empty: default is: False.
+        auto_multistream_parallel_mode: default is None; cv: cube vector parallel.
         """

         # Check iterations_per_loop.
@@ -299,6 +301,7 @@ class NPURunConfig(run_config_lib.RunConfig):
         self._input_batch_cpy = input_batch_cpy
         self._shape_generalization_mode = shape_generalization_mode
         self._all_tensor_not_empty = all_tensor_not_empty
+        self._auto_multistream_parallel_mode = auto_multistream_parallel_mode
         super(NPURunConfig, self).__init__(
             model_dir=model_dir,
             tf_random_seed=tf_random_seed,
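The two Python-side hunks above expose the new `auto_multistream_parallel_mode` argument on the TF1 `NPURunConfig` (default `None`; `"cv"` selects cube/vector parallelism). A minimal usage sketch under those assumptions; the `model_dir` path is illustrative only, everything else keeps its default:

```python
from npu_bridge.estimator.npu.npu_config import NPURunConfig

# Sketch: enable cube/vector multistream parallelism via the new option.
# All other NPURunConfig arguments keep their defaults.
run_config = NPURunConfig(
    model_dir="/tmp/npu_model",            # illustrative path
    auto_multistream_parallel_mode="cv",   # new in this patch; None leaves it off
)
```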
diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py
index 52763a15a213fbad5ccbc485e17c6d92ecb5e0b6..5b8f23af227751370b096ad334c4e797c4883c79 100644
--- a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py
+++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py
@@ -853,6 +853,9 @@ class NPUEstimator(estimator_lib.Estimator):
                 config._shape_generalization_mode)
         if config._all_tensor_not_empty is not None:
             custom_op.parameter_map["all_tensor_not_empty"].b = config._all_tensor_not_empty
+        if config._auto_multistream_parallel_mode is not None:
+            custom_op.parameter_map["auto_multistream_parallel_mode"].s = tf.compat.as_bytes(
+                config._auto_multistream_parallel_mode)

         self.__load_session_device_id(config, custom_op)
         self.__load_modify_mixlist(config, custom_op)
diff --git a/tf_adapter/tests/st/util/testcase/npu_attrs_test.cc b/tf_adapter/tests/st/util/testcase/npu_attrs_test.cc
index fd47759aeaf99525953a9a372c7129f819c08c5a..9c60554e78f76930e6631e8e93ca162a3260ee4c 100644
--- a/tf_adapter/tests/st/util/testcase/npu_attrs_test.cc
+++ b/tf_adapter/tests/st/util/testcase/npu_attrs_test.cc
@@ -533,5 +533,41 @@ TEST_F(NpuAttrTest, GetAllAttrOptions_all_tensor_not_empty) {
   EXPECT_NE(all_options.find("ge.exec.allTensorNotEmpty"), all_options.cend());
 }

+TEST_F(NpuAttrTest, SetNpuOptimizerAttr_auto_multistream_parallel_mode) {
+  GraphOptimizationPassOptions options;
+  SessionOptions session_options;
+  session_options.config.mutable_graph_options()->mutable_optimizer_options()->set_do_function_inlining(true);
+  auto *custom_config =
+      session_options.config.mutable_graph_options()->mutable_rewrite_options()->add_custom_optimizers();
+  custom_config->set_name("NpuOptimizer");
+  options.session_options = &session_options;
+
+  AttrValue auto_multistream_parallel_mode = AttrValue();
+  auto_multistream_parallel_mode.set_s("cv");
+  (*custom_config->mutable_parameter_map())["auto_multistream_parallel_mode"] = auto_multistream_parallel_mode;
+
+  AttrValue jit_compile = AttrValue();
+  jit_compile.set_s("2");
+  (*custom_config->mutable_parameter_map())["jit_compile"] = jit_compile;
+  Status s = NpuAttrs::SetNpuOptimizerAttr(options, reinterpret_cast<Node *>(1));
+  EXPECT_EQ(s.ok(), false);
+}
+
+TEST_F(NpuAttrTest, GetAllAttrOptions_auto_multistream_parallel_mode) {
+  AttrValueMap attr_map;
+
+  AttrValue npu_optimizer = AttrValue();
+  npu_optimizer.set_s("NpuOptimizer");
+  attr_map["_NpuOptimizer"] = npu_optimizer;
+
+  AttrValue auto_multistream_parallel_mode = AttrValue();
+  auto_multistream_parallel_mode.set_s("cv");
+  attr_map["_auto_multistream_parallel_mode"] = auto_multistream_parallel_mode;
+
+  AttrSlice attrs(&attr_map);
+  const auto &all_options = NpuAttrs::GetAllAttrOptions(attrs);
+  EXPECT_NE(all_options.find("ge.autoMultistreamParallelMode"), all_options.cend());
+}
+
 }
 }  // end tensorflow
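As the estimator hunk shows, `NPUEstimator` writes the option into the `NpuOptimizer` `parameter_map` only when it is not `None`, and the ST test builds the same structure by hand. For a plain TF1 session the equivalent wiring looks roughly like this (a sketch mirroring the test setup above, not an excerpt from the adapter docs):

```python
import tensorflow as tf

config = tf.ConfigProto()
custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
custom_op.name = "NpuOptimizer"
# String-valued option, serialized exactly as NPUEstimator does it.
custom_op.parameter_map["auto_multistream_parallel_mode"].s = tf.compat.as_bytes("cv")

with tf.Session(config=config) as sess:
    pass  # build and run the graph as usual
```

Note that the negative test pairs the new option with an invalid `jit_compile="2"`, so `SetNpuOptimizerAttr` is expected to fail; the positive coverage comes from the `GetAllAttrOptions` case.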
diff --git a/tf_adapter/tests/ut/util/testcase/npu_attrs_test.cc b/tf_adapter/tests/ut/util/testcase/npu_attrs_test.cc
index 006141158805f89fe9b63d320456f2237ca52830..8ecc3801df1d7805bc6e0e7838726b38a408ce19 100644
--- a/tf_adapter/tests/ut/util/testcase/npu_attrs_test.cc
+++ b/tf_adapter/tests/ut/util/testcase/npu_attrs_test.cc
@@ -671,5 +671,41 @@ TEST_F(NpuAttrTest, GetAllAttrOptions_all_tensor_not_empty) {
   const auto &all_options = NpuAttrs::GetAllAttrOptions(attrs);
   EXPECT_NE(all_options.find("ge.exec.allTensorNotEmpty"), all_options.cend());
 }
+
+TEST_F(NpuAttrTest, SetNpuOptimizerAttr_auto_multistream_parallel_mode) {
+  GraphOptimizationPassOptions options;
+  SessionOptions session_options;
+  session_options.config.mutable_graph_options()->mutable_optimizer_options()->set_do_function_inlining(true);
+  auto *custom_config =
+      session_options.config.mutable_graph_options()->mutable_rewrite_options()->add_custom_optimizers();
+  custom_config->set_name("NpuOptimizer");
+  options.session_options = &session_options;
+
+  AttrValue auto_multistream_parallel_mode = AttrValue();
+  auto_multistream_parallel_mode.set_s("cv");
+  (*custom_config->mutable_parameter_map())["auto_multistream_parallel_mode"] = auto_multistream_parallel_mode;
+
+  AttrValue jit_compile = AttrValue();
+  jit_compile.set_s("2");
+  (*custom_config->mutable_parameter_map())["jit_compile"] = jit_compile;
+  Status s = NpuAttrs::SetNpuOptimizerAttr(options, reinterpret_cast<Node *>(1));
+  EXPECT_EQ(s.ok(), false);
+}
+
+TEST_F(NpuAttrTest, GetAllAttrOptions_auto_multistream_parallel_mode) {
+  AttrValueMap attr_map;
+
+  AttrValue npu_optimizer = AttrValue();
+  npu_optimizer.set_s("NpuOptimizer");
+  attr_map["_NpuOptimizer"] = npu_optimizer;
+
+  AttrValue auto_multistream_parallel_mode = AttrValue();
+  auto_multistream_parallel_mode.set_s("cv");
+  attr_map["_auto_multistream_parallel_mode"] = auto_multistream_parallel_mode;
+
+  AttrSlice attrs(&attr_map);
+  const auto &all_options = NpuAttrs::GetAllAttrOptions(attrs);
+  EXPECT_NE(all_options.find("ge.autoMultistreamParallelMode"), all_options.cend());
+}
 }
 }  // end tensorflow
diff --git a/tf_adapter/util/ge_plugin.cc b/tf_adapter/util/ge_plugin.cc
index 62ced776af7d686be43f69242377ab2ac945148c..2ae110fd7a70a701f21fa2cc7446b757ad6caed9 100644
--- a/tf_adapter/util/ge_plugin.cc
+++ b/tf_adapter/util/ge_plugin.cc
@@ -130,6 +130,7 @@ void SetOptionNameMap(json &option_name_map) {
   option_name_map.emplace(ge::AICORE_NUM, "aicore_num");
   option_name_map.emplace("ge.inputBatchCpy", "input_batch_cpy");
   option_name_map.emplace(ge::OPTION_ALL_TENSOR_NOT_EMPTY, "all_tensor_not_empty");
+  option_name_map.emplace("ge.autoMultistreamParallelMode", "auto_multistream_parallel_mode");
 }
 }  // namespace
diff --git a/tf_adapter/util/npu_attrs.cc b/tf_adapter/util/npu_attrs.cc
index bfbb0063a8d00991c4810722efba9f25b7818346..7a2942ff59543a3c3b775369ecfd991ad53ffee4 100644
--- a/tf_adapter/util/npu_attrs.cc
+++ b/tf_adapter/util/npu_attrs.cc
@@ -508,6 +508,7 @@ std::map<std::string, std::string> NpuAttrs::GetSessOptions(const OpKernelConstr
   std::string jit_compile;
   std::string aicore_num;
   std::string all_tensor_not_empty;
+  std::string auto_multistream_parallel_mode;
   if (ctx != nullptr && ctx->GetAttr("_NpuOptimizer", &npuOptimizer) == Status::OK()) {
     (void) ctx->GetAttr("_variable_format_optimize", &variable_format_optimize);
     (void) ctx->GetAttr("_hcom_parallel", &hcom_parallel);
@@ -584,6 +585,7 @@ std::map<std::string, std::string> NpuAttrs::GetSessOptions(const OpKernelConstr
     (void) ctx->GetAttr("_input_batch_cpy", &input_batch_cpy);
     (void) ctx->GetAttr("_aicore_num", &aicore_num);
     (void) ctx->GetAttr("_all_tensor_not_empty", &all_tensor_not_empty);
+    (void) ctx->GetAttr("_auto_multistream_parallel_mode", &auto_multistream_parallel_mode);
   }

   // session options
@@ -654,6 +656,8 @@ std::map<std::string, std::string> NpuAttrs::GetSessOptions(const OpKernelConstr
   sess_options["input_batch_cpy"] = input_batch_cpy;
   sess_options[ge::OPTION_ALL_TENSOR_NOT_EMPTY] = all_tensor_not_empty;
   sess_options["all_tensor_not_empty"] = all_tensor_not_empty;
+  sess_options["auto_multistream_parallel_mode"] = auto_multistream_parallel_mode;
+  sess_options["ge.autoMultistreamParallelMode"] = auto_multistream_parallel_mode;
   SetForbiddenClosePassOn(sess_options);
   sess_options["aicore_num"] = aicore_num;
   sess_options["ge.aicoreNum"] = aicore_num;
@@ -1268,6 +1272,7 @@ std::map<std::string, std::string> NpuAttrs::GetAllAttrOptions(const AttrSlice &
   std::string input_batch_cpy;
   std::string shape_generalization_mode = "STRICT";
   std::string all_tensor_not_empty;
+  std::string auto_multistream_parallel_mode;

   auto NpuOptimizer_value = attrs.Find("_NpuOptimizer");
   auto enable_data_pre_proc_value = attrs.Find("_enable_data_pre_proc");
@@ -1369,6 +1374,7 @@ std::map<std::string, std::string> NpuAttrs::GetAllAttrOptions(const AttrSlice &
   auto input_batch_cpy_value = attrs.Find("_input_batch_cpy");
   auto shape_generalization_mode_value = attrs.Find("_shape_generalization_mode");
   auto all_tensor_not_empty_value = attrs.Find("_all_tensor_not_empty");
+  auto auto_multistream_parallel_mode_value = attrs.Find("_auto_multistream_parallel_mode");
   if (NpuOptimizer_value != nullptr) {
     do_npu_optimizer = "1";
     if (enable_data_pre_proc_value != nullptr) {
@@ -1692,6 +1698,9 @@ std::map<std::string, std::string> NpuAttrs::GetAllAttrOptions(const AttrSlice &
     if (all_tensor_not_empty_value != nullptr) {
       all_tensor_not_empty = all_tensor_not_empty_value->s();
     }
+    if (auto_multistream_parallel_mode_value != nullptr) {
+      auto_multistream_parallel_mode = auto_multistream_parallel_mode_value->s();
+    }
   }

   all_options["variable_format_optimize"] = variable_format_optimize;
@@ -1809,6 +1818,8 @@ std::map<std::string, std::string> NpuAttrs::GetAllAttrOptions(const AttrSlice &
   all_options["ge.aicoreNum"] = aicore_num;
   all_options[ge::OPTION_ALL_TENSOR_NOT_EMPTY] = all_tensor_not_empty;
   all_options["all_tensor_not_empty"] = all_tensor_not_empty;
+  all_options["auto_multistream_parallel_mode"] = auto_multistream_parallel_mode;
+  all_options["ge.autoMultistreamParallelMode"] = auto_multistream_parallel_mode;
   if (!oo_constant_folding.empty()) {
     all_options["oo_constant_folding"] = oo_constant_folding;
     all_options["ge.oo.constantFolding"] = oo_constant_folding;
@@ -1946,6 +1957,7 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options
   bool input_batch_cpy = false;
   std::string shape_generalization_mode = "STRICT";
   bool all_tensor_not_empty = false;
+  std::string auto_multistream_parallel_mode;
   const RewriterConfig &rewrite_options = options.session_options->config.graph_options().rewrite_options();
   for (const auto &custom_optimizer : rewrite_options.custom_optimizers()) {
     if (custom_optimizer.name() == "NpuOptimizer") {
@@ -2517,6 +2529,9 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options
       if (params.count("all_tensor_not_empty") > 0) {
         all_tensor_not_empty = params.at("all_tensor_not_empty").b();
       }
+      if (params.count("auto_multistream_parallel_mode") > 0) {
+        auto_multistream_parallel_mode = params.at("auto_multistream_parallel_mode").s();
+      }
       // input_batch_cpy
       if (params.count("input_batch_cpy") > 0) {
         input_batch_cpy = params.at("input_batch_cpy").b();
@@ -2618,6 +2633,8 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options
   graph_options["input_shape"] = input_shape;
   graph_options["dynamic_dims"] = dynamic_dims;
   graph_options["dynamic_node_type"] = std::to_string(dynamic_node_type);
+  sess_options["auto_multistream_parallel_mode"] = auto_multistream_parallel_mode;
+  sess_options["ge.autoMultistreamParallelMode"] = auto_multistream_parallel_mode;
   init_options_["profiling_mode"] = std::to_string(static_cast<int>(profiling_mode));
   init_options_[ge::OPTION_EXEC_PROFILING_MODE] = std::to_string(static_cast<int>(profiling_mode));
   init_options_["profiling_options"] = profiling_options;
diff --git a/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp
index 553569f0dd134abeabd3b50d26bb2bce77d94e8f..2671a05795a60610450629d031485f5d64104d9f 100644
--- a/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp
+++ b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp
@@ -151,7 +151,8 @@ const std::map<std::string, std::string> kSessionConfigOptions = {
   {"graph_slice", "ge.graphSliceMode"},
   {"input_fusion_size", "ge.exec.input_fusion_size"},
   {"compile_dynamic_mode", "ge.compile_dynamic_mode"},
-  {"all_tensor_not_empty", ge::OPTION_ALL_TENSOR_NOT_EMPTY}
+  {"all_tensor_not_empty", ge::OPTION_ALL_TENSOR_NOT_EMPTY},
+  {"auto_multistream_parallel_mode", "ge.autoMultistreamParallelMode"}
 };
 }  // namespace
diff --git a/tf_adapter_2.x/python/npu_device/configs/npu_config.py b/tf_adapter_2.x/python/npu_device/configs/npu_config.py
index 3293e3c32419120bea37ea200e610eec41d00978..b18bf56d9ac7f425ce7e7de028f85b708312b028 100644
--- a/tf_adapter_2.x/python/npu_device/configs/npu_config.py
+++ b/tf_adapter_2.x/python/npu_device/configs/npu_config.py
@@ -85,5 +85,6 @@ class NpuConfig(NpuBaseConfig):
         self.input_batch_cpy = OptionValue(False, [True, False])
         self.shape_generalization_mode = OptionValue("STRICT", ["STRICT", "FULL", "ADAPTIVE"])
         self.all_tensor_not_empty = OptionValue(False, [True, False])
+        self.auto_multistream_parallel_mode = OptionValue(None, ['cv'])

         super(NpuConfig, self).__init__()
diff --git a/tf_adapter_2.x/tests/stub/include/stub/defines.h b/tf_adapter_2.x/tests/stub/include/stub/defines.h
index 45dc1da2a165308d6ec69b165bba9a5b5f40ef23..c29da28aa2e0bac75b6aa83f249002bcaa637383 100644
--- a/tf_adapter_2.x/tests/stub/include/stub/defines.h
+++ b/tf_adapter_2.x/tests/stub/include/stub/defines.h
@@ -66,6 +66,7 @@ const char *const OPTION_EXEC_LOGICAL_DEVICE_ID = "ge.exec.logicalDeviceId";
 const char *const OPTION_EXEC_MODEL_DEPLOY_MODE = "ge.exec.modelDeployMode";
 const char *const OPTION_EXEC_MODEL_DEPLOY_DEVICELIST = "ge.exec.modelDeployDevicelist";
 const char *const OPTION_ALL_TENSOR_NOT_EMPTY = "ge.exec.allTensorNotEmpty";
+const char *const OPTION_AUTO_MULTISTREAM_PARALLEL_MODE = "ge.autoMultistreamParallelMode";

 // Option key: memory init
 const char *const GRAPH_MEMORY_MAX_SIZE = "ge.graphMemoryMaxSize";
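On the tf_adapter_2.x side, the option surfaces through `NpuConfig` and is forwarded to GE as `ge.autoMultistreamParallelMode` via `kSessionConfigOptions`. A minimal TF2 sketch, assuming the usual `npu_device` entry points (`global_options()` and `open()`, which are not shown in this diff):

```python
import npu_device

# Sketch: set the option before opening the device. The OptionValue declared
# in npu_config.py only accepts 'cv' (or the default None, meaning "off").
npu_device.global_options().auto_multistream_parallel_mode = "cv"
npu_device.open().as_default()
```

Setting any value outside the declared list should be rejected by the `OptionValue` check rather than silently passed through to GE.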