From 295d66655e44bd50d72cfb90c222edd14a784b61 Mon Sep 17 00:00:00 2001 From: CLAY-panjw <1330286576@qq.com> Date: Mon, 7 Nov 2022 17:26:18 +0800 Subject: [PATCH] stream sync interface timeout --- tf_adapter/interface_spec/api_npu_config.pyh | 3 +- .../npu_bridge/estimator/npu/npu_config.py | 6 +++- .../npu_bridge/estimator/npu/npu_estimator.py | 2 ++ tf_adapter/util/npu_attrs.cc | 28 +++++++++++++++++++ .../npu_device/core/npu_wrapper.cpp | 2 ++ .../python/npu_device/configs/npu_config.py | 2 ++ 6 files changed, 41 insertions(+), 2 deletions(-) diff --git a/tf_adapter/interface_spec/api_npu_config.pyh b/tf_adapter/interface_spec/api_npu_config.pyh index f9f1a9b02..7e21e9d79 100644 --- a/tf_adapter/interface_spec/api_npu_config.pyh +++ b/tf_adapter/interface_spec/api_npu_config.pyh @@ -18,7 +18,8 @@ class NPURunConfig(run_config_lib.RunConfig): distribute_config=None, modify_mixlist=None, op_precision_mode=None, device_type="default_device_type", soc_config=None, hccl_timeout=None, op_wait_timeout=None, op_execute_timeout=None, HCCL_algorithm=None, customize_dtypes=None, op_debug_config=None, memory_config=None, experimental_config=None, - jit_compile=True, topo_sorting_mode=None, aoe_config_file=None, insert_op_file=None): + jit_compile=True, topo_sorting_mode=None, aoe_config_file=None, insert_op_file=None, stream_sync_timeout=-1, + event_sync_timeout=-1): class ProfilingConfig(): def __init__(self, enable_profiling=False, profiling_options=None): diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py index 721ec8eb1..2020469e8 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py @@ -103,7 +103,9 @@ class NPURunConfig(run_config_lib.RunConfig): jit_compile=True, topo_sorting_mode=None, aoe_config_file=None, - insert_op_file=None + insert_op_file=None, + stream_sync_timeout=-1, + event_sync_timeout=-1 ): """ Constructs a NPUConfig. @@ -246,6 +248,8 @@ class NPURunConfig(run_config_lib.RunConfig): self.topo_sorting_mode = topo_sorting_mode self.aoe_config_file = aoe_config_file self.insert_op_file = insert_op_file + self.stream_sync_timeout = stream_sync_timeout + self.event_sync_timeout = event_sync_timeout super(NPURunConfig, self).__init__( diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py index 15447064d..ee70614c2 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py @@ -762,6 +762,8 @@ class NPUEstimator(estimator_lib.Estimator): if config.insert_op_file is not None: custom_op.parameter_map["insert_op_file"].s = config.insert_op_file custom_op.parameter_map["jit_compile"].b = config._jit_compile + custom_op.parameter_map["stream_sync_timeout"].i = config.stream_sync_timeout + custom_op.parameter_map["event_sync_timeout"].i = config.event_sync_timeout self.__load_session_device_id(config, custom_op) self.__load_modify_mixlist(config, custom_op) diff --git a/tf_adapter/util/npu_attrs.cc b/tf_adapter/util/npu_attrs.cc index 1aba4c183..a898e519d 100644 --- a/tf_adapter/util/npu_attrs.cc +++ b/tf_adapter/util/npu_attrs.cc @@ -561,6 +561,8 @@ std::map NpuAttrs::GetInitOptions(const OpKernelConstr std::string logical_device_id; std::string dump_data = "tensor"; std::string aoe_config_file; + std::string stream_sync_timeout = "-1"; + std::string event_sync_timeout = "-1"; if (ctx != nullptr && ctx->GetAttr("_NpuOptimizer", &npuOptimizer) == Status::OK()) { (void) ctx->GetAttr("_precision_mode", &precision_mode); @@ -595,6 +597,8 @@ std::map NpuAttrs::GetInitOptions(const OpKernelConstr (void) ctx->GetAttr("_logical_device_id", &logical_device_id); (void) ctx->GetAttr("_dump_data", &dump_data); (void) ctx->GetAttr("_aoe_config_file", &aoe_config_file); + (void) ctx->GetAttr("_stream_sync_timeout", &stream_sync_timeout); + (void) ctx->GetAttr("_event_sync_timeout", &event_sync_timeout); } if (precision_mode.empty()) { @@ -641,6 +645,8 @@ std::map NpuAttrs::GetInitOptions(const OpKernelConstr init_options_["ge.exec.dumpData"] = dump_data; init_options_["aoe_config_file"] = aoe_config_file; init_options_["ge.aoe_config_file"] = aoe_config_file; + init_options_["stream_sync_timeout"] = stream_sync_timeout; + init_options_["event_sync_timeout"] = event_sync_timeout; return init_options_; } @@ -1007,6 +1013,8 @@ std::map NpuAttrs::GetAllAttrOptions(const AttrSlice & std::string insert_op_file; std::string resource_config_path; std::string aoe_config_file; + std::string stream_sync_timeout = "-1"; + std::string event_sync_timeout = "-1"; auto NpuOptimizer_value = attrs.Find("_NpuOptimizer"); auto enable_data_pre_proc_value = attrs.Find("_enable_data_pre_proc"); @@ -1082,6 +1090,8 @@ std::map NpuAttrs::GetAllAttrOptions(const AttrSlice & auto insert_op_file_value = attrs.Find("_insert_op_file"); auto resource_config_path_value = attrs.Find("_resource_config_path"); auto aoe_config_file_value = attrs.Find("_aoe_config_file"); + auto stream_sync_timeout_value = attrs.Find("_stream_sync_timeout"); + auto event_sync_timeout_value = attrs.Find("_event_sync_timeout"); if (NpuOptimizer_value != nullptr) { do_npu_optimizer = "1"; @@ -1331,6 +1341,12 @@ std::map NpuAttrs::GetAllAttrOptions(const AttrSlice & if (insert_op_file_value != nullptr) { insert_op_file = insert_op_file_value->s(); } + if (stream_sync_timeout_value != nullptr) { + stream_sync_timeout = stream_sync_timeout_value->s(); + } + if (event_sync_timeout_value != nullptr) { + event_sync_timeout = event_sync_timeout_value->s(); + } } all_options["variable_format_optimize"] = variable_format_optimize; @@ -1419,6 +1435,8 @@ std::map NpuAttrs::GetAllAttrOptions(const AttrSlice & all_options["resource_config_path"] = resource_config_path; all_options["ge.aoe_config_file"] = aoe_config_file; all_options["aoe_config_file"] = aoe_config_file; + all_options["stream_sync_timeout"] = stream_sync_timeout; + all_options["event_sync_timeout"] = event_sync_timeout; return all_options; } @@ -1514,6 +1532,8 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options std::string logical_device_id; bool jit_compile = true; std::string aoe_config_file; + int32_t stream_sync_timeout = -1; + int32_t event_sync_timeout = -1; const RewriterConfig &rewrite_options = options.session_options->config.graph_options().rewrite_options(); @@ -1898,6 +1918,12 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options if (params.count("aoe_config_file") > 0) { aoe_config_file = params.at("aoe_config_file").s(); } + if (params.count("stream_sync_timeout") > 0) { + stream_sync_timeout = params.at("stream_sync_timeout").i(); + } + if (params.count("event_sync_timeout") > 0) { + event_sync_timeout = params.at("event_sync_timeout").i(); + } } } @@ -2011,6 +2037,8 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options init_options_["ge.exec.dumpData"] = dump_data; init_options_["aoe_config_file"] = aoe_config_file; init_options_["ge.aoe_config_file"] = aoe_config_file; + init_options_["stream_sync_timeout"] = std::to_string(stream_sync_timeout); + init_options_["event_sync_timeout"] = std::to_string(event_sync_timeout); pass_options["do_npu_optimizer"] = std::to_string(static_cast(do_npu_optimizer)); pass_options["enable_data_pre_proc"] = std::to_string(static_cast(enable_dp)); diff --git a/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp index c36533321..9dff04672 100644 --- a/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp +++ b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp @@ -110,6 +110,8 @@ const std::map kConfigurableOptions = { {"dump_data", "ge.exec.dumpData"}, {"dump_layer", "ge.exec.dumpLayer"}, {"aoe_config_file", "ge.aoe_config_file"}, + {"stream_sync_timeout", "stream_sync_timeout"}, + {"event_sync_timeout", "event_sync_timeout"}, // private options {"_distribute.rank_id", ge::OPTION_EXEC_RANK_ID}, {"_distribute.rank_table", ge::OPTION_EXEC_RANK_TABLE_FILE}, diff --git a/tf_adapter_2.x/python/npu_device/configs/npu_config.py b/tf_adapter_2.x/python/npu_device/configs/npu_config.py index b81be133b..f6073e47a 100644 --- a/tf_adapter_2.x/python/npu_device/configs/npu_config.py +++ b/tf_adapter_2.x/python/npu_device/configs/npu_config.py @@ -59,6 +59,8 @@ class NpuConfig(NpuBaseConfig): self.topo_sorting_mode = OptionValue(None, [0, 1, None]) self.customize_dtypes = OptionValue(None, None) self.overflow_flag = OptionValue(1, [0, 1]) + self.stream_sync_timeout = OptionValue(-1, None) + self.event_sync_timeout = OptionValue(-1, None) # Configuration for experiment self.experimental = NpuExperimentalConfig() -- Gitee