diff --git a/tf_adapter/interface_spec/api_npu_config.pyh b/tf_adapter/interface_spec/api_npu_config.pyh index 915c76ea6b3451bee8012a099a9863477901b98e..5a80153d915b3e20ab212901cbd6a9aa4c3f7a5d 100644 --- a/tf_adapter/interface_spec/api_npu_config.pyh +++ b/tf_adapter/interface_spec/api_npu_config.pyh @@ -18,7 +18,8 @@ class NPURunConfig(run_config_lib.RunConfig): distribute_config=None, modify_mixlist=None, op_precision_mode=None, device_type="default_device_type", soc_config=None, hccl_timeout=None, op_wait_timeout=None, op_execute_timeout=None, HCCL_algorithm=None, customize_dtypes=None, op_debug_config=None, memory_config=None, experimental_config=None, - jit_compile=True, topo_sorting_mode=None, aoe_config_file=None, insert_op_file=None): + jit_compile=True, topo_sorting_mode=None, aoe_config_file=None, insert_op_file=None, stream_sync_timeout=-1, + event_sync_timeout=-1): class ProfilingConfig(): def __init__(self, enable_profiling=False, profiling_options=None): diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_callbacks.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_callbacks.py index 89e33519c95bb9e3dd4f99fd8a06ea6d2301f433..697945b5e75df70bd7605e6581410d1d340eb05b 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_callbacks.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_callbacks.py @@ -22,6 +22,7 @@ from tensorflow.python.keras import backend from tensorflow.python.ops import state_ops from tensorflow.python.ops import control_flow_ops from npu_bridge.hccl import hccl_ops +from npu_bridge.estimator.npu import util as util_lib def broadcast_global_variables(root_rank): @@ -54,7 +55,7 @@ class BroadcastGlobalVariablesCallbackImpl: if self.broadcast_done: return - rank_size = os.getenv("RANK_SIZE", "1") + rank_size = util_lib.get_rank_size() if int(rank_size) > 1: bcast_op = broadcast_global_variables(self.root_rank) backend.get_session().run(bcast_op) diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py 
b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py index f4d1f1277249e8d9c0c0d6c68d407aeed5c44266..9234612dfceeb829b37de18e983db99c4257f93c 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py @@ -102,7 +102,9 @@ class NPURunConfig(run_config_lib.RunConfig): jit_compile=True, topo_sorting_mode=None, aoe_config_file=None, - insert_op_file=None + insert_op_file=None, + stream_sync_timeout=-1, + event_sync_timeout=-1 ): """ Constructs a NPUConfig. @@ -243,6 +245,8 @@ class NPURunConfig(run_config_lib.RunConfig): self.topo_sorting_mode = topo_sorting_mode self.aoe_config_file = aoe_config_file self.insert_op_file = insert_op_file + self.stream_sync_timeout = stream_sync_timeout + self.event_sync_timeout = event_sync_timeout super(NPURunConfig, self).__init__( diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py index de4e7c11ffffd158553e3501ec7f67c3ad3fc87f..e7389a4b34b3e2ef4ab219e253198399dae6b7cf 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py @@ -37,6 +37,7 @@ from tensorflow.python.estimator import estimator as estimator_lib from tensorflow.python.estimator import model_fn as model_fn_lib from tensorflow.python.util import function_utils from tensorflow.python.util import tf_inspect +from npu_bridge.estimator.npu import util as util_lib from npu_bridge.estimator.npu import util from npu_bridge.estimator.npu.npu_config import NPURunConfig @@ -380,7 +381,7 @@ class NPUEstimator(estimator_lib.Estimator): raise RuntimeError('estimator_spec used by NPU train must have type ' '`NPUEstimatorSpec` or `EstimatorSpec`. Got {}'.format(type(estimator_spec))) # 1. 
NPUBroadcastGlobalVariablesHook - rank_size = os.getenv('RANK_SIZE') + rank_size = util_lib.get_rank_size() if rank_size is not None and rank_size.isdigit() and int(rank_size) > 1 and not config.horovod_mode: npu_hooks.append( NPUBroadcastGlobalVariablesHook(self.__device_info._root_rank, self.__device_info._index)) @@ -758,6 +759,8 @@ class NPUEstimator(estimator_lib.Estimator): if config.insert_op_file is not None: custom_op.parameter_map["insert_op_file"].s = config.insert_op_file custom_op.parameter_map["jit_compile"].b = config._jit_compile + custom_op.parameter_map["stream_sync_timeout"].i = config.stream_sync_timeout + custom_op.parameter_map["event_sync_timeout"].i = config.event_sync_timeout self.__load_session_device_id(config, custom_op) self.__load_modify_mixlist(config, custom_op) diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_hook.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_hook.py index 4a6455a6ea642b6d922f04f99911fbd04e1de1ea..d60df35c83c96b61d71187efc28e8c61e0ba1765 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_hook.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_hook.py @@ -28,6 +28,7 @@ from tensorflow.python.ops import summary_ops_v2 as contrib_summary from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import session_run_hook from tensorflow.python.training import basic_session_run_hooks +from npu_bridge.estimator.npu import util as util_lib from npu_bridge.estimator import npu_ops from npu_bridge.hccl import hccl_ops @@ -89,7 +90,7 @@ class NPUBroadcastGlobalVariablesHook(session_run_hook.SessionRunHook): self._root_rank = root_rank self._index = index self._bcast_op = None - rank_size = os.getenv('RANK_SIZE', "1") + rank_size = util_lib.get_rank_size() if rank_size.isdigit(): self._rank_size = int(rank_size) else: diff --git a/tf_adapter/python/npu_bridge/estimator/npu/util.py b/tf_adapter/python/npu_bridge/estimator/npu/util.py index 
7fb62822ebb339ba2c3347770dffb4906f70d097..e22f5fa419a7a2a240365d81e0b28ea4ac0a5411 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/util.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/util.py @@ -232,6 +232,14 @@ def set_iteration_per_loop(sess, train_op, iterations_per_loop=1): return group_train_op +def get_rank_size(): + if os.getenv("CM_WORKER_SIZE") is not None and os.getenv("RANK_SIZE") is not None: + raise ValueError("RANK_SIZE and CM_WORKER_SIZE cannot be configured at the same time") + rank_size = os.getenv('RANK_SIZE') if os.getenv( + "RANK_SIZE") is not None else os.getenv('CM_WORKER_SIZE', '1') + return rank_size + + class IterationPerLoop(): """ An object provide two API to create and set iterations_per_loop diff --git a/tf_adapter/tests/st/util/testcase/ge_plugin_test.cc b/tf_adapter/tests/st/util/testcase/ge_plugin_test.cc index a89b4de00e53c86be2a5d37ec16ccb49bc03fd39..754faf743f5a7cd95f592f1735e046b006c0b6d4 100644 --- a/tf_adapter/tests/st/util/testcase/ge_plugin_test.cc +++ b/tf_adapter/tests/st/util/testcase/ge_plugin_test.cc @@ -30,6 +30,52 @@ TEST_F(GePluginTest, PluginInitTest) { PluginInit(init_options); } +TEST_F(GePluginTest, PluginInitTest_fail) { + std::map init_options; + setenv("JOB_ID", "1000", true); + setenv("CM_WORK_SIZE", "1", true); + setenv("RANK_ID", "0", true); + setenv("POD_NAME", "0", true); + setenv("CM_CHIEF_IP", "11", true); + setenv("CM_CHIEF_PORT", "22", true); + setenv("CM_CHIEF_DEVICE", "8", true); + setenv("CM_WORKER_IP", "127.0.0.1", true); + setenv("FUSION_TENSOR_SIZE", "524288000", true); + std::string tf_config = "{'task':{'type':'a'}, 'cluster':{'chief':['1']}}"; + setenv("TF_CONFIG", tf_config.c_str(), true); + init_options["ge.exec.profilingMode"] = "1"; + init_options["ge.exec.profilingOptions"] = "trace"; + init_options["ge.exec.precision_mode"] = "allow_fp32_to_fp16"; + init_options["ge.autoTuneMode"] = "GA"; + init_options["ge.opDebugLevel"] = "1"; + init_options["ge.jobType"] = "2"; + 
PluginInit(init_options); +} + +TEST_F(GePluginTest, PluginInitTest_hccl) { + std::map init_options; + unsetenv("RANK_SIZE"); + unsetenv("RANK_TABLE_FILE"); + setenv("JOB_ID", "1000", true); + setenv("CM_WORK_SIZE", "1", true); + setenv("RANK_ID", "0", true); + setenv("POD_NAME", "0", true); + setenv("CM_CHIEF_IP", "11", true); + setenv("CM_CHIEF_PORT", "22", true); + setenv("CM_CHIEF_DEVICE", "8", true); + setenv("CM_WORKER_IP", "127.0.0.1", true); + setenv("FUSION_TENSOR_SIZE", "524288000", true); + std::string tf_config = "{'task':{'type':'a'}, 'cluster':{'chief':['1']}}"; + setenv("TF_CONFIG", tf_config.c_str(), true); + init_options["ge.exec.profilingMode"] = "1"; + init_options["ge.exec.profilingOptions"] = "trace"; + init_options["ge.exec.precision_mode"] = "allow_fp32_to_fp16"; + init_options["ge.autoTuneMode"] = "GA"; + init_options["ge.opDebugLevel"] = "1"; + init_options["ge.jobType"] = "2"; + PluginInit(init_options); +} + TEST_F(GePluginTest, PluginFinalizeTest) { PluginFinalize(); } @@ -112,6 +158,5 @@ TEST_F(GePluginTest, RdmaInitAndRegisterOKTest) { int32_t ret = RdmaInitAndRegister(var_info, size); EXPECT_EQ(ret, 0); } - } } // end tensorflow \ No newline at end of file diff --git a/tf_adapter/tests/ut/util/testcase/ge_plugin_test.cc b/tf_adapter/tests/ut/util/testcase/ge_plugin_test.cc index a89b4de00e53c86be2a5d37ec16ccb49bc03fd39..bb0bf3c4ee5e7079d7f5c8ba5f69520d4027df43 100644 --- a/tf_adapter/tests/ut/util/testcase/ge_plugin_test.cc +++ b/tf_adapter/tests/ut/util/testcase/ge_plugin_test.cc @@ -11,6 +11,24 @@ class GePluginTest : public testing::Test { virtual void TearDown() {} }; +TEST_F(GePluginTest, PluginInitTest_1) { + std::map init_options; + setenv("JOB_ID", "1000", true); + setenv("RANK_SIZE", "1", true); + setenv("RANK_ID", "0", true); + setenv("RANK_TABLE_FILE", "rank_table", true); + setenv("FUSION_TENSOR_SIZE", "524288000", true); + std::string tf_config = "{'task':{'type':'a'}, 'cluster':{'chief':['1']}}"; + setenv("TF_CONFIG", 
tf_config.c_str(), true); + init_options["ge.exec.profilingMode"] = "1"; + init_options["ge.exec.profilingOptions"] = "trace"; + init_options["ge.exec.precision_mode"] = "allow_fp32_to_fp16"; + init_options["ge.autoTuneMode"] = "GA"; + init_options["ge.opDebugLevel"] = "1"; + init_options["ge.jobType"] = "2"; + PluginInit(init_options); +} + TEST_F(GePluginTest, PluginInitTest) { std::map init_options; setenv("JOB_ID", "1000", true); @@ -30,6 +48,29 @@ TEST_F(GePluginTest, PluginInitTest) { PluginInit(init_options); } +TEST_F(GePluginTest, PluginInitTest_hccl) { + std::map init_options; + unsetenv("RANK_SIZE"); + unsetenv("RANK_TABLE_FILE"); + setenv("JOB_ID", "1000", true); + setenv("CM_WORKER_SIZE", "1", true); + setenv("RANK_ID", "0", true); + setenv("CM_CHIEF_IP", "11", true); + setenv("CM_CHIEF_PORT", "22", true); + setenv("CM_CHIEF_DEVICE", "8", true); + setenv("CM_WORKER_IP", "127.0.0.1", true); + setenv("FUSION_TENSOR_SIZE", "524288000", true); + std::string tf_config = "{'task':{'type':'a'}, 'cluster':{'chief':['1']}}"; + setenv("TF_CONFIG", tf_config.c_str(), true); + init_options["ge.exec.profilingMode"] = "1"; + init_options["ge.exec.profilingOptions"] = "trace"; + init_options["ge.exec.precision_mode"] = "allow_fp32_to_fp16"; + init_options["ge.autoTuneMode"] = "GA"; + init_options["ge.opDebugLevel"] = "1"; + init_options["ge.jobType"] = "2"; + PluginInit(init_options); +} + TEST_F(GePluginTest, PluginFinalizeTest) { PluginFinalize(); } diff --git a/tf_adapter/util/ge_plugin.cc b/tf_adapter/util/ge_plugin.cc index 0743023edf4b62d7494b90dac071f88a29473cb4..2523117c17cd99040836140527ca64d684e08889 100644 --- a/tf_adapter/util/ge_plugin.cc +++ b/tf_adapter/util/ge_plugin.cc @@ -36,6 +36,8 @@ using namespace tdt; using namespace tensorflow; namespace { const int kFatalSleepTime = 3000; +const int64 kInvalidRankSize = -1; +const int64 kDefaultRankSize = 1; inline string ToString(ge::Status status) { return 
::ge::StatusFactory::Instance()->GetErrDesc(status); } @@ -128,39 +130,21 @@ void GePlugin::Init(std::map &init_options, const bool LOG(WARNING) << "[GePlugin] can not find Environment variable : JOB_ID"; } - int64 rankSizeNum = 1; - (void) ReadInt64FromEnvVar("RANK_SIZE", 1, &rankSizeNum); - if (rankSizeNum > UINT32_MAX) { - rankSizeNum = UINT32_MAX; - ADP_LOG(WARNING) << "[GePlugin] RANK_SIZE is larger than UINT32_MAX, set to UINT32_MAX."; - LOG(WARNING) << "[GePlugin] RANK_SIZE is larger than UINT32_MAX, set to UINT32_MAX."; - } - - bool is_use_hcom = false; - bool deploy_mode = false; + std::string cm_chief_ip; + (void) ReadStringFromEnvVar("CM_CHIEF_IP", "", &cm_chief_ip); + (void) ReadInt64FromEnvVar("CM_WORKER_SIZE", kInvalidRankSize, &work_size_num); std::string env_rank_table_file; (void) ReadStringFromEnvVar("RANK_TABLE_FILE", "", &env_rank_table_file); - if (!env_rank_table_file.empty() && (rankSizeNum > 0)) { - ADP_LOG(INFO) << "[GePlugin] env RANK_TABLE_FILE:" << env_rank_table_file; - is_use_hcom = true; - init_options[ge::OPTION_EXEC_RANK_TABLE_FILE] = env_rank_table_file; - std::string env_pod_name; - (void) ReadStringFromEnvVar("POD_NAME", "", &env_pod_name); - if (!env_pod_name.empty()) { - deploy_mode = true; - init_options[ge::OPTION_EXEC_POD_NAME] = env_pod_name; - } else { - std::string env_rank_id; - (void) ReadStringFromEnvVar("RANK_ID", "", &env_rank_id); - if (!env_rank_id.empty()) { - ADP_LOG(INFO) << "[GePlugin] env RANK_ID:" << env_rank_id; - deploy_mode = false; - init_options[ge::OPTION_EXEC_RANK_ID] = env_rank_id; - } else { - ADP_LOG(ERROR) << "[GePlugin] Can't find rank_id or pod_name in env."; - LOG(ERROR) << "[GePlugin] Can't find rank_id or pod_name in env."; - } - } + (void) ReadInt64FromEnvVar("RANK_SIZE", kInvalidRankSize, &rank_size_num); + if (!cm_chief_ip.empty() && !env_rank_table_file.empty()) { + ADP_LOG(ERROR) << "[GePlugin] CM_CHIEF_IP and RANK_TABLE_FILE cannot be configured at the same time."; + LOG(ERROR) << 
"[GePlugin] CM_CHIEF_IP and RANK_TABLE_FILE cannot be configured at the same time."; + } else if (!cm_chief_ip.empty()) { + SetCmChiefWorkSizeEnv(init_options, cm_chief_ip); + } else if (!env_rank_table_file.empty()) { + SetRankTableFileEnv(init_options, env_rank_table_file); + } else { + // do nothing; } std::string cluster_info; @@ -266,6 +250,66 @@ void GePlugin::Init(std::map &init_options, const bool isGlobal_ = is_global; } +void GePlugin::SetRankTableFileEnv(std::map &init_options, std::string &rankTableFile) { + rank_size_num = (rank_size_num == kInvalidRankSize) ? kDefaultRankSize : rank_size_num; + if (rank_size_num > UINT32_MAX) { + rank_size_num = UINT32_MAX; + ADP_LOG(WARNING) << "[GePlugin] RANK_SIZE is larger than UINT32_MAX, set to UINT32_MAX."; + LOG(WARNING) << "[GePlugin] RANK_SIZE is larger than UINT32_MAX, set to UINT32_MAX."; + } + if (!rankTableFile.empty() && (rank_size_num > 0) && (work_size_num == kInvalidRankSize)) { + ADP_LOG(INFO) << "[GePlugin] env RANK_TABLE_FILE:" << rankTableFile; + is_use_hcom = true; + init_options[ge::OPTION_EXEC_RANK_TABLE_FILE] = rankTableFile; + std::string env_pod_name; + (void) ReadStringFromEnvVar("POD_NAME", "", &env_pod_name); + if (!env_pod_name.empty()) { + deploy_mode = true; + init_options[ge::OPTION_EXEC_POD_NAME] = env_pod_name; + } else { + std::string env_rank_id; + (void) ReadStringFromEnvVar("RANK_ID", "", &env_rank_id); + if (!env_rank_id.empty()) { + ADP_LOG(INFO) << "[GePlugin] env RANK_ID:" << env_rank_id; + deploy_mode = false; + init_options[ge::OPTION_EXEC_RANK_ID] = env_rank_id; + } else { + ADP_LOG(ERROR) << "[GePlugin] Can't find rank_id or pod_name in env."; + LOG(ERROR) << "[GePlugin] Can't find rank_id or pod_name in env."; + } + } + } +} + +void GePlugin::SetCmChiefWorkSizeEnv(std::map &init_options, std::string &cmChiefIp) { + std::string cm_chief_port; + (void) ReadStringFromEnvVar("CM_CHIEF_PORT", "", &cm_chief_port); + std::string cm_chief_device; + (void) 
ReadStringFromEnvVar("CM_CHIEF_DEVICE", "", &cm_chief_device); + std::string cm_worker_ip; + (void) ReadStringFromEnvVar("CM_WORKER_IP", "", &cm_worker_ip); + std::string cm_worker_size; + (void) ReadStringFromEnvVar("CM_WORKER_SIZE", "", &cm_worker_size); + work_size_num = (work_size_num == kInvalidRankSize) ? kDefaultRankSize : work_size_num; + if (work_size_num > UINT32_MAX) { + work_size_num = UINT32_MAX; + ADP_LOG(WARNING) << "[GePlugin] CM_WORKER_SIZE is larger than UINT32_MAX, set to UINT32_MAX."; + LOG(WARNING) << "[GePlugin] CM_WORKER_SIZE is larger than UINT32_MAX, set to UINT32_MAX."; + } + if (!cmChiefIp.empty() && !cm_chief_port.empty() && !cm_chief_device.empty() && (work_size_num > 0) && (rank_size_num == kInvalidRankSize)) { + is_use_hcom = true; + init_options["ge.cmChiefIp"] = cmChiefIp; + init_options["ge.cmChiefPort"] = cm_chief_port; + init_options["ge.cmChiefWorkerDevice"] = cm_chief_device; + if (!cm_worker_ip.empty()) { + init_options["ge.cmWorkerIp"] = cm_worker_ip; + } + if (!cm_worker_size.empty()) { + init_options["ge.cmWorkerSize"] = cm_worker_size; + } + } +} + std::map GePlugin::GetInitOptions() { return init_options_; } @@ -312,15 +356,12 @@ bool GePlugin::IsGlobal() { } static CancellationManager g_cancellationManager; -Status RegisterNpuCancellationCallback(std::function callback, - std::function* deregister_fn) { +Status RegisterNpuCancellationCallback(std::function callback, std::function *deregister_fn) { CancellationToken token = g_cancellationManager.get_cancellation_token(); if (!g_cancellationManager.RegisterCallback(token, std::move(callback))) { return errors::Cancelled("Operation was cancelled"); } - *deregister_fn = [token]() { - g_cancellationManager.DeregisterCallback(token); - }; + *deregister_fn = [token]() { g_cancellationManager.DeregisterCallback(token); } return Status::OK(); } @@ -353,8 +394,8 @@ void AoeFinalizeIfNeed() { return; } - (void)aoe_finalize(); - (void)mmDlclose(handle); + (void) aoe_finalize(); + (void) 
mmDlclose(handle); ADP_LOG(INFO) << "Finish to call aoe finalize when npu close."; } diff --git a/tf_adapter/util/ge_plugin.h b/tf_adapter/util/ge_plugin.h index 41f2a03753a7facc090a3b74879665dd981a71a7..ac25effbb09f5b541247bf654f0a11508c8443d3 100644 --- a/tf_adapter/util/ge_plugin.h +++ b/tf_adapter/util/ge_plugin.h @@ -37,6 +37,10 @@ class GePlugin { std::map GetInitOptions(); + void SetRankTableFileEnv(std::map &init_options, std::string &rankTableFile); + + void SetCmChiefWorkSizeEnv(std::map &init_options, std::string &cmChiefIp); + private: GePlugin(); @@ -47,6 +51,10 @@ class GePlugin { uint32_t device_id_; bool isInit_; bool isGlobal_; + bool is_use_hcom = false; + bool deploy_mode = false; + tensorflow::int64 work_size_num; + tensorflow::int64 rank_size_num; std::map init_options_; std::mutex mutex_; static std::atomic_int graph_counter_; diff --git a/tf_adapter/util/npu_attrs.cc b/tf_adapter/util/npu_attrs.cc index 92060e8106e869b864afc49308beec31fca7ecbd..90e2f01da9016f2a617c3ac650db9eba263efe9c 100644 --- a/tf_adapter/util/npu_attrs.cc +++ b/tf_adapter/util/npu_attrs.cc @@ -561,6 +561,8 @@ std::map NpuAttrs::GetInitOptions(const OpKernelConstr std::string model_deploy_devicelist; std::string dump_data = "tensor"; std::string aoe_config_file; + std::string stream_sync_timeout = "-1"; + std::string event_sync_timeout = "-1"; if (ctx != nullptr && ctx->GetAttr("_NpuOptimizer", &npuOptimizer) == Status::OK()) { (void) ctx->GetAttr("_precision_mode", &precision_mode); @@ -596,6 +598,8 @@ std::map NpuAttrs::GetInitOptions(const OpKernelConstr (void) ctx->GetAttr("_model_deploy_devicelist", &model_deploy_devicelist); (void) ctx->GetAttr("_dump_data", &dump_data); (void) ctx->GetAttr("_aoe_config_file", &aoe_config_file); + (void) ctx->GetAttr("_stream_sync_timeout", &stream_sync_timeout); + (void) ctx->GetAttr("_event_sync_timeout", &event_sync_timeout); } if (precision_mode.empty()) { @@ -643,6 +647,8 @@ std::map NpuAttrs::GetInitOptions(const OpKernelConstr 
init_options_["ge.exec.dumpData"] = dump_data; init_options_["aoe_config_file"] = aoe_config_file; init_options_["ge.aoe_config_file"] = aoe_config_file; + init_options_["stream_sync_timeout"] = stream_sync_timeout; + init_options_["event_sync_timeout"] = event_sync_timeout; return init_options_; } @@ -1010,6 +1016,8 @@ std::map NpuAttrs::GetAllAttrOptions(const AttrSlice & std::string insert_op_file; std::string resource_config_path; std::string aoe_config_file; + std::string stream_sync_timeout = "-1"; + std::string event_sync_timeout = "-1"; auto NpuOptimizer_value = attrs.Find("_NpuOptimizer"); auto enable_data_pre_proc_value = attrs.Find("_enable_data_pre_proc"); @@ -1086,6 +1094,8 @@ std::map NpuAttrs::GetAllAttrOptions(const AttrSlice & auto insert_op_file_value = attrs.Find("_insert_op_file"); auto resource_config_path_value = attrs.Find("_resource_config_path"); auto aoe_config_file_value = attrs.Find("_aoe_config_file"); + auto stream_sync_timeout_value = attrs.Find("_stream_sync_timeout"); + auto event_sync_timeout_value = attrs.Find("_event_sync_timeout"); if (NpuOptimizer_value != nullptr) { do_npu_optimizer = "1"; @@ -1338,6 +1348,12 @@ std::map NpuAttrs::GetAllAttrOptions(const AttrSlice & if (insert_op_file_value != nullptr) { insert_op_file = insert_op_file_value->s(); } + if (stream_sync_timeout_value != nullptr) { + stream_sync_timeout = stream_sync_timeout_value->s(); + } + if (event_sync_timeout_value != nullptr) { + event_sync_timeout = event_sync_timeout_value->s(); + } } all_options["variable_format_optimize"] = variable_format_optimize; @@ -1427,6 +1443,8 @@ std::map NpuAttrs::GetAllAttrOptions(const AttrSlice & all_options["resource_config_path"] = resource_config_path; all_options["ge.aoe_config_file"] = aoe_config_file; all_options["aoe_config_file"] = aoe_config_file; + all_options["stream_sync_timeout"] = stream_sync_timeout; + all_options["event_sync_timeout"] = event_sync_timeout; return all_options; } @@ -1523,6 +1541,8 @@ Status 
NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options std::string model_deploy_devicelist; bool jit_compile = true; std::string aoe_config_file; + int32_t stream_sync_timeout = -1; + int32_t event_sync_timeout = -1; const RewriterConfig &rewrite_options = options.session_options->config.graph_options().rewrite_options(); @@ -1910,6 +1930,12 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options if (params.count("aoe_config_file") > 0) { aoe_config_file = params.at("aoe_config_file").s(); } + if (params.count("stream_sync_timeout") > 0) { + stream_sync_timeout = params.at("stream_sync_timeout").i(); + } + if (params.count("event_sync_timeout") > 0) { + event_sync_timeout = params.at("event_sync_timeout").i(); + } } } @@ -2025,6 +2051,8 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options init_options_["ge.exec.dumpData"] = dump_data; init_options_["aoe_config_file"] = aoe_config_file; init_options_["ge.aoe_config_file"] = aoe_config_file; + init_options_["stream_sync_timeout"] = std::to_string(stream_sync_timeout); + init_options_["event_sync_timeout"] = std::to_string(event_sync_timeout); pass_options["do_npu_optimizer"] = std::to_string(static_cast(do_npu_optimizer)); pass_options["enable_data_pre_proc"] = std::to_string(static_cast(enable_dp)); diff --git a/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp index bc97f8a6d92542eb4e56971eeeb1dea289c8219e..b650412dc91e3f646a52f388083d77656db55b36 100644 --- a/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp +++ b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp @@ -111,6 +111,8 @@ const std::map kConfigurableOptions = { {"dump_data", "ge.exec.dumpData"}, {"dump_layer", "ge.exec.dumpLayer"}, {"aoe_config_file", "ge.aoe_config_file"}, + {"stream_sync_timeout", "stream_sync_timeout"}, + {"event_sync_timeout", "event_sync_timeout"}, // private options {"_distribute.rank_id", ge::OPTION_EXEC_RANK_ID}, 
{"_distribute.rank_table", ge::OPTION_EXEC_RANK_TABLE_FILE}, diff --git a/tf_adapter_2.x/python/npu_device/configs/npu_config.py b/tf_adapter_2.x/python/npu_device/configs/npu_config.py index daa26cfdcf895c416356de288c84f9c92f858dc8..96d9887b1b7cd5e5d622342c8a38022f11d693aa 100644 --- a/tf_adapter_2.x/python/npu_device/configs/npu_config.py +++ b/tf_adapter_2.x/python/npu_device/configs/npu_config.py @@ -58,6 +58,8 @@ class NpuConfig(NpuBaseConfig): self.topo_sorting_mode = OptionValue(None, [0, 1, None]) self.customize_dtypes = OptionValue(None, None) self.overflow_flag = OptionValue(1, [0, 1]) + self.stream_sync_timeout = OptionValue(-1, None) + self.event_sync_timeout = OptionValue(-1, None) # Configuration for experiment self.experimental = NpuExperimentalConfig() diff --git a/tf_adapter_2.x/python/npu_device/distribute/npu_callbacks.py b/tf_adapter_2.x/python/npu_device/distribute/npu_callbacks.py index a5b365d4f30d38502cbbe27e549c9c9e085930b1..f7ca858463a392144ac0bdccd8a2a2896922b74d 100644 --- a/tf_adapter_2.x/python/npu_device/distribute/npu_callbacks.py +++ b/tf_adapter_2.x/python/npu_device/distribute/npu_callbacks.py @@ -51,6 +51,14 @@ def broadcast_keras_model(model, root_rank=0): return model +def get_rank_size(): + if os.getenv("CM_WORKER_SIZE") is not None and os.getenv("RANK_SIZE") is not None: + raise ValueError("RANK_SIZE and CM_WORKER_SIZE cannot be configured at the same time") + rank_size = os.getenv('RANK_SIZE') if os.getenv( + "RANK_SIZE") is not None else os.getenv('CM_WORKER_SIZE', '1') + return rank_size + + class NPUBroadcastGlobalVariablesCallback(keras.callbacks.Callback): """ Keras Callback that will broadcast all global variables from root rank @@ -75,7 +83,7 @@ class NPUBroadcastGlobalVariablesCallback(keras.callbacks.Callback): if self.broadcast_done: return - rank_size = os.getenv("RANK_SIZE", "1") + rank_size = get_rank_size() if int(rank_size) > 1: broadcast_helper(self.model.trainable_variables, self.root_rank)