From 33629d6aad874f4b92b0ee349519de7f5bd39c3b Mon Sep 17 00:00:00 2001 From: CLAY-panjw Date: Mon, 20 Mar 2023 22:29:13 +0800 Subject: [PATCH] =?UTF-8?q?tf2.x=E5=8E=BBranktable?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../inc/external/ge/ge_api_types.h | 7 +++ .../npu_device/core/npu_wrapper.cpp | 17 +++++- .../npu_device/distribute/npu_callbacks.py | 4 +- .../python/npu_device/npu_device.py | 52 +++++++++++++++---- tf_adapter_2.x/tests/st/adapter2_st.py | 8 +-- .../tests/stub/include/stub/defines.h | 7 +++ 6 files changed, 79 insertions(+), 16 deletions(-) diff --git a/inc/graphengine/inc/external/ge/ge_api_types.h b/inc/graphengine/inc/external/ge/ge_api_types.h index 8efcef921..e2d720bef 100644 --- a/inc/graphengine/inc/external/ge/ge_api_types.h +++ b/inc/graphengine/inc/external/ge/ge_api_types.h @@ -43,6 +43,13 @@ const char *const OPTION_EXEC_DEPLOY_MODE = "ge.exec.deployMode"; const char *const OPTION_EXEC_RANK_TABLE_FILE = "ge.exec.rankTableFile"; const char *const GE_AICPU_FLAG = "ge.aicpuFlag"; const char *const OPTION_EXEC_EXTERN_PLUGIN_PATH = "ge.soLoadPath"; + +const std::string OPTION_EXEC_CM_CHIEF_IP = "ge.cmChiefIp"; +const std::string OPTION_EXEC_CM_CHIEF_PORT = "ge.cmChiefPort"; +const std::string OPTION_EXEC_CM_CHIEF_DEVICE = "ge.cmChiefWorkerDevice"; +const std::string OPTION_EXEC_CM_WORKER_IP = "ge.cmWorkerIp"; +const std::string OPTION_EXEC_CM_WORKER_SIZE = "ge.cmWorkerSize"; + // Dump flag and para const char *const OPTION_EXEC_ENABLE_DUMP = "ge.exec.enableDump"; const char *const OPTION_EXEC_DUMP_PATH = "ge.exec.dumpPath"; diff --git a/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp index 7ffadcdd7..7e1517752 100644 --- a/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp +++ b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp @@ -117,7 +117,12 @@ const std::map kConfigurableOptions = { {"graph_parallel_option_path", "ge.graphParallelOptionPath"}, {"enable_graph_parallel", "ge.enableGraphParallel"}, {"atomic_clean_policy", "ge.exec.atomicCleanPolicy"}, - {"static_memory_policy", "ge.exec.staticMemoryPolicy"}}; + {"static_memory_policy", "ge.exec.staticMemoryPolicy"}, + {"_distribute.cm_chief_ip", ge::OPTION_EXEC_CM_CHIEF_IP}, + {"_distribute.cm_chief_port", ge::OPTION_EXEC_CM_CHIEF_PORT}, + {"_distribute.cm_chief_worker_device", ge::OPTION_EXEC_CM_CHIEF_DEVICE}, + {"_distribute.cm_worker_ip", ge::OPTION_EXEC_CM_WORKER_IP}, + {"_distribute.cm_worker_size", ge::OPTION_EXEC_CM_WORKER_SIZE}}; } // namespace #undef PYBIND11_CHECK_PYTHON_VERSION @@ -147,6 +152,16 @@ void ParseGlobalOptions(int device_index, const std::map 1: + env_cm_chief_ip = os.getenv("CM_CHIEF_IP") env_rank_table = os.getenv("RANK_TABLE_FILE") - env_worker_id = os.getenv('RANK_ID') - if not env_rank_table: - raise RuntimeError('You must specify a rank table file by set env RANK_TABLE_FILE in distribution mode') - - if not env_worker_id: - raise RuntimeError('You must specify rank id by set env RANK_ID in distribution mode') - - global_kw_options['_distribute.rank_table'] = env_rank_table - global_kw_options['_distribute.rank_id'] = env_worker_id + if env_cm_chief_ip is not None and env_rank_table is not None: + raise RuntimeError('CM_CHIEF_IP and RANK_TABLE_FILE cannot be configured at the same time.') + elif env_cm_chief_ip is not None: + set_cm_chief_worksize_env(global_kw_options, env_cm_chief_ip, workers_num) + elif env_rank_table is not None: + set_rank_table_file_env(global_kw_options, env_rank_table) + else: + raise RuntimeError('CM_CHIEF_IP and RANK_TABLE_FILE are all not be configured.') device_options = {} error_message = _npu_device_backends.Open(context.context()._handle, NPU, device_id, global_kw_options, diff --git a/tf_adapter_2.x/tests/st/adapter2_st.py b/tf_adapter_2.x/tests/st/adapter2_st.py index 1b28a15cb..03663ac70 100644 --- a/tf_adapter_2.x/tests/st/adapter2_st.py +++ b/tf_adapter_2.x/tests/st/adapter2_st.py @@ -35,9 +35,11 @@ npu_device.global_options().experimental.multi_branches_config.dynamic_node_type npu_device.global_options().experimental.multi_branches_config.dynamic_dims = "1;2" npu_device.global_options().aoe_config.work_path = "./" npu_device.global_options().graph_run_mode = 0 -os.environ['RANK_TABLE_FILE'] = "rankTable" -os.environ['RANK_SIZE'] = "2" -os.environ['RANK_ID'] = "1" +os.environ['CM_CHIEF_IP'] = "1" +os.environ['CM_CHIEF_PORT'] = "3" +os.environ['CM_CHIEF_DEVICE'] = "4" +os.environ['CM_WORKER_SIZE'] = "2" +os.environ['CM_WORKER_IP'] = "123" npu = npu_device.open().as_default() npu.workers_num = 2 # mock run in 2P env diff --git a/tf_adapter_2.x/tests/stub/include/stub/defines.h b/tf_adapter_2.x/tests/stub/include/stub/defines.h index 5be2c1243..653de3c59 100644 --- a/tf_adapter_2.x/tests/stub/include/stub/defines.h +++ b/tf_adapter_2.x/tests/stub/include/stub/defines.h @@ -25,6 +25,13 @@ const char *const OPTION_EXEC_DEPLOY_MODE = "ge.exec.deployMode"; const char *const OPTION_EXEC_RANK_TABLE_FILE = "ge.exec.rankTableFile"; const char *const GE_AICPU_FLAG = "ge.aicpuFlag"; const char *const OPTION_EXEC_EXTERN_PLUGIN_PATH = "ge.soLoadPath"; + +const std::string OPTION_EXEC_CM_CHIEF_IP = "ge.cmChiefIp"; +const std::string OPTION_EXEC_CM_CHIEF_PORT = "ge.cmChiefPort"; +const std::string OPTION_EXEC_CM_CHIEF_DEVICE = "ge.cmChiefWorkerDevice"; +const std::string OPTION_EXEC_CM_WORKER_IP = "ge.cmWorkerIp"; +const std::string OPTION_EXEC_CM_WORKER_SIZE = "ge.cmWorkerSize"; + // Dump flag and para const char *const OPTION_EXEC_ENABLE_DUMP = "ge.exec.enableDump"; const char *const OPTION_EXEC_DUMP_PATH = "ge.exec.dumpPath"; -- Gitee