diff --git a/inc/graphengine/inc/external/ge/ge_api_types.h b/inc/graphengine/inc/external/ge/ge_api_types.h index 8efcef921a153c5d067d46dde6b88996c3699042..e2d720bef42f733a0cb7a472b1d84d105643004a 100644 --- a/inc/graphengine/inc/external/ge/ge_api_types.h +++ b/inc/graphengine/inc/external/ge/ge_api_types.h @@ -43,6 +43,13 @@ const char *const OPTION_EXEC_DEPLOY_MODE = "ge.exec.deployMode"; const char *const OPTION_EXEC_RANK_TABLE_FILE = "ge.exec.rankTableFile"; const char *const GE_AICPU_FLAG = "ge.aicpuFlag"; const char *const OPTION_EXEC_EXTERN_PLUGIN_PATH = "ge.soLoadPath"; + +const std::string OPTION_EXEC_CM_CHIEF_IP = "ge.cmChiefIp"; +const std::string OPTION_EXEC_CM_CHIEF_PORT = "ge.cmChiefPort"; +const std::string OPTION_EXEC_CM_CHIEF_DEVICE = "ge.cmChiefWorkerDevice"; +const std::string OPTION_EXEC_CM_WORKER_IP = "ge.cmWorkerIp"; +const std::string OPTION_EXEC_CM_WORKER_SIZE = "ge.cmWorkerSize"; + // Dump flag and para const char *const OPTION_EXEC_ENABLE_DUMP = "ge.exec.enableDump"; const char *const OPTION_EXEC_DUMP_PATH = "ge.exec.dumpPath"; diff --git a/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp index 7ffadcdd7b41d55992a34c5fab50dac8e6caabe0..7e15177529ffcabc8a843a456110df4c6f0f5c9f 100644 --- a/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp +++ b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp @@ -117,7 +117,12 @@ const std::map kConfigurableOptions = { {"graph_parallel_option_path", "ge.graphParallelOptionPath"}, {"enable_graph_parallel", "ge.enableGraphParallel"}, {"atomic_clean_policy", "ge.exec.atomicCleanPolicy"}, - {"static_memory_policy", "ge.exec.staticMemoryPolicy"}}; + {"static_memory_policy", "ge.exec.staticMemoryPolicy"}, + {"_distribute.cm_chief_ip", ge::OPTION_EXEC_CM_CHIEF_IP}, + {"_distribute.cm_chief_port", ge::OPTION_EXEC_CM_CHIEF_PORT}, + {"_distribute.cm_chief_worker_device", ge::OPTION_EXEC_CM_CHIEF_DEVICE}, + {"_distribute.cm_worker_ip", ge::OPTION_EXEC_CM_WORKER_IP}, + {"_distribute.cm_worker_size", ge::OPTION_EXEC_CM_WORKER_SIZE}}; } // namespace #undef PYBIND11_CHECK_PYTHON_VERSION @@ -147,6 +152,16 @@ void ParseGlobalOptions(int device_index, const std::map 1: + env_cm_chief_ip = os.getenv("CM_CHIEF_IP") env_rank_table = os.getenv("RANK_TABLE_FILE") - env_worker_id = os.getenv('RANK_ID') - if not env_rank_table: - raise RuntimeError('You must specify a rank table file by set env RANK_TABLE_FILE in distribution mode') - - if not env_worker_id: - raise RuntimeError('You must specify rank id by set env RANK_ID in distribution mode') - - global_kw_options['_distribute.rank_table'] = env_rank_table - global_kw_options['_distribute.rank_id'] = env_worker_id + if env_cm_chief_ip is not None and env_rank_table is not None: + raise RuntimeError('CM_CHIEF_IP and RANK_TABLE_FILE cannot be configured at the same time.') + elif env_cm_chief_ip is not None: + set_cm_chief_worksize_env(global_kw_options, env_cm_chief_ip, workers_num) + elif env_rank_table is not None: + set_rank_table_file_env(global_kw_options, env_rank_table) + else: + raise RuntimeError('CM_CHIEF_IP and RANK_TABLE_FILE are all not be configured.') device_options = {} error_message = _npu_device_backends.Open(context.context()._handle, NPU, device_id, global_kw_options, diff --git a/tf_adapter_2.x/tests/st/adapter2_st.py b/tf_adapter_2.x/tests/st/adapter2_st.py index 1b28a15cba38c9c6e7bc8a7c740425f735ce74e6..03663ac704bf1ef0195b8e05a01ea9c7eb66b628 100644 --- a/tf_adapter_2.x/tests/st/adapter2_st.py +++ b/tf_adapter_2.x/tests/st/adapter2_st.py @@ -35,9 +35,11 @@ npu_device.global_options().experimental.multi_branches_config.dynamic_node_type npu_device.global_options().experimental.multi_branches_config.dynamic_dims = "1;2" npu_device.global_options().aoe_config.work_path = "./" npu_device.global_options().graph_run_mode = 0 -os.environ['RANK_TABLE_FILE'] = "rankTable" -os.environ['RANK_SIZE'] = "2" -os.environ['RANK_ID'] = "1" +os.environ['CM_CHIEF_IP'] = "1" +os.environ['CM_CHIEF_PORT'] = "3" +os.environ['CM_CHIEF_DEVICE'] = "4" +os.environ['CM_WORKER_SIZE'] = "2" +os.environ['CM_WORKER_IP'] = "123" npu = npu_device.open().as_default() npu.workers_num = 2 # mock run in 2P env diff --git a/tf_adapter_2.x/tests/stub/include/stub/defines.h b/tf_adapter_2.x/tests/stub/include/stub/defines.h index 5be2c1243507db9bf8e99b17af920d5bac39060d..653de3c59dc2b81e91fda1a7704b091bca4d4ae4 100644 --- a/tf_adapter_2.x/tests/stub/include/stub/defines.h +++ b/tf_adapter_2.x/tests/stub/include/stub/defines.h @@ -25,6 +25,13 @@ const char *const OPTION_EXEC_DEPLOY_MODE = "ge.exec.deployMode"; const char *const OPTION_EXEC_RANK_TABLE_FILE = "ge.exec.rankTableFile"; const char *const GE_AICPU_FLAG = "ge.aicpuFlag"; const char *const OPTION_EXEC_EXTERN_PLUGIN_PATH = "ge.soLoadPath"; + +const std::string OPTION_EXEC_CM_CHIEF_IP = "ge.cmChiefIp"; +const std::string OPTION_EXEC_CM_CHIEF_PORT = "ge.cmChiefPort"; +const std::string OPTION_EXEC_CM_CHIEF_DEVICE = "ge.cmChiefWorkerDevice"; +const std::string OPTION_EXEC_CM_WORKER_IP = "ge.cmWorkerIp"; +const std::string OPTION_EXEC_CM_WORKER_SIZE = "ge.cmWorkerSize"; + // Dump flag and para const char *const OPTION_EXEC_ENABLE_DUMP = "ge.exec.enableDump"; const char *const OPTION_EXEC_DUMP_PATH = "ge.exec.dumpPath";