From 37a47465dfbbba7d29a6b17db75783612fc42eec Mon Sep 17 00:00:00 2001 From: huangyunlong Date: Mon, 11 Aug 2025 19:45:44 +0800 Subject: [PATCH 1/3] lazy setdevice --- torch_npu/acl.json | 2 +- .../csrc/core/npu/NPUCachingAllocator.cpp | 2 +- torch_npu/csrc/core/npu/NPUEvent.cpp | 2 +- torch_npu/csrc/core/npu/NPUFunctions.cpp | 37 +++++------- torch_npu/csrc/core/npu/NPUFunctions.h | 2 +- torch_npu/csrc/core/npu/NPUStream.cpp | 4 +- .../csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 59 +++++++++++++------ .../csrc/core/npu/sys_ctrl/npu_sys_ctrl.h | 7 +++ torch_npu/csrc/libs/init_npu.cpp | 9 +++ 9 files changed, 79 insertions(+), 45 deletions(-) diff --git a/torch_npu/acl.json b/torch_npu/acl.json index 8a77faac4f..b676ee4748 100644 --- a/torch_npu/acl.json +++ b/torch_npu/acl.json @@ -1 +1 @@ -{"dump":{"dump_scene":"lite_exception"}} \ No newline at end of file +{"dump":{"dump_scene":"lite_exception"}, "defaultDevice":{"default_device":"0"}} \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index a18a9a9a56..5a92fb95ab 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -3523,7 +3523,7 @@ public: } int device = 0; NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); - LazySetDevice(); + LazySetDevice(device); void *devPtr = nullptr; void (*deleteFunc)(void *) = &local_raw_delete; diff --git a/torch_npu/csrc/core/npu/NPUEvent.cpp b/torch_npu/csrc/core/npu/NPUEvent.cpp index 47483ea61d..74c0b10161 100644 --- a/torch_npu/csrc/core/npu/NPUEvent.cpp +++ b/torch_npu/csrc/core/npu/NPUEvent.cpp @@ -180,7 +180,7 @@ void NPUEvent::createEvent(c10::DeviceIndex device_index) { device_index_ = device_index; NPUGuard guard(device_index_); - LazySetDevice(); + LazySetDevice(device_index_); NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::acl::AclrtCreateEventWithFlag(&event_, flags_)); ASCEND_LOGI("Event: aclrtCreateEventWithFlag is successfully executed, event=%p", event_); #ifndef BUILD_LIBTORCH diff --git a/torch_npu/csrc/core/npu/NPUFunctions.cpp b/torch_npu/csrc/core/npu/NPUFunctions.cpp index 441069eeda..5e6c1386fe 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.cpp +++ b/torch_npu/csrc/core/npu/NPUFunctions.cpp @@ -58,9 +58,8 @@ aclError GetDevice(int32_t *device) if (err != ACL_ERROR_NONE) { CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(err); } - if (err == ACL_ERROR_NONE) { - local_device = *device; - } else if (err == ACL_ERROR_RT_CONTEXT_NULL) { + // before call aclinit with defaultdevice + if (err == ACL_ERROR_RT_CONTEXT_NULL) { *device = 0; return ACL_ERROR_NONE; } @@ -82,9 +81,8 @@ aclError GetDeviceWithoutSet(int32_t *device) if (err != ACL_ERROR_NONE) { CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(err); } - if (err == ACL_ERROR_NONE) { - local_device = *device; - } else if (err == ACL_ERROR_RT_CONTEXT_NULL) { + // before call aclinit with defaultdevice + if (err == ACL_ERROR_RT_CONTEXT_NULL) { *device = -1; return ACL_ERROR_NONE; } @@ -269,18 +267,7 @@ bool IsContextInitialized() return true; } - int32_t device = -1; - aclError err = aclrtGetDevice(&device); - if (err == ACL_ERROR_NONE) { - return true; - } else { - CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(err); - if (err == ACL_ERROR_RT_CONTEXT_NULL) { - return false; - } - NPU_CHECK_ERROR_WITHOUT_UCE(err); - return false; - } + return false; } int GetLocalDevice() @@ -288,10 +275,18 @@ int GetLocalDevice() return local_device; } -void LazySetDevice() +void LazySetDevice(c10::DeviceIndex device) { - if (local_device < 0) { - NPU_CHECK_ERROR_WITHOUT_UCE(SetDevice(0)); + if (local_device != device) { + aclError err = aclrtSetDevice(device); + if (err == ACL_ERROR_NONE) { + local_device = device; + std::lock_guard lock(mtx); + if (used_devices.find(local_device) == used_devices.end()) { + NPU_CHECK_ERROR_WITHOUT_UCE(aclrtGetCurrentContext(&used_devices[local_device])); + } + } + NPU_CHECK_ERROR_WITHOUT_UCE(err); } } diff --git a/torch_npu/csrc/core/npu/NPUFunctions.h b/torch_npu/csrc/core/npu/NPUFunctions.h index 45f2382680..4c320c9b2a 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.h +++ b/torch_npu/csrc/core/npu/NPUFunctions.h @@ -81,7 +81,7 @@ int MaybeExchangeDevice(int to_device); void SetTargetDevice(); -void LazySetDevice(); +void LazySetDevice(c10::DeviceIndex device); int GetLocalDevice(); diff --git a/torch_npu/csrc/core/npu/NPUStream.cpp b/torch_npu/csrc/core/npu/NPUStream.cpp index 4b92bce957..e499474a6f 100644 --- a/torch_npu/csrc/core/npu/NPUStream.cpp +++ b/torch_npu/csrc/core/npu/NPUStream.cpp @@ -15,6 +15,7 @@ #include "torch_npu/csrc/core/npu/NPUQueue.h" #include "torch_npu/csrc/core/npu/NPUException.h" #include "torch_npu/csrc/core/npu/register/OptionsManager.h" +#include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" #include "torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.h" #include "third_party/acl/inc/acl/acl_rt.h" #ifndef BUILD_LIBTORCH @@ -231,7 +232,8 @@ static void initNPUStreamsOnce() c10::DeviceIndex device_index = current_device(); // makesure on real devcie SetTargetDevice(); - LazySetDevice(); + LazySetDevice(device_index); + c10_npu::NpuSysCtrl::GetInstance().LazyInitialize(); if (!initialize_flag[device_index]) { std::lock_guard lock(mtx[device_index]); if (!initialize_flag[device_index]) { diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index 5f348e029c..3daa1848ae 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -95,7 +95,7 @@ std::string GetAclConfigJsonPath() namespace c10_npu { -NpuSysCtrl::NpuSysCtrl() : repeat_init_acl_flag_(true), init_flag_(false), device_id_(0) {} +NpuSysCtrl::NpuSysCtrl() : repeat_init_acl_flag_(true), init_flag_(false), lazy_init_flag_(false), device_id_(0) {} // Get NpuSysCtrl singleton instance NpuSysCtrl &NpuSysCtrl::GetInstance() @@ -139,15 +139,6 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) ASCEND_LOGD("Npu workspace allocator initialize successfully"); c10_npu::option::OptionsManager::IsOomSnapshotEnable(); - // There's no need to call c10_npu::GetDevice at the start of the process, because device 0 may not be needed - auto ret = aclrtGetDevice(&device_id_); - if (ret != ACL_ERROR_NONE) { - device_id_ = (device_id == -1) ? 0 : device_id; - NPU_CHECK_ERROR(c10_npu::SetDevice(device_id_)); - } else { - ASCEND_LOGW("Npu device %d has been set before global init.", device_id_); - } - if (c10_npu::option::OptionsManager::CheckAclDumpDateEnable()) { const char *aclConfigPath = "acl.json"; NPU_CHECK_ERROR(aclmdlSetDump(aclConfigPath)); @@ -158,12 +149,6 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) // set global soc name c10_npu::SetSocVersion(soc_name); - if (c10_npu::IsSupportInfNan()) { - c10_npu::acl::AclrtSetDeviceSatMode(aclrtFloatOverflowMode::ACL_RT_OVERFLOW_MODE_INFNAN); - } else { - c10_npu::acl::AclrtSetDeviceSatMode(aclrtFloatOverflowMode::ACL_RT_OVERFLOW_MODE_SATURATION); - } - auto acl_op_init_mode = c10_npu::option::OptionsManager::GetAclOpInitMode(); if (acl_op_init_mode == 0) { at_npu::aclops::InitAclops(); @@ -176,9 +161,6 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) SetDefaultAllowInternalFromatDisable(); } - NPU_CHECK_ERROR(at_npu::native::AclrtCtxSetSysParamOpt(aclSysParamOpt::ACL_OPT_DETERMINISTIC, 0)); - NPU_CHECK_ERROR(c10_npu::acl::AclrtSetOpExecuteTimeOut(kMaxOpExecuteTimeOut)); - // lazy call for the setoption for (const auto &iter: lazy_fn_) { ASCEND_LOGD("start setoption for the lazy call."); @@ -200,6 +182,40 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) return INIT_SUCC; } +NpuSysCtrl::SysStatus NpuSysCtrl::LazyInitialize(int device_id) +{ + if (lazy_init_flag_) { + return INIT_SUCC; + } + std::lock_guard lock(lazy_init_mutex_); + if (lazy_init_flag_) { + return INIT_SUCC; + } + + // There's no need to call c10_npu::GetDevice at the start of the process, because device 0 may not be needed + auto ret = aclrtGetDevice(&device_id_); + if (ret != ACL_ERROR_NONE) { + device_id_ = (device_id == -1) ? 0 : device_id; + NPU_CHECK_ERROR(c10_npu::SetDevice(device_id_)); + } else { + ASCEND_LOGW("Npu device %d has been set before global init.", device_id_); + } + + if (c10_npu::IsSupportInfNan()) { + c10_npu::acl::AclrtSetDeviceSatMode(aclrtFloatOverflowMode::ACL_RT_OVERFLOW_MODE_INFNAN); + } else { + c10_npu::acl::AclrtSetDeviceSatMode(aclrtFloatOverflowMode::ACL_RT_OVERFLOW_MODE_SATURATION); + } + + NPU_CHECK_ERROR(at_npu::native::AclrtCtxSetSysParamOpt(aclSysParamOpt::ACL_OPT_DETERMINISTIC, 0)); + NPU_CHECK_ERROR(c10_npu::acl::AclrtSetOpExecuteTimeOut(kMaxOpExecuteTimeOut)); + + lazy_init_flag_ = true; + ASCEND_LOGD("Npu sys ctrl Lazyinitialize successfully."); + + return INIT_SUCC; +} + NpuSysCtrl::SysStatus NpuSysCtrl::ExchangeDevice(int device) { NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::SetDevice(device)); @@ -266,6 +282,11 @@ bool NpuSysCtrl::GetInitFlag() return init_flag_; } +bool NpuSysCtrl::GetLazyInitFlag() +{ + return lazy_init_flag_; +} + int NpuSysCtrl::InitializedDeviceID() { if (GetInitFlag()) { diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h index 8cb1634fd6..e14026c785 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h @@ -42,6 +42,8 @@ public: // Environment Initialize, return SysStatus SysStatus Initialize(int device_id = -1); + SysStatus LazyInitialize(int device_id = -1); + // Change current device from pre_device to device SysStatus ExchangeDevice(int device); @@ -57,6 +59,9 @@ public: // Get Init_flag C10_NPU_API bool GetInitFlag(); + // Get lazyInit_flag + bool GetLazyInitFlag(); + int InitializedDeviceID(); void RegisterLazyFn(const option::OptionCallBack &call_, const std::string &in); @@ -71,10 +76,12 @@ private: private: bool repeat_init_acl_flag_; bool init_flag_; + bool lazy_init_flag_; int device_id_; std::map> release_fn_; std::vector> lazy_fn_; std::mutex init_mutex_; + std::mutex lazy_init_mutex_; }; aclError SetCurrentDevice(); diff --git a/torch_npu/csrc/libs/init_npu.cpp b/torch_npu/csrc/libs/init_npu.cpp index bc8ca3eee9..b6f2399b8b 100644 --- a/torch_npu/csrc/libs/init_npu.cpp +++ b/torch_npu/csrc/libs/init_npu.cpp @@ -23,6 +23,15 @@ void init_npu(const c10::DeviceIndex device_index) C10_NPU_SHOW_ERR_MSG(); return; } + if (!c10_npu::NpuSysCtrl::GetInstance().GetLazyInitFlag()) { + c10_npu::LazySetDevice(device_index); + c10_npu::NpuSysCtrl::SysStatus lazystatus = + c10_npu::NpuSysCtrl::GetInstance().LazyInitialize((int)device_index); + if (lazystatus != c10_npu::NpuSysCtrl::SysStatus::INIT_SUCC) { + C10_NPU_SHOW_ERR_MSG(); + return; + } + } } -- Gitee From 31ab13c220c386e2b1498703a6661b29894b00a6 Mon Sep 17 00:00:00 2001 From: huangyunlong Date: Mon, 25 Aug 2025 09:32:49 +0800 Subject: [PATCH 2/3] set the specified device --- torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index 3daa1848ae..9525195bcf 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -139,6 +139,10 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) ASCEND_LOGD("Npu workspace allocator initialize successfully"); c10_npu::option::OptionsManager::IsOomSnapshotEnable(); + if (device_id >= 0) { + NPU_CHECK_ERROR(c10_npu::SetDevice(device_id)); + } + if (c10_npu::option::OptionsManager::CheckAclDumpDateEnable()) { const char *aclConfigPath = "acl.json"; NPU_CHECK_ERROR(aclmdlSetDump(aclConfigPath)); @@ -194,12 +198,6 @@ NpuSysCtrl::SysStatus NpuSysCtrl::LazyInitialize(int device_id) // There's no need to call c10_npu::GetDevice at the start of the process, because device 0 may not be needed auto ret = aclrtGetDevice(&device_id_); - if (ret != ACL_ERROR_NONE) { - device_id_ = (device_id == -1) ? 0 : device_id; - NPU_CHECK_ERROR(c10_npu::SetDevice(device_id_)); - } else { - ASCEND_LOGW("Npu device %d has been set before global init.", device_id_); - } if (c10_npu::IsSupportInfNan()) { c10_npu::acl::AclrtSetDeviceSatMode(aclrtFloatOverflowMode::ACL_RT_OVERFLOW_MODE_INFNAN); -- Gitee From a7be2846950a491548336677c5f33cfa637289d9 Mon Sep 17 00:00:00 2001 From: huangyunlong Date: Mon, 1 Sep 2025 15:34:19 +0800 Subject: [PATCH 3/3] compatible --- setup.py | 2 +- torch_npu/acl.json | 2 +- torch_npu/acl_default.json | 1 + torch_npu/csrc/core/npu/NPUFunctions.cpp | 37 +++++++++++++++++- torch_npu/csrc/core/npu/NPUFunctions.h | 2 + .../csrc/core/npu/register/OptionsManager.cpp | 9 +++++ .../csrc/core/npu/register/OptionsManager.h | 1 + .../csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 39 +++++++++++++++++-- torch_npu/csrc/libs/init_npu.cpp | 2 +- torch_npu/dynamo/__init__.py | 1 + 10 files changed, 89 insertions(+), 7 deletions(-) create mode 100644 torch_npu/acl_default.json diff --git a/setup.py b/setup.py index eff8b58193..9af79786cf 100644 --- a/setup.py +++ b/setup.py @@ -426,7 +426,7 @@ def get_src_py_and_dst(): recursive=True) + glob.glob( os.path.join(BASE_DIR, "torch_npu", '**/*.yaml'), recursive=True) + glob.glob( - os.path.join(BASE_DIR, "torch_npu", 'acl.json'), + os.path.join(BASE_DIR, "torch_npu", 'acl*.json'), recursive=True) + glob.glob( os.path.join(BASE_DIR, "torch_npu", 'contrib/apis_config.json'), recursive=True) diff --git a/torch_npu/acl.json b/torch_npu/acl.json index b676ee4748..8a77faac4f 100644 --- a/torch_npu/acl.json +++ b/torch_npu/acl.json @@ -1 +1 @@ -{"dump":{"dump_scene":"lite_exception"}, "defaultDevice":{"default_device":"0"}} \ No newline at end of file +{"dump":{"dump_scene":"lite_exception"}} \ No newline at end of file diff --git a/torch_npu/acl_default.json b/torch_npu/acl_default.json new file mode 100644 index 0000000000..b676ee4748 --- /dev/null +++ b/torch_npu/acl_default.json @@ -0,0 +1 @@ +{"dump":{"dump_scene":"lite_exception"}, "defaultDevice":{"default_device":"0"}} \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUFunctions.cpp b/torch_npu/csrc/core/npu/NPUFunctions.cpp index 5e6c1386fe..9644d0575a 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.cpp +++ b/torch_npu/csrc/core/npu/NPUFunctions.cpp @@ -18,6 +18,16 @@ static std::unordered_map used_devices; std::recursive_mutex mtx; thread_local int targetDeviceIndex = -1; +bool is_lazy_set_device() +{ + static bool is_lazy_set = []() { + bool lazy_val = c10_npu::option::OptionsManager::LazySetDevice(); + ASCEND_LOGW("is_lazy_set_device %d", lazy_val); + return lazy_val; + }(); + return is_lazy_set; +} + c10::DeviceIndex device_count() noexcept { // initialize number of devices only once @@ -58,6 +68,11 @@ aclError GetDevice(int32_t *device) if (err != ACL_ERROR_NONE) { CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(err); } + if (!is_lazy_set_device()) { + if (err == ACL_ERROR_NONE) { + local_device = *device; + } + } // before call aclinit with defaultdevice if (err == ACL_ERROR_RT_CONTEXT_NULL) { *device = 0; @@ -81,6 +96,11 @@ aclError GetDeviceWithoutSet(int32_t *device) if (err != ACL_ERROR_NONE) { CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(err); } + if (!is_lazy_set_device()) { + if (err == ACL_ERROR_NONE) { + local_device = *device; + } + } // before call aclinit with defaultdevice if (err == ACL_ERROR_RT_CONTEXT_NULL) { *device = -1; @@ -267,7 +287,22 @@ bool IsContextInitialized() return true; } - return false; + if (is_lazy_set_device()) { + return false; + } + + int32_t device = -1; + aclError err = aclrtGetDevice(&device); + if (err == ACL_ERROR_NONE) { + return true; + } else { + CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(err); + if (err == ACL_ERROR_RT_CONTEXT_NULL) { + return false; + } + NPU_CHECK_ERROR_WITHOUT_UCE(err); + return false; + } } int GetLocalDevice() diff --git a/torch_npu/csrc/core/npu/NPUFunctions.h b/torch_npu/csrc/core/npu/NPUFunctions.h index 4c320c9b2a..070c7f9417 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.h +++ b/torch_npu/csrc/core/npu/NPUFunctions.h @@ -18,6 +18,8 @@ namespace c10_npu { +bool is_lazy_set_device(); + C10_NPU_API c10::DeviceIndex device_count() noexcept; C10_NPU_API c10::DeviceIndex device_count_ensure_non_zero(); diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.cpp b/torch_npu/csrc/core/npu/register/OptionsManager.cpp index ab0384eaa5..13454bc47e 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.cpp +++ b/torch_npu/csrc/core/npu/register/OptionsManager.cpp @@ -631,5 +631,14 @@ bool OptionsManager::IsCompactErrorOutput() return should_print; } +bool OptionsManager::LazySetDevice() +{ + static bool lazy_set = []() -> bool { + int32_t env_val = OptionsManager::GetBoolTypeOption("LAZY_SET_DEVICE", 1); + return env_val != 0; + }(); + return lazy_set; +} + } // namespace option } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.h b/torch_npu/csrc/core/npu/register/OptionsManager.h index feb33f6ca7..49659639fc 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.h +++ b/torch_npu/csrc/core/npu/register/OptionsManager.h @@ -133,6 +133,7 @@ public: static bool IsOomSnapshotEnable(); static bool ShouldPrintWarning(); static bool IsCompactErrorOutput(); + static bool LazySetDevice(); private: static int GetBoolTypeOption(const char* env_str, int defaultVal = 0); diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index 9525195bcf..1addf16176 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -81,7 +81,12 @@ std::string GetAclConfigJsonPath() ASCEND_LOGW("Failed to get npu path!"); return ""; } - std::string json_path = npu_path.append("torch_npu/acl.json"); + std::string json_path = ""; + if (c10_npu::is_lazy_set_device()) { + json_path = npu_path.append("torch_npu/acl_default.json"); + } else { + json_path = npu_path.append("torch_npu/acl.json"); + } std::string json_path_str = torch_npu::toolkit::profiler::Utils::RealPath(json_path); if (json_path_str == "") { ASCEND_LOGW("this path:%s is not exist!", json_path.c_str()); @@ -139,8 +144,19 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) ASCEND_LOGD("Npu workspace allocator initialize successfully"); c10_npu::option::OptionsManager::IsOomSnapshotEnable(); - if (device_id >= 0) { - NPU_CHECK_ERROR(c10_npu::SetDevice(device_id)); + if (!c10_npu::is_lazy_set_device()) { + // There's no need to call c10_npu::GetDevice at the start of the process, because device 0 may not be needed + auto ret = aclrtGetDevice(&device_id_); + if (ret != ACL_ERROR_NONE) { + device_id_ = (device_id == -1) ? 0 : device_id; + NPU_CHECK_ERROR(c10_npu::SetDevice(device_id_)); + } else { + ASCEND_LOGW("Npu device %d has been set before global init.", device_id_); + } + } else { + if (device_id >= 0) { + NPU_CHECK_ERROR(c10_npu::SetDevice(device_id)); + } } if (c10_npu::option::OptionsManager::CheckAclDumpDateEnable()) { @@ -153,6 +169,14 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) // set global soc name c10_npu::SetSocVersion(soc_name); + if (!c10_npu::is_lazy_set_device()) { + if (c10_npu::IsSupportInfNan()) { + c10_npu::acl::AclrtSetDeviceSatMode(aclrtFloatOverflowMode::ACL_RT_OVERFLOW_MODE_INFNAN); + } else { + c10_npu::acl::AclrtSetDeviceSatMode(aclrtFloatOverflowMode::ACL_RT_OVERFLOW_MODE_SATURATION); + } + } + auto acl_op_init_mode = c10_npu::option::OptionsManager::GetAclOpInitMode(); if (acl_op_init_mode == 0) { at_npu::aclops::InitAclops(); @@ -165,6 +189,11 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) SetDefaultAllowInternalFromatDisable(); } + if (!c10_npu::is_lazy_set_device()) { + NPU_CHECK_ERROR(at_npu::native::AclrtCtxSetSysParamOpt(aclSysParamOpt::ACL_OPT_DETERMINISTIC, 0)); + NPU_CHECK_ERROR(c10_npu::acl::AclrtSetOpExecuteTimeOut(kMaxOpExecuteTimeOut)); + } + // lazy call for the setoption for (const auto &iter: lazy_fn_) { ASCEND_LOGD("start setoption for the lazy call."); @@ -188,6 +217,10 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) NpuSysCtrl::SysStatus NpuSysCtrl::LazyInitialize(int device_id) { + if (!c10_npu::is_lazy_set_device()) { + return INIT_SUCC; + } + if (lazy_init_flag_) { return INIT_SUCC; } diff --git a/torch_npu/csrc/libs/init_npu.cpp b/torch_npu/csrc/libs/init_npu.cpp index b6f2399b8b..2d94422314 100644 --- a/torch_npu/csrc/libs/init_npu.cpp +++ b/torch_npu/csrc/libs/init_npu.cpp @@ -23,7 +23,7 @@ void init_npu(const c10::DeviceIndex device_index) C10_NPU_SHOW_ERR_MSG(); return; } - if (!c10_npu::NpuSysCtrl::GetInstance().GetLazyInitFlag()) { + if (c10_npu::is_lazy_set_device() && !c10_npu::NpuSysCtrl::GetInstance().GetLazyInitFlag()) { c10_npu::LazySetDevice(device_index); c10_npu::NpuSysCtrl::SysStatus lazystatus = c10_npu::NpuSysCtrl::GetInstance().LazyInitialize((int)device_index); diff --git a/torch_npu/dynamo/__init__.py b/torch_npu/dynamo/__init__.py index 95be98be63..1b5bb49d0a 100644 --- a/torch_npu/dynamo/__init__.py +++ b/torch_npu/dynamo/__init__.py @@ -75,6 +75,7 @@ class _LazyTorchair: try: from . import torchair + os.environ["LAZY_SET_DEVICE"] = "0" except Exception as e: # In cpython, default import loader will suppress error when # find module's __spec__. So here we need to record error and -- Gitee