diff --git a/test/npu/test_fault_mode.py b/test/npu/test_fault_mode.py index 6877fced8f5c4cf0831c663ccb5e81527db3e6f1..6098f52ece55b6e02eba1cead1afca90a9862129 100644 --- a/test/npu/test_fault_mode.py +++ b/test/npu/test_fault_mode.py @@ -173,7 +173,7 @@ class TestMode(TestCase): message ) self.assertIn( - "Initialize", + "SetDevice", message ) diff --git a/torch_npu/acl.json b/torch_npu/acl.json index 8a77faac4fa153432b380a418d81361586790f59..b676ee4748bdc6bbeb23c2053d260830f48d4dec 100644 --- a/torch_npu/acl.json +++ b/torch_npu/acl.json @@ -1 +1 @@ -{"dump":{"dump_scene":"lite_exception"}} \ No newline at end of file +{"dump":{"dump_scene":"lite_exception"}, "defaultDevice":{"default_device":"0"}} \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index c81abfd01b76539f773809cea75c26ff9603abbc..7ddfc7a650cbadab4ff5c0d8a50835e096a4d4fc 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -3484,7 +3484,7 @@ public: } int device = 0; NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); - LazySetDevice(); + LazySetDevice(device); void *devPtr = nullptr; void (*deleteFunc)(void *) = &local_raw_delete; diff --git a/torch_npu/csrc/core/npu/NPUEvent.cpp b/torch_npu/csrc/core/npu/NPUEvent.cpp index 1a7e126f36296d4d2867ea2b0e3daf35f38c6f85..04dc4e4cbb5bb471d363240bce4e7698e7370b28 100644 --- a/torch_npu/csrc/core/npu/NPUEvent.cpp +++ b/torch_npu/csrc/core/npu/NPUEvent.cpp @@ -180,7 +180,7 @@ void NPUEvent::createEvent(c10::DeviceIndex device_index) { device_index_ = device_index; NPUGuard guard(device_index_); - LazySetDevice(); + LazySetDevice(device_index_); NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::acl::AclrtCreateEventWithFlag(&event_, flags_)); ASCEND_LOGI("Event: aclrtCreateEventWithFlag is successfully executed, event=%p", event_); #ifndef BUILD_LIBTORCH diff --git a/torch_npu/csrc/core/npu/NPUFunctions.cpp b/torch_npu/csrc/core/npu/NPUFunctions.cpp index 441069eedab8ab86cfeff33a783ce0b419caab3a..5e6c1386fe390789079f6ef7e767564a4dcd8282 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.cpp +++ b/torch_npu/csrc/core/npu/NPUFunctions.cpp @@ -58,9 +58,8 @@ aclError GetDevice(int32_t *device) if (err != ACL_ERROR_NONE) { CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(err); } - if (err == ACL_ERROR_NONE) { - local_device = *device; - } else if (err == ACL_ERROR_RT_CONTEXT_NULL) { + // before call aclinit with defaultdevice + if (err == ACL_ERROR_RT_CONTEXT_NULL) { *device = 0; return ACL_ERROR_NONE; } @@ -82,9 +81,8 @@ aclError GetDeviceWithoutSet(int32_t *device) if (err != ACL_ERROR_NONE) { CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(err); } - if (err == ACL_ERROR_NONE) { - local_device = *device; - } else if (err == ACL_ERROR_RT_CONTEXT_NULL) { + // before call aclinit with defaultdevice + if (err == ACL_ERROR_RT_CONTEXT_NULL) { *device = -1; return ACL_ERROR_NONE; } @@ -269,18 +267,7 @@ bool IsContextInitialized() return true; } - int32_t device = -1; - aclError err = aclrtGetDevice(&device); - if (err == ACL_ERROR_NONE) { - return true; - } else { - CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(err); - if (err == ACL_ERROR_RT_CONTEXT_NULL) { - return false; - } - NPU_CHECK_ERROR_WITHOUT_UCE(err); - return false; - } + return false; } int GetLocalDevice() @@ -288,10 +275,18 @@ int GetLocalDevice() return local_device; } -void LazySetDevice() +void LazySetDevice(c10::DeviceIndex device) { - if (local_device < 0) { - NPU_CHECK_ERROR_WITHOUT_UCE(SetDevice(0)); + if (local_device != device) { + aclError err = aclrtSetDevice(device); + if (err == ACL_ERROR_NONE) { + local_device = device; + std::lock_guard lock(mtx); + if (used_devices.find(local_device) == used_devices.end()) { + NPU_CHECK_ERROR_WITHOUT_UCE(aclrtGetCurrentContext(&used_devices[local_device])); + } + } + NPU_CHECK_ERROR_WITHOUT_UCE(err); } } diff --git a/torch_npu/csrc/core/npu/NPUFunctions.h b/torch_npu/csrc/core/npu/NPUFunctions.h index 45f23826808edb5d1a6d9f0b142f2dfb37c2792f..4c320c9b2a40e5fa8147a1b495f5f6036c7b3895 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.h +++ b/torch_npu/csrc/core/npu/NPUFunctions.h @@ -81,7 +81,7 @@ int MaybeExchangeDevice(int to_device); void SetTargetDevice(); -void LazySetDevice(); +void LazySetDevice(c10::DeviceIndex device); int GetLocalDevice(); diff --git a/torch_npu/csrc/core/npu/NPUStream.cpp b/torch_npu/csrc/core/npu/NPUStream.cpp index 551c108ce55a883d1a8141302ddb39a614b8f448..289bef3268d5fbbf594436d5c83013a980d157d0 100644 --- a/torch_npu/csrc/core/npu/NPUStream.cpp +++ b/torch_npu/csrc/core/npu/NPUStream.cpp @@ -15,6 +15,7 @@ #include "torch_npu/csrc/core/npu/NPUQueue.h" #include "torch_npu/csrc/core/npu/NPUException.h" #include "torch_npu/csrc/core/npu/register/OptionsManager.h" +#include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" #include "torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.h" #include "third_party/acl/inc/acl/acl_rt.h" #ifndef BUILD_LIBTORCH @@ -231,7 +232,8 @@ static void initNPUStreamsOnce() c10::DeviceIndex device_index = current_device(); // makesure on real devcie SetTargetDevice(); - LazySetDevice(); + LazySetDevice(device_index); + c10_npu::NpuSysCtrl::GetInstance().LazyInitialize(); if (!initialize_flag[device_index]) { std::lock_guard lock(mtx[device_index]); if (!initialize_flag[device_index]) { diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index 4b6707b8495b9a89c500e06f160b42950e9ae6fb..5175ae88bb978cc47dd92eeef7d10ca7b2c94ac9 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -95,7 +95,7 @@ std::string GetAclConfigJsonPath() namespace c10_npu { -NpuSysCtrl::NpuSysCtrl() : repeat_init_acl_flag_(true), init_flag_(false), device_id_(0) {} +NpuSysCtrl::NpuSysCtrl() : repeat_init_acl_flag_(true), init_flag_(false), lazy_init_flag_(false), device_id_(0) {} // Get NpuSysCtrl singleton instance NpuSysCtrl &NpuSysCtrl::GetInstance() @@ -139,15 +139,6 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) ASCEND_LOGD("Npu workspace allocator initialize successfully"); c10_npu::option::OptionsManager::IsOomSnapshotEnable(); - // There's no need to call c10_npu::GetDevice at the start of the process, because device 0 may not be needed - auto ret = aclrtGetDevice(&device_id_); - if (ret != ACL_ERROR_NONE) { - device_id_ = (device_id == -1) ? 0 : device_id; - NPU_CHECK_ERROR(c10_npu::SetDevice(device_id_)); - } else { - ASCEND_LOGW("Npu device %d has been set before global init.", device_id_); - } - if (c10_npu::option::OptionsManager::CheckAclDumpDateEnable()) { const char *aclConfigPath = "acl.json"; NPU_CHECK_ERROR(aclmdlSetDump(aclConfigPath)); @@ -158,12 +149,6 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) // set global soc name c10_npu::SetSocVersion(soc_name); - if (c10_npu::IsSupportInfNan()) { - c10_npu::acl::AclrtSetDeviceSatMode(aclrtFloatOverflowMode::ACL_RT_OVERFLOW_MODE_INFNAN); - } else { - c10_npu::acl::AclrtSetDeviceSatMode(aclrtFloatOverflowMode::ACL_RT_OVERFLOW_MODE_SATURATION); - } - auto acl_op_init_mode = c10_npu::option::OptionsManager::GetAclOpInitMode(); if (acl_op_init_mode == 0) { at_npu::aclops::InitAclops(); @@ -176,9 +161,6 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) SetDefaultAllowInternalFromatDisable(); } - NPU_CHECK_ERROR(at_npu::native::AclrtCtxSetSysParamOpt(aclSysParamOpt::ACL_OPT_DETERMINISTIC, 0)); - NPU_CHECK_ERROR(c10_npu::acl::AclrtSetOpExecuteTimeOut(kMaxOpExecuteTimeOut)); - // lazy call for the setoption for (const auto &iter: lazy_fn_) { ASCEND_LOGD("start setoption for the lazy call."); @@ -197,6 +179,40 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) return INIT_SUCC; } +NpuSysCtrl::SysStatus NpuSysCtrl::LazyInitialize(int device_id) +{ + if (lazy_init_flag_) { + return INIT_SUCC; + } + std::lock_guard lock(lazy_init_mutex_); + if (lazy_init_flag_) { + return INIT_SUCC; + } + + // There's no need to call c10_npu::GetDevice at the start of the process, because device 0 may not be needed + auto ret = aclrtGetDevice(&device_id_); + if (ret != ACL_ERROR_NONE) { + device_id_ = (device_id == -1) ? 0 : device_id; + NPU_CHECK_ERROR(c10_npu::SetDevice(device_id_)); + } else { + ASCEND_LOGW("Npu device %d has been set before global init.", device_id_); + } + + if (c10_npu::IsSupportInfNan()) { + c10_npu::acl::AclrtSetDeviceSatMode(aclrtFloatOverflowMode::ACL_RT_OVERFLOW_MODE_INFNAN); + } else { + c10_npu::acl::AclrtSetDeviceSatMode(aclrtFloatOverflowMode::ACL_RT_OVERFLOW_MODE_SATURATION); + } + + NPU_CHECK_ERROR(at_npu::native::AclrtCtxSetSysParamOpt(aclSysParamOpt::ACL_OPT_DETERMINISTIC, 0)); + NPU_CHECK_ERROR(c10_npu::acl::AclrtSetOpExecuteTimeOut(kMaxOpExecuteTimeOut)); + + lazy_init_flag_ = true; + ASCEND_LOGD("Npu sys ctrl Lazyinitialize successfully."); + + return INIT_SUCC; +} + NpuSysCtrl::SysStatus NpuSysCtrl::ExchangeDevice(int device) { NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::SetDevice(device)); @@ -263,6 +279,11 @@ bool NpuSysCtrl::GetInitFlag() return init_flag_; } +bool NpuSysCtrl::GetLazyInitFlag() +{ + return lazy_init_flag_; +} + int NpuSysCtrl::InitializedDeviceID() { if (GetInitFlag()) { diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h index 8cb1634fd65e35237c266dda432fbd8d06389f7a..e14026c7858133a3cb55e25cd1d6cdacb83eb94f 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h @@ -42,6 +42,8 @@ public: // Environment Initialize, return SysStatus SysStatus Initialize(int device_id = -1); + SysStatus LazyInitialize(int device_id = -1); + // Change current device from pre_device to device SysStatus ExchangeDevice(int device); @@ -57,6 +59,9 @@ public: // Get Init_flag C10_NPU_API bool GetInitFlag(); + // Get lazyInit_flag + bool GetLazyInitFlag(); + int InitializedDeviceID(); void RegisterLazyFn(const option::OptionCallBack &call_, const std::string &in); @@ -71,10 +76,12 @@ private: private: bool repeat_init_acl_flag_; bool init_flag_; + bool lazy_init_flag_; int device_id_; std::map> release_fn_; std::vector> lazy_fn_; std::mutex init_mutex_; + std::mutex lazy_init_mutex_; }; aclError SetCurrentDevice(); diff --git a/torch_npu/csrc/libs/init_npu.cpp b/torch_npu/csrc/libs/init_npu.cpp index bc8ca3eee901168ea912b48a42b9321d71e8b38a..b6f2399b8bbf82a1460eaf101e6f811c873dd13c 100644 --- a/torch_npu/csrc/libs/init_npu.cpp +++ b/torch_npu/csrc/libs/init_npu.cpp @@ -23,6 +23,15 @@ void init_npu(const c10::DeviceIndex device_index) C10_NPU_SHOW_ERR_MSG(); return; } + if (!c10_npu::NpuSysCtrl::GetInstance().GetLazyInitFlag()) { + c10_npu::LazySetDevice(device_index); + c10_npu::NpuSysCtrl::SysStatus lazystatus = + c10_npu::NpuSysCtrl::GetInstance().LazyInitialize((int)device_index); + if (lazystatus != c10_npu::NpuSysCtrl::SysStatus::INIT_SUCC) { + C10_NPU_SHOW_ERR_MSG(); + return; + } + } }