diff --git a/test/npu/test_fault_mode.py b/test/npu/test_fault_mode.py index 6877fced8f5c4cf0831c663ccb5e81527db3e6f1..6098f52ece55b6e02eba1cead1afca90a9862129 100644 --- a/test/npu/test_fault_mode.py +++ b/test/npu/test_fault_mode.py @@ -173,7 +173,7 @@ class TestMode(TestCase): message ) self.assertIn( - "Initialize", + "SetDevice", message ) diff --git a/torch_npu/acl.json b/torch_npu/acl.json index 8a77faac4fa153432b380a418d81361586790f59..b676ee4748bdc6bbeb23c2053d260830f48d4dec 100644 --- a/torch_npu/acl.json +++ b/torch_npu/acl.json @@ -1 +1 @@ -{"dump":{"dump_scene":"lite_exception"}} \ No newline at end of file +{"dump":{"dump_scene":"lite_exception"}, "defaultDevice":{"default_device":"0"}} \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 8b910303842a9829401061e3ff3072b8b58c95a4..2ee953d29a20da258ccf2f79bd3c41c9478b9f59 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -3495,7 +3495,7 @@ public: } int device = 0; NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); - LazySetDevice(); + LazySetDevice(device); void *devPtr = nullptr; void (*deleteFunc)(void *) = &local_raw_delete; diff --git a/torch_npu/csrc/core/npu/NPUEvent.cpp b/torch_npu/csrc/core/npu/NPUEvent.cpp index d956195154776cc6c972a55c9da23037be508959..df68e8d3d4abd39315e4d3e02ecd037482e0a78d 100644 --- a/torch_npu/csrc/core/npu/NPUEvent.cpp +++ b/torch_npu/csrc/core/npu/NPUEvent.cpp @@ -179,7 +179,7 @@ void NPUEvent::createEvent(c10::DeviceIndex device_index) { device_index_ = device_index; NPUGuard guard(device_index_); - LazySetDevice(); + LazySetDevice(device_index_); NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::acl::AclrtCreateEventWithFlag(&event_, flags_)); ASCEND_LOGI("Event: aclrtCreateEventWithFlag is successfully executed, event=%p", event_); #ifndef BUILD_LIBTORCH diff --git a/torch_npu/csrc/core/npu/NPUFunctions.cpp b/torch_npu/csrc/core/npu/NPUFunctions.cpp index b8ce0e5c176227d471b7b0c67d4b026551db60aa..ac767df47844fc60ea225243dc710e5654797d8a 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.cpp +++ b/torch_npu/csrc/core/npu/NPUFunctions.cpp @@ -58,9 +58,8 @@ aclError GetDevice(int32_t *device) if (err != ACL_ERROR_NONE) { CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(err); } - if (err == ACL_ERROR_NONE) { - local_device = *device; - } else if (err == ACL_ERROR_RT_CONTEXT_NULL) { + // before call aclinit with defaultdevice + if (err == ACL_ERROR_RT_CONTEXT_NULL) { *device = 0; return ACL_ERROR_NONE; } @@ -82,9 +81,8 @@ aclError GetDeviceWithoutSet(int32_t *device) if (err != ACL_ERROR_NONE) { CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(err); } - if (err == ACL_ERROR_NONE) { - local_device = *device; - } else if (err == ACL_ERROR_RT_CONTEXT_NULL) { + // before call aclinit with defaultdevice + if (err == ACL_ERROR_RT_CONTEXT_NULL) { *device = -1; return ACL_ERROR_NONE; } @@ -266,24 +264,21 @@ bool IsContextInitialized() return true; } - int32_t device = -1; - aclError err = aclrtGetDevice(&device); - if (err == ACL_ERROR_NONE) { - return true; - } else { - CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(err); - if (err == ACL_ERROR_RT_CONTEXT_NULL) { - return false; - } - NPU_CHECK_ERROR_WITHOUT_UCE(err); - return false; - } + return false; } -void LazySetDevice() +void LazySetDevice(c10::DeviceIndex device) { - if (local_device < 0) { - NPU_CHECK_ERROR_WITHOUT_UCE(SetDevice(0)); + if (local_device != device) { + aclError err = aclrtSetDevice(device); + if (err == ACL_ERROR_NONE) { + local_device = device; + std::lock_guard lock(mtx); + if (used_devices.find(local_device) == used_devices.end()) { + NPU_CHECK_ERROR_WITHOUT_UCE(aclrtGetCurrentContext(&used_devices[local_device])); + } + } + NPU_CHECK_ERROR_WITHOUT_UCE(err); } } diff --git a/torch_npu/csrc/core/npu/NPUFunctions.h b/torch_npu/csrc/core/npu/NPUFunctions.h index 6451efc000ad1325d0d855c8b16cfa07d5e45f2a..fe9c75b54c1352a04e56d48410628ef4936610e2 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.h +++ b/torch_npu/csrc/core/npu/NPUFunctions.h @@ -79,7 +79,7 @@ int MaybeExchangeDevice(int to_device); void SetTargetDevice(); -void LazySetDevice(); +void LazySetDevice(c10::DeviceIndex device); int GetLocalDevice(); diff --git a/torch_npu/csrc/core/npu/NPUStream.cpp b/torch_npu/csrc/core/npu/NPUStream.cpp index e7d0e405bfd0fa7722d4fd1f602ba2884e28e685..c8a3a56ebc4d5ba779350a94eaba80fdf277911e 100644 --- a/torch_npu/csrc/core/npu/NPUStream.cpp +++ b/torch_npu/csrc/core/npu/NPUStream.cpp @@ -15,6 +15,7 @@ #include "torch_npu/csrc/core/npu/NPUQueue.h" #include "torch_npu/csrc/core/npu/NPUException.h" #include "torch_npu/csrc/core/npu/register/OptionsManager.h" +#include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" #include "torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.h" #include "third_party/acl/inc/acl/acl_rt.h" #ifndef BUILD_LIBTORCH @@ -231,7 +232,8 @@ static void initNPUStreamsOnce() c10::DeviceIndex device_index = current_device(); // makesure on real devcie SetTargetDevice(); - LazySetDevice(); + LazySetDevice(device_index); + c10_npu::NpuSysCtrl::GetInstance().LazyInitialize(); if (!initialize_flag[device_index]) { std::lock_guard lock(mtx[device_index]); if (!initialize_flag[device_index]) { diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index d705c890ed0f6bd6533a1de0c258e27aeebd6739..a47d8be28c51b89ecaa157dabf0cb12081bfe918 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -95,7 +95,7 @@ std::string GetAclConfigJsonPath() namespace c10_npu { -NpuSysCtrl::NpuSysCtrl() : repeat_init_acl_flag_(true), init_flag_(false), device_id_(0) +NpuSysCtrl::NpuSysCtrl() : repeat_init_acl_flag_(true), init_flag_(false), lazy_init_flag_(false), device_id_(0) {} // Get NpuSysCtrl singleton instance @@ -139,14 +139,9 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) c10_npu::NPUWorkspaceAllocator::init(); ASCEND_LOGD("Npu workspace allocator initialize successfully"); c10_npu::option::OptionsManager::IsOomSnapshotEnable(); - // There's no need to call c10_npu::GetDevice at the start of the process, because device 0 may not be needed - auto ret = aclrtGetDevice(&device_id_); - if (ret != ACL_ERROR_NONE) { - device_id_ = (device_id == -1) ? 0 : device_id; - NPU_CHECK_ERROR(c10_npu::SetDevice(device_id_)); - } else { - ASCEND_LOGW("Npu device %d has been set before global init.", device_id_); + if (device_id >= 0) { + NPU_CHECK_ERROR(c10_npu::SetDevice(device_id)); } if (c10_npu::option::OptionsManager::CheckAclDumpDateEnable()) { @@ -159,12 +154,6 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) // set global soc name c10_npu::SetSocVersion(soc_name); - if (c10_npu::IsSupportInfNan()) { - c10_npu::acl::AclrtSetDeviceSatMode(aclrtFloatOverflowMode::ACL_RT_OVERFLOW_MODE_INFNAN); - } else { - c10_npu::acl::AclrtSetDeviceSatMode(aclrtFloatOverflowMode::ACL_RT_OVERFLOW_MODE_SATURATION); - } - auto acl_op_init_mode = c10_npu::option::OptionsManager::GetAclOpInitMode(); if (acl_op_init_mode == 0) { at_npu::aclops::InitAclops(); @@ -177,9 +166,6 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) SetDefaultAllowInternalFromatDisable(); } - NPU_CHECK_ERROR(at_npu::native::AclrtCtxSetSysParamOpt(aclSysParamOpt::ACL_OPT_DETERMINISTIC, 0)); - NPU_CHECK_ERROR(c10_npu::acl::AclrtSetOpExecuteTimeOut(kMaxOpExecuteTimeOut)); - // lazy call for the setoption for (const auto &iter: lazy_fn_) { ASCEND_LOGD("start setoption for the lazy call."); @@ -198,6 +184,34 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) return INIT_SUCC; } +NpuSysCtrl::SysStatus NpuSysCtrl::LazyInitialize(int device_id) +{ + if (lazy_init_flag_) { + return INIT_SUCC; + } + std::lock_guard lock(lazy_init_mutex_); + if (lazy_init_flag_) { + return INIT_SUCC; + } + + // There's no need to call c10_npu::GetDevice at the start of the process, because device 0 may not be needed + auto ret = aclrtGetDevice(&device_id_); + + if (c10_npu::IsSupportInfNan()) { + c10_npu::acl::AclrtSetDeviceSatMode(aclrtFloatOverflowMode::ACL_RT_OVERFLOW_MODE_INFNAN); + } else { + c10_npu::acl::AclrtSetDeviceSatMode(aclrtFloatOverflowMode::ACL_RT_OVERFLOW_MODE_SATURATION); + } + + NPU_CHECK_ERROR(at_npu::native::AclrtCtxSetSysParamOpt(aclSysParamOpt::ACL_OPT_DETERMINISTIC, 0)); + NPU_CHECK_ERROR(c10_npu::acl::AclrtSetOpExecuteTimeOut(kMaxOpExecuteTimeOut)); + + lazy_init_flag_ = true; + ASCEND_LOGD("Npu sys ctrl Lazyinitialize successfully."); + + return INIT_SUCC; +} + NpuSysCtrl::SysStatus NpuSysCtrl::ExchangeDevice(int device) { NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::SetDevice(device)); @@ -264,6 +278,11 @@ bool NpuSysCtrl::GetInitFlag() return init_flag_; } +bool NpuSysCtrl::GetLazyInitFlag() +{ + return lazy_init_flag_; +} + void NpuSysCtrl::RegisterLazyFn(const option::OptionCallBack &call_, const std::string &in) { lazy_fn_.emplace_back(std::make_pair(call_, in)); diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h index 5d9861de07edbdfc9f7946cb88354e273ddcc5dd..362cf4289732a1503b25e543e14045bf0f1ae4ff 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h @@ -41,6 +41,8 @@ public: // Environment Initialize, return SysStatus SysStatus Initialize(int device_id = -1); + SysStatus LazyInitialize(int device_id = -1); + // Change current device from pre_device to device SysStatus ExchangeDevice(int device); @@ -56,6 +58,9 @@ public: // Get Init_flag C10_NPU_API bool GetInitFlag(); + // Get lazyInit_flag + bool GetLazyInitFlag(); + int InitializedDeviceID() { if (GetInitFlag()) { @@ -77,10 +82,12 @@ private: private: bool repeat_init_acl_flag_; bool init_flag_; + bool lazy_init_flag_; int device_id_; std::map> release_fn_; std::vector> lazy_fn_; std::mutex init_mutex_; + std::mutex lazy_init_mutex_; }; aclError SetCurrentDevice(); diff --git a/torch_npu/csrc/libs/init_npu.cpp b/torch_npu/csrc/libs/init_npu.cpp index 97c0709928a84ab4e7e19d52074ded96339077cc..bf3dc4be929e461028f0a5017cd3407d738c695a 100644 --- a/torch_npu/csrc/libs/init_npu.cpp +++ b/torch_npu/csrc/libs/init_npu.cpp @@ -23,6 +23,15 @@ void init_npu(const c10::DeviceIndex device_index) C10_NPU_SHOW_ERR_MSG(); return; } + if (!c10_npu::NpuSysCtrl::GetInstance().GetLazyInitFlag()) { + c10_npu::LazySetDevice(device_index); + c10_npu::NpuSysCtrl::SysStatus lazystatus = + c10_npu::NpuSysCtrl::GetInstance().LazyInitialize((int)device_index); + if (lazystatus != c10_npu::NpuSysCtrl::SysStatus::INIT_SUCC) { + C10_NPU_SHOW_ERR_MSG(); + return; + } + } }