diff --git a/setup.py b/setup.py index eff8b58193874e8879fc91873cd99ee0701ab5bb..9af79786cf651e88c7c9aefc053289c4eeb8a026 100644 --- a/setup.py +++ b/setup.py @@ -426,7 +426,7 @@ def get_src_py_and_dst(): recursive=True) + glob.glob( os.path.join(BASE_DIR, "torch_npu", '**/*.yaml'), recursive=True) + glob.glob( - os.path.join(BASE_DIR, "torch_npu", 'acl.json'), + os.path.join(BASE_DIR, "torch_npu", 'acl*.json'), recursive=True) + glob.glob( os.path.join(BASE_DIR, "torch_npu", 'contrib/apis_config.json'), recursive=True) diff --git a/torch_npu/acl_default.json b/torch_npu/acl_default.json new file mode 100644 index 0000000000000000000000000000000000000000..b676ee4748bdc6bbeb23c2053d260830f48d4dec --- /dev/null +++ b/torch_npu/acl_default.json @@ -0,0 +1 @@ +{"dump":{"dump_scene":"lite_exception"}, "defaultDevice":{"default_device":"0"}} \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index a18a9a9a566702f9b24ee47652921f697153bd56..5a92fb95ab1b3378873161a75adda5801b2a764d 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -3523,7 +3523,7 @@ public: } int device = 0; NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); - LazySetDevice(); + LazySetDevice(device); void *devPtr = nullptr; void (*deleteFunc)(void *) = &local_raw_delete; diff --git a/torch_npu/csrc/core/npu/NPUEvent.cpp b/torch_npu/csrc/core/npu/NPUEvent.cpp index 47483ea61d923bb01ba0eea0e22affba32b1856c..74c0b10161cb3fe1098c70024c379fcce4dccb62 100644 --- a/torch_npu/csrc/core/npu/NPUEvent.cpp +++ b/torch_npu/csrc/core/npu/NPUEvent.cpp @@ -180,7 +180,7 @@ void NPUEvent::createEvent(c10::DeviceIndex device_index) { device_index_ = device_index; NPUGuard guard(device_index_); - LazySetDevice(); + LazySetDevice(device_index_); NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::acl::AclrtCreateEventWithFlag(&event_, flags_)); ASCEND_LOGI("Event: aclrtCreateEventWithFlag is successfully executed, event=%p", event_); #ifndef BUILD_LIBTORCH diff --git a/torch_npu/csrc/core/npu/NPUFunctions.cpp b/torch_npu/csrc/core/npu/NPUFunctions.cpp index 441069eedab8ab86cfeff33a783ce0b419caab3a..9644d0575a120e8e00028da964fa789402c48555 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.cpp +++ b/torch_npu/csrc/core/npu/NPUFunctions.cpp @@ -18,6 +18,16 @@ static std::unordered_map used_devices; std::recursive_mutex mtx; thread_local int targetDeviceIndex = -1; +bool is_lazy_set_device() +{ + static bool is_lazy_set = []() { + bool lazy_val = c10_npu::option::OptionsManager::LazySetDevice(); + ASCEND_LOGW("is_lazy_set_device %d", lazy_val); + return lazy_val; + }(); + return is_lazy_set; +} + c10::DeviceIndex device_count() noexcept { // initialize number of devices only once @@ -58,9 +68,13 @@ aclError GetDevice(int32_t *device) if (err != ACL_ERROR_NONE) { CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(err); } - if (err == ACL_ERROR_NONE) { - local_device = *device; - } else if (err == ACL_ERROR_RT_CONTEXT_NULL) { + if (!is_lazy_set_device()) { + if (err == ACL_ERROR_NONE) { + local_device = *device; + } + } + // before call aclinit with defaultdevice + if (err == ACL_ERROR_RT_CONTEXT_NULL) { *device = 0; return ACL_ERROR_NONE; } @@ -82,9 +96,13 @@ aclError GetDeviceWithoutSet(int32_t *device) if (err != ACL_ERROR_NONE) { CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(err); } - if (err == ACL_ERROR_NONE) { - local_device = *device; - } else if (err == ACL_ERROR_RT_CONTEXT_NULL) { + if (!is_lazy_set_device()) { + if (err == ACL_ERROR_NONE) { + local_device = *device; + } + } + // before call aclinit with defaultdevice + if (err == ACL_ERROR_RT_CONTEXT_NULL) { *device = -1; return ACL_ERROR_NONE; } @@ -269,8 +287,12 @@ bool IsContextInitialized() return true; } + if (is_lazy_set_device()) { + return false; + } + int32_t device = -1; - aclError err = aclrtGetDevice(&device); + aclError err = aclrtGetDevice(&device); if (err == ACL_ERROR_NONE) { return true; } else { @@ -288,10 +310,18 @@ int GetLocalDevice() return local_device; } -void LazySetDevice() +void LazySetDevice(c10::DeviceIndex device) { - if (local_device < 0) { - NPU_CHECK_ERROR_WITHOUT_UCE(SetDevice(0)); + if (local_device != device) { + aclError err = aclrtSetDevice(device); + if (err == ACL_ERROR_NONE) { + local_device = device; + std::lock_guard lock(mtx); + if (used_devices.find(local_device) == used_devices.end()) { + NPU_CHECK_ERROR_WITHOUT_UCE(aclrtGetCurrentContext(&used_devices[local_device])); + } + } + NPU_CHECK_ERROR_WITHOUT_UCE(err); } } diff --git a/torch_npu/csrc/core/npu/NPUFunctions.h b/torch_npu/csrc/core/npu/NPUFunctions.h index 45f23826808edb5d1a6d9f0b142f2dfb37c2792f..070c7f94171af963162659f0fe62b1e4006c7db5 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.h +++ b/torch_npu/csrc/core/npu/NPUFunctions.h @@ -18,6 +18,8 @@ namespace c10_npu { +bool is_lazy_set_device(); + C10_NPU_API c10::DeviceIndex device_count() noexcept; C10_NPU_API c10::DeviceIndex device_count_ensure_non_zero(); @@ -81,7 +83,7 @@ int MaybeExchangeDevice(int to_device); void SetTargetDevice(); -void LazySetDevice(); +void LazySetDevice(c10::DeviceIndex device); int GetLocalDevice(); diff --git a/torch_npu/csrc/core/npu/NPUStream.cpp b/torch_npu/csrc/core/npu/NPUStream.cpp index 4b92bce9578cd88b277ea818ef613caf44d4104c..e499474a6fdd7d87a858aca12bd8b35b9f67d4c3 100644 --- a/torch_npu/csrc/core/npu/NPUStream.cpp +++ b/torch_npu/csrc/core/npu/NPUStream.cpp @@ -15,6 +15,7 @@ #include "torch_npu/csrc/core/npu/NPUQueue.h" #include "torch_npu/csrc/core/npu/NPUException.h" #include "torch_npu/csrc/core/npu/register/OptionsManager.h" +#include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" #include "torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.h" #include "third_party/acl/inc/acl/acl_rt.h" #ifndef BUILD_LIBTORCH @@ -231,7 +232,8 @@ static void initNPUStreamsOnce() c10::DeviceIndex device_index = current_device(); // makesure on real devcie SetTargetDevice(); - LazySetDevice(); + LazySetDevice(device_index); + c10_npu::NpuSysCtrl::GetInstance().LazyInitialize(); if (!initialize_flag[device_index]) { std::lock_guard lock(mtx[device_index]); if (!initialize_flag[device_index]) { diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.cpp b/torch_npu/csrc/core/npu/register/OptionsManager.cpp index ab0384eaa5076c0c3ce41855be4c5d0870d2419a..13454bc47e688906b27719405ef5447f6438d989 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.cpp +++ b/torch_npu/csrc/core/npu/register/OptionsManager.cpp @@ -631,5 +631,14 @@ bool OptionsManager::IsCompactErrorOutput() return should_print; } +bool OptionsManager::LazySetDevice() +{ + static bool lazy_set = []() -> bool { + int32_t env_val = OptionsManager::GetBoolTypeOption("LAZY_SET_DEVICE", 1); + return env_val != 0; + }(); + return lazy_set; +} + } // namespace option } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.h b/torch_npu/csrc/core/npu/register/OptionsManager.h index feb33f6ca7d63e79d64a3d3133ab416583d805fe..49659639fc3390679e2cd204ebafeab25fd63e8f 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.h +++ b/torch_npu/csrc/core/npu/register/OptionsManager.h @@ -133,6 +133,7 @@ public: static bool IsOomSnapshotEnable(); static bool ShouldPrintWarning(); static bool IsCompactErrorOutput(); + static bool LazySetDevice(); private: static int GetBoolTypeOption(const char* env_str, int defaultVal = 0); diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index 5f348e029ca05ecc337e3e68befd3e6c44967168..1addf16176b6f5f61710b5893e5b5a1ad83d813d 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -81,7 +81,12 @@ std::string GetAclConfigJsonPath() ASCEND_LOGW("Failed to get npu path!"); return ""; } - std::string json_path = npu_path.append("torch_npu/acl.json"); + std::string json_path = ""; + if (c10_npu::is_lazy_set_device()) { + json_path = npu_path.append("torch_npu/acl_default.json"); + } else { + json_path = npu_path.append("torch_npu/acl.json"); + } std::string json_path_str = torch_npu::toolkit::profiler::Utils::RealPath(json_path); if (json_path_str == "") { ASCEND_LOGW("this path:%s is not exist!", json_path.c_str()); @@ -95,7 +100,7 @@ std::string GetAclConfigJsonPath() namespace c10_npu { -NpuSysCtrl::NpuSysCtrl() : repeat_init_acl_flag_(true), init_flag_(false), device_id_(0) {} +NpuSysCtrl::NpuSysCtrl() : repeat_init_acl_flag_(true), init_flag_(false), lazy_init_flag_(false), device_id_(0) {} // Get NpuSysCtrl singleton instance NpuSysCtrl &NpuSysCtrl::GetInstance() @@ -139,13 +144,19 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) ASCEND_LOGD("Npu workspace allocator initialize successfully"); c10_npu::option::OptionsManager::IsOomSnapshotEnable(); - // There's no need to call c10_npu::GetDevice at the start of the process, because device 0 may not be needed - auto ret = aclrtGetDevice(&device_id_); - if (ret != ACL_ERROR_NONE) { - device_id_ = (device_id == -1) ? 0 : device_id; - NPU_CHECK_ERROR(c10_npu::SetDevice(device_id_)); + if (!c10_npu::is_lazy_set_device()) { + // There's no need to call c10_npu::GetDevice at the start of the process, because device 0 may not be needed + auto ret = aclrtGetDevice(&device_id_); + if (ret != ACL_ERROR_NONE) { + device_id_ = (device_id == -1) ? 0 : device_id; + NPU_CHECK_ERROR(c10_npu::SetDevice(device_id_)); + } else { + ASCEND_LOGW("Npu device %d has been set before global init.", device_id_); + } } else { - ASCEND_LOGW("Npu device %d has been set before global init.", device_id_); + if (device_id >= 0) { + NPU_CHECK_ERROR(c10_npu::SetDevice(device_id)); + } } if (c10_npu::option::OptionsManager::CheckAclDumpDateEnable()) { @@ -158,10 +169,12 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) // set global soc name c10_npu::SetSocVersion(soc_name); - if (c10_npu::IsSupportInfNan()) { - c10_npu::acl::AclrtSetDeviceSatMode(aclrtFloatOverflowMode::ACL_RT_OVERFLOW_MODE_INFNAN); - } else { - c10_npu::acl::AclrtSetDeviceSatMode(aclrtFloatOverflowMode::ACL_RT_OVERFLOW_MODE_SATURATION); + if (!c10_npu::is_lazy_set_device()) { + if (c10_npu::IsSupportInfNan()) { + c10_npu::acl::AclrtSetDeviceSatMode(aclrtFloatOverflowMode::ACL_RT_OVERFLOW_MODE_INFNAN); + } else { + c10_npu::acl::AclrtSetDeviceSatMode(aclrtFloatOverflowMode::ACL_RT_OVERFLOW_MODE_SATURATION); + } } auto acl_op_init_mode = c10_npu::option::OptionsManager::GetAclOpInitMode(); @@ -176,8 +189,10 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) SetDefaultAllowInternalFromatDisable(); } - NPU_CHECK_ERROR(at_npu::native::AclrtCtxSetSysParamOpt(aclSysParamOpt::ACL_OPT_DETERMINISTIC, 0)); - NPU_CHECK_ERROR(c10_npu::acl::AclrtSetOpExecuteTimeOut(kMaxOpExecuteTimeOut)); + if (!c10_npu::is_lazy_set_device()) { + NPU_CHECK_ERROR(at_npu::native::AclrtCtxSetSysParamOpt(aclSysParamOpt::ACL_OPT_DETERMINISTIC, 0)); + NPU_CHECK_ERROR(c10_npu::acl::AclrtSetOpExecuteTimeOut(kMaxOpExecuteTimeOut)); + } // lazy call for the setoption for (const auto &iter: lazy_fn_) { @@ -200,6 +215,38 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) return INIT_SUCC; } +NpuSysCtrl::SysStatus NpuSysCtrl::LazyInitialize(int device_id) +{ + if (!c10_npu::is_lazy_set_device()) { + return INIT_SUCC; + } + + if (lazy_init_flag_) { + return INIT_SUCC; + } + std::lock_guard lock(lazy_init_mutex_); + if (lazy_init_flag_) { + return INIT_SUCC; + } + + // There's no need to call c10_npu::GetDevice at the start of the process, because device 0 may not be needed + auto ret = aclrtGetDevice(&device_id_); + + if (c10_npu::IsSupportInfNan()) { + c10_npu::acl::AclrtSetDeviceSatMode(aclrtFloatOverflowMode::ACL_RT_OVERFLOW_MODE_INFNAN); + } else { + c10_npu::acl::AclrtSetDeviceSatMode(aclrtFloatOverflowMode::ACL_RT_OVERFLOW_MODE_SATURATION); + } + + NPU_CHECK_ERROR(at_npu::native::AclrtCtxSetSysParamOpt(aclSysParamOpt::ACL_OPT_DETERMINISTIC, 0)); + NPU_CHECK_ERROR(c10_npu::acl::AclrtSetOpExecuteTimeOut(kMaxOpExecuteTimeOut)); + + lazy_init_flag_ = true; + ASCEND_LOGD("Npu sys ctrl Lazyinitialize successfully."); + + return INIT_SUCC; +} + NpuSysCtrl::SysStatus NpuSysCtrl::ExchangeDevice(int device) { NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::SetDevice(device)); @@ -266,6 +313,11 @@ bool NpuSysCtrl::GetInitFlag() return init_flag_; } +bool NpuSysCtrl::GetLazyInitFlag() +{ + return lazy_init_flag_; +} + int NpuSysCtrl::InitializedDeviceID() { if (GetInitFlag()) { diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h index 8cb1634fd65e35237c266dda432fbd8d06389f7a..e14026c7858133a3cb55e25cd1d6cdacb83eb94f 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h @@ -42,6 +42,8 @@ public: // Environment Initialize, return SysStatus SysStatus Initialize(int device_id = -1); + SysStatus LazyInitialize(int device_id = -1); + // Change current device from pre_device to device SysStatus ExchangeDevice(int device); @@ -57,6 +59,9 @@ public: // Get Init_flag C10_NPU_API bool GetInitFlag(); + // Get lazyInit_flag + bool GetLazyInitFlag(); + int InitializedDeviceID(); void RegisterLazyFn(const option::OptionCallBack &call_, const std::string &in); @@ -71,10 +76,12 @@ private: private: bool repeat_init_acl_flag_; bool init_flag_; + bool lazy_init_flag_; int device_id_; std::map> release_fn_; std::vector> lazy_fn_; std::mutex init_mutex_; + std::mutex lazy_init_mutex_; }; aclError SetCurrentDevice(); diff --git a/torch_npu/csrc/libs/init_npu.cpp b/torch_npu/csrc/libs/init_npu.cpp index bc8ca3eee901168ea912b48a42b9321d71e8b38a..2d9442231416a89a310dfaafa2d74151b00d4253 100644 --- a/torch_npu/csrc/libs/init_npu.cpp +++ b/torch_npu/csrc/libs/init_npu.cpp @@ -23,6 +23,15 @@ void init_npu(const c10::DeviceIndex device_index) C10_NPU_SHOW_ERR_MSG(); return; } + if (c10_npu::is_lazy_set_device() && !c10_npu::NpuSysCtrl::GetInstance().GetLazyInitFlag()) { + c10_npu::LazySetDevice(device_index); + c10_npu::NpuSysCtrl::SysStatus lazystatus = + c10_npu::NpuSysCtrl::GetInstance().LazyInitialize((int)device_index); + if (lazystatus != c10_npu::NpuSysCtrl::SysStatus::INIT_SUCC) { + C10_NPU_SHOW_ERR_MSG(); + return; + } + } } diff --git a/torch_npu/dynamo/__init__.py b/torch_npu/dynamo/__init__.py index 95be98be633ffb47d92b380b095e7cd04173b205..1b5bb49d0a1f9ec38ca86baf3a771cafc6da8dc2 100644 --- a/torch_npu/dynamo/__init__.py +++ b/torch_npu/dynamo/__init__.py @@ -75,6 +75,7 @@ class _LazyTorchair: try: from . import torchair + os.environ["LAZY_SET_DEVICE"] = "0" except Exception as e: # In cpython, default import loader will suppress error when # find module's __spec__. So here we need to record error and