From 2318c181e8c420097c64b5fc36838ff98577e7dc Mon Sep 17 00:00:00 2001
From: wangchao
Date: Tue, 19 Aug 2025 16:59:40 +0800
Subject: [PATCH] add OpExecuteTimeOut

Add OptionsManager::GetOpExecuteTimeOut(), which reads the
OP_EXEC_TIMEOUT environment variable, parses it with strtol (base 10)
and returns the result narrowed to int32_t. It returns -1 when the
variable is not set.

NpuSysCtrl::Initialize() now passes that value to
AclrtSetOpExecuteTimeOut() instead of always passing
kMaxOpExecuteTimeOut. A result of -1 (variable unset, or explicitly set
to -1) falls back to kMaxOpExecuteTimeOut.

Note: the value is not validated. Text that strtol cannot convert
yields 0, and values outside the int32_t range are narrowed by the
static_cast; both are forwarded to AclrtSetOpExecuteTimeOut() as-is.

---
 torch_npu/csrc/core/npu/register/OptionsManager.cpp | 7 +++++++
 torch_npu/csrc/core/npu/register/OptionsManager.h   | 1 +
 torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp   | 6 +++++-
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.cpp b/torch_npu/csrc/core/npu/register/OptionsManager.cpp
index 2dd7d43a792..9feacfc8282 100644
--- a/torch_npu/csrc/core/npu/register/OptionsManager.cpp
+++ b/torch_npu/csrc/core/npu/register/OptionsManager.cpp
@@ -185,6 +185,13 @@ int32_t OptionsManager::GetACLExecTimeout()
     return static_cast<int32_t>(envFlag);
 }
 
+int32_t OptionsManager::GetOpExecuteTimeOut()
+{
+    char* env_val = std::getenv("OP_EXEC_TIMEOUT");
+    int64_t envFlag = (env_val != nullptr) ? strtol(env_val, nullptr, 10) : -1;
+    return static_cast<int32_t>(envFlag);
+}
+
 int32_t OptionsManager::GetACLDeviceSyncTimeout()
 {
     char* env_val = std::getenv("ACL_DEVICE_SYNC_TIMEOUT");
diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.h b/torch_npu/csrc/core/npu/register/OptionsManager.h
index feb33f6ca7d..a9b5c063229 100644
--- a/torch_npu/csrc/core/npu/register/OptionsManager.h
+++ b/torch_npu/csrc/core/npu/register/OptionsManager.h
@@ -104,6 +104,7 @@ public:
     static int32_t GetHCCLExecTimeout();
     static int32_t GetHCCLEventTimeout();
     static std::string CheckDisableDynamicPath();
+    static int32_t GetOpExecuteTimeOut();
     static int32_t GetACLExecTimeout();
     static int32_t GetACLDeviceSyncTimeout();
     static uint32_t CheckUseHcclAsyncErrorHandleEnable();
diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp
index 4b6707b8495..767af3198e4 100644
--- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp
+++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp
@@ -177,7 +177,11 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id)
     }
 
     NPU_CHECK_ERROR(at_npu::native::AclrtCtxSetSysParamOpt(aclSysParamOpt::ACL_OPT_DETERMINISTIC, 0));
-    NPU_CHECK_ERROR(c10_npu::acl::AclrtSetOpExecuteTimeOut(kMaxOpExecuteTimeOut));
+    int32_t timeout = c10_npu::option::OptionsManager::GetOpExecuteTimeOut();
+    if (timeout == -1) {
+        timeout = kMaxOpExecuteTimeOut;
+    }
+    NPU_CHECK_ERROR(c10_npu::acl::AclrtSetOpExecuteTimeOut(timeout));
 
     // lazy call for the setoption
     for (const auto &iter: lazy_fn_) {
-- 
Gitee