From 5f3c879bedf8cbc37a227ed1fd08ae26557193ae Mon Sep 17 00:00:00 2001 From: chen_liqing Date: Tue, 12 Aug 2025 14:39:32 +0800 Subject: [PATCH 1/7] Revert "!23019 fix bind conf" This reverts commit 26d11697e183925e35c779557f4c9306674f14a9. --- torch_npu/csrc/core/npu/NPUAffinityController.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.cpp b/torch_npu/csrc/core/npu/NPUAffinityController.cpp index 6c2d35fd95..5567c3e6e2 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.cpp +++ b/torch_npu/csrc/core/npu/NPUAffinityController.cpp @@ -9,7 +9,6 @@ #include #include #include -#include namespace c10_npu { @@ -17,7 +16,6 @@ static thread_local ThreadType local_thread = ThreadType::MAIN_THREAD; static pthread_t main_thread; static bool start_main_thread_bind = false; -static std::mutex core_map_mutex; using ThreadCoreMap = std::unordered_map; @@ -266,7 +264,6 @@ CoreIdRange getCoreRange(c10::DeviceIndex device_id, ThreadType type) if (cpu_affinity_mode == 0 || cpu_affinity_mode == 1) { core_range = device_ranges[device_id]; } else { - std::lock_guard lock(core_map_mutex); if (device_thread_core_maps.find(device_id) == device_thread_core_maps.end()) { device_thread_core_maps.emplace(device_id, getCpuAffinityMap(device_id, device_ranges)); } -- Gitee From 338a818af23ca2f65cb488dba134176ba866c23a Mon Sep 17 00:00:00 2001 From: chen_liqing Date: Tue, 12 Aug 2025 14:40:56 +0800 Subject: [PATCH 2/7] Revert "!22376 Set affinity optimization" This reverts commit 60f04f7fb3e065bdb09e305b550ab1a66c288b6f. --- test/torch_npu_schema.json | 3 - .../csrc/core/npu/NPUAffinityController.cpp | 124 +++++------------- .../csrc/core/npu/NPUAffinityController.h | 5 +- torch_npu/csrc/core/npu/NPUFunctions.cpp | 5 +- torch_npu/csrc/core/npu/NPUQueue.cpp | 2 - torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp | 1 - .../csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 2 - .../csrc/distributed/ProcessGroupHCCL.cpp | 1 - torch_npu/csrc/framework/LazyInitAclops.cpp | 9 ++ .../csrc/framework/interface/EnvVariables.cpp | 10 ++ torch_npu/csrc/npu/Module.cpp | 10 -- torch_npu/utils/__init__.py | 3 +- torch_npu/utils/_module.py | 16 ++- torch_npu/utils/affinity.py | 6 +- 14 files changed, 72 insertions(+), 125 deletions(-) diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json index 133cf492d7..a273a3d9a6 100644 --- a/test/torch_npu_schema.json +++ b/test/torch_npu_schema.json @@ -2789,9 +2789,6 @@ "torch_npu.utils.set_thread_affinity": { "signature": "(core_range: List[int] = None)" }, - "torch_npu.utils.reset_thread_affinity": { - "signature": "()" - }, "torch_npu.dynamo.torchair.scope.npu_stream_switch": { "signature": "(stream_tag: str, stream_priority: int = 0)" }, diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.cpp b/torch_npu/csrc/core/npu/NPUAffinityController.cpp index 5567c3e6e2..a331439d9f 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.cpp +++ b/torch_npu/csrc/core/npu/NPUAffinityController.cpp @@ -14,9 +14,6 @@ namespace c10_npu { static thread_local ThreadType local_thread = ThreadType::MAIN_THREAD; -static pthread_t main_thread; -static bool start_main_thread_bind = false; - using ThreadCoreMap = std::unordered_map; static uint32_t cpu_affinity_mode; @@ -31,7 +28,8 @@ const std::unordered_map threadTypeToNameMap = { {ACL_THREAD, "acl_thread"}, {RELEASE_THREAD, "release_thread"}, {WATCHDOG_THREAD, "hccl_watchdog_t"}, - {OTHER_THREAD, "other_thread"}}; + {OTHER_THREAD, "other_thread"}, + {USER_THREAD, "user_thread"}}; CoreIdRange 
getCPUDefaultRange(c10::DeviceIndex device_id) { @@ -149,7 +147,7 @@ void printCoreRanges(const uint32_t mode, const std::vector &ranges oss << "Mode: " << mode << ". Core range for each device ID: "; for (size_t i = 0; i < ranges.size(); ++i) { - oss << "Device " << i << ": [" << ranges[i].start << ", " << ranges[i].end << "]"; + oss << "Device " << i << ": [" << ranges[i].start << "," << ranges[i].end << "]"; if (i != ranges.size() - 1) { oss << "; "; } else { @@ -196,18 +194,18 @@ void SetThreadType(ThreadType type) return; } if (prctl(PR_SET_NAME, threadTypeToNameMap.at(type).c_str()) != 0) { - ASCEND_LOGW("Set thread name to %s failed!", threadTypeToNameMap.at(type).c_str()); + ASCEND_LOGW("Set thread name of %s failed!", threadTypeToNameMap.at(type).c_str()); } } std::string getAffinityMapAsString(c10::DeviceIndex device_id, const ThreadCoreMap &threadCoreMap) { std::ostringstream oss; - for (auto thread_type : threadTypeList) { - oss << threadTypeToNameMap.at(thread_type) << ": [" - << threadCoreMap.at(thread_type).start << ", " - << threadCoreMap.at(thread_type).end << "]"; - if (thread_type != OTHER_THREAD) { + for (auto local_thread : threadTypeList) { + oss << threadTypeToNameMap.at(local_thread) << " : [" + << threadCoreMap.at(local_thread).start << "," + << threadCoreMap.at(local_thread).end << "]"; + if (local_thread != OTHER_THREAD) { oss << "; "; } else { oss << "."; @@ -224,16 +222,16 @@ ThreadCoreMap getCpuAffinityMap(c10::DeviceIndex device_id, const std::vector(device_index); - local_thread = type; - if (local_thread == ThreadType::MAIN_THREAD) { - start_main_thread_bind = true; - } + SetThreadType(type); SetThreadAffinity(device); } @@ -308,55 +289,20 @@ void SetThreadAffinity(int core_start, int core_end) if (!needToSetThreadAffinity()) { return; } - static int core_nums = sysconf(_SC_NPROCESSORS_ONLN); - CoreIdRange core_range; - core_range.start = static_cast(std::min(core_start, core_nums)); - core_range.end = static_cast(std::min(core_end, core_nums)); + core_start = std::min(core_start, core_nums); + core_end = std::min(core_end, core_nums); local_thread = ThreadType::USER_THREAD; - if (setThreadAffinityImpl(pthread_self(), core_range)) { - ASCEND_LOGD("Set thread affinity to user-defined range %d-%d success.", core_range.start, core_range.end); - } else { - ASCEND_LOGE("Set thread affinity to user-defined range %d-%d failed.", core_range.start, core_range.end); - } -} - -void SetMainThread() -{ - main_thread = pthread_self(); -} - -bool NeedMainThreadBind() -{ - return start_main_thread_bind && (local_thread == ThreadType::MAIN_THREAD); -} - -void StartMainThreadBind(c10::DeviceIndex device_id) -{ - if (!needToSetThreadAffinity() || local_thread == ThreadType::USER_THREAD) { - return; + cpu_set_t mask; + CPU_ZERO(&mask); + for (auto i = core_start; i <= core_end; i++) { + CPU_SET(i, &mask); } - - static thread_local bool seted = false; - if (!seted) { - seted = true; - if (syscall(SYS_gettid) != getpid()) { - start_main_thread_bind = true; - - SetThreadAffinity(device_id); - - CoreIdRange core_range = getCoreRange(device_id, ThreadType::MAIN_THREAD); - if (setThreadAffinityImpl(main_thread, core_range)) { - ASCEND_LOGD("Device %d set %s affinity to %d-%d success.", - device_id, threadTypeToNameMap.at(ThreadType::MAIN_THREAD).c_str(), - core_range.start, core_range.end); - } else { - ASCEND_LOGE("Device %d set %s affinity to %d-%d failed.", - device_id, threadTypeToNameMap.at(ThreadType::MAIN_THREAD).c_str(), - core_range.start, core_range.end); - } - } + if 
(!pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask)) { + ASCEND_LOGD("Set %s affinity to %d-%d success.", threadTypeToNameMap.at(local_thread).c_str(), core_start, core_end); + } else { + ASCEND_LOGE("Set %s affinity to %d-%d failed.", threadTypeToNameMap.at(local_thread).c_str(), core_start, core_end); } } diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.h b/torch_npu/csrc/core/npu/NPUAffinityController.h index e850a47b67..0ec3c4d995 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.h +++ b/torch_npu/csrc/core/npu/NPUAffinityController.h @@ -20,12 +20,9 @@ enum ThreadType { }; void SetThreadType(ThreadType type); + void SetThreadAffinity(c10::DeviceIndex device); void SetThreadAffinity(ThreadType type); void SetThreadAffinity(int core_start, int core_end); -void SetMainThread(); -bool NeedMainThreadBind(); -void StartMainThreadBind(c10::DeviceIndex device_id); - } // namespace c10_npu \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUFunctions.cpp b/torch_npu/csrc/core/npu/NPUFunctions.cpp index 3d146783f0..91b301e043 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.cpp +++ b/torch_npu/csrc/core/npu/NPUFunctions.cpp @@ -103,10 +103,7 @@ aclError SetDevice(c10::DeviceIndex device) if (local_device == device) { return ACL_ERROR_NONE; } - - if (c10_npu::NeedMainThreadBind()) { - c10_npu::SetThreadAffinity(device); - } + c10_npu::SetThreadAffinity(device); aclError err = aclrtSetDevice(device); if (err == ACL_ERROR_NONE) { diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index 579514ab37..8d8a8da6a6 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -707,8 +707,6 @@ bool Repository::CheckInit() const void StartConsume(Repository *repo, c10::DeviceIndex device_id) { SetThreadType(ThreadType::ACL_THREAD); - SetThreadAffinity(device_id); - aclError ret = c10_npu::SetDevice(device_id); if (ret != 0) { C10_NPU_SHOW_ERR_MSG(); diff --git a/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp b/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp index 739056a7c4..961022b302 100644 --- a/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp +++ b/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp @@ -52,7 +52,6 @@ void NPUGuardImpl::setDevice(c10::Device d) const void NPUGuardImpl::uncheckedSetDevice(c10::Device d) const noexcept { - c10_npu::StartMainThreadBind(d.index()); NPU_CHECK_WARN(c10_npu::SetDevice(d.index())); } diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index d705c890ed..b3bf8185d4 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -190,8 +190,6 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) lazy_fn_.clear(); - SetMainThread(); - init_flag_ = true; ASCEND_LOGD("Npu sys ctrl initialize successfully."); diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 3403c954be..74df9e25bf 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -1718,7 +1718,6 @@ void ProcessGroupHCCL::workCleanupLoop() try { if (needSetDevice) { c10::DeviceIndex device = static_cast(work.devices_[0].index()); - c10_npu::SetThreadAffinity(device); NPU_CHECK_ERROR(c10_npu::SetDevice(device)); deviceId_ = static_cast(work.devices_[0].index()); needSetDevice = false; diff --git a/torch_npu/csrc/framework/LazyInitAclops.cpp 
b/torch_npu/csrc/framework/LazyInitAclops.cpp index 5f51f9f0a5..8d12df0a31 100644 --- a/torch_npu/csrc/framework/LazyInitAclops.cpp +++ b/torch_npu/csrc/framework/LazyInitAclops.cpp @@ -4,6 +4,7 @@ #include "torch_npu/csrc/core/npu/NPUException.h" #include "torch_npu/csrc/core/npu/NpuVariables.h" +#include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "torch_npu/csrc/core/npu/register/OptionRegister.h" #include "torch_npu/csrc/framework/interface/AclOpCompileInterface.h" #include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" @@ -157,6 +158,8 @@ void SetPrecisionMode() void LazyInitAclopsCore() { + c10_npu::SetThreadAffinity(c10_npu::ThreadType::OTHER_THREAD); + #ifndef BUILD_LIBTORCH PyThreadState *gilState = nullptr; if (PyGILState_Check()) { @@ -172,6 +175,8 @@ void LazyInitAclopsCore() PyEval_RestoreThread(gilState); } #endif + + c10_npu::SetThreadAffinity(c10_npu::ThreadType::MAIN_THREAD); } void LazyInitAclops() @@ -193,10 +198,14 @@ void LazyInitAclops() void InitAclopsCore() { + SetThreadAffinity(c10_npu::ThreadType::OTHER_THREAD); + SetPrecisionMode(); MakeCompileCacheDirAndSetOption(); GetAndSetDefaultJitCompileByAcl(); SetHF32DefaultValue(); + + SetThreadAffinity(c10_npu::ThreadType::MAIN_THREAD); } void InitAclops() diff --git a/torch_npu/csrc/framework/interface/EnvVariables.cpp b/torch_npu/csrc/framework/interface/EnvVariables.cpp index 577f8ef58b..d14bb46ae1 100644 --- a/torch_npu/csrc/framework/interface/EnvVariables.cpp +++ b/torch_npu/csrc/framework/interface/EnvVariables.cpp @@ -1,5 +1,6 @@ #include #include "torch_npu/csrc/core/npu/NPUException.h" +#include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "third_party/acl/inc/acl/acl_mdl.h" #include "torch_npu/csrc/framework/utils/ForceJitCompileList.h" @@ -47,6 +48,8 @@ REGISTER_OPTION_HOOK(mdldumpconfigpath, [](const std::string &val) { aclmdlSetDump(val.c_str()); }) +static bool acl_op_has_init = false; + REGISTER_OPTION_BOOL_FUNCTION(CheckJitDisableInner, jitCompile, "enable", "disable") REGISTER_OPTION_CACHE(bool, isJitDisable, CheckJitDisableInner) REGISTER_OPTION_HOOK(jitCompile, [](const std::string &val) { @@ -58,7 +61,14 @@ REGISTER_OPTION_HOOK(jitCompile, [](const std::string &val) { "Jit compile set is disabled! If you want to set, ", "please change the environment variable ACL_OP_INIT_MODE to 0 or 1.", PTA_ERROR(ErrCode::NOT_SUPPORT)); + if (!acl_op_has_init) { + c10_npu::SetThreadAffinity(c10_npu::ThreadType::OTHER_THREAD); + } NPU_CHECK_ERROR(AclSetCompileopt(aclCompileOpt::ACL_OP_JIT_COMPILE, val.c_str())); + if (!acl_op_has_init) { + c10_npu::SetThreadAffinity(c10_npu::ThreadType::MAIN_THREAD); + acl_op_has_init = true; + } } SET_OPTION_WITH_CACHE(isJitDisable, ("disable" == val) ? 
true : false); }) diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index 9282737cbf..1f06e2d2f7 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -1582,15 +1582,6 @@ PyObject* THNPModule_npu_set_thread_affinity(PyObject* self, PyObject* args) } else { c10_npu::SetThreadAffinity(core_start, core_end); } - - Py_RETURN_NONE; - END_HANDLE_TH_ERRORS -} - -PyObject* THNPModule_npu_reset_thread_affinity(PyObject* self, PyObject* noargs) -{ - HANDLE_TH_ERRORS - c10_npu::SetThreadAffinity(c10_npu::ThreadType::MAIN_THREAD); Py_RETURN_NONE; END_HANDLE_TH_ERRORS } @@ -1719,7 +1710,6 @@ static struct PyMethodDef THNPModule_methods[] = { {"_npu_set_module_train_state", (PyCFunction)THNPModule_npu_set_module_train_state, METH_O, nullptr}, {"_get_silent_check_version", (PyCFunction)THNPModule_npu_get_silent_check_version, METH_NOARGS, nullptr}, {"_npu_set_thread_affinity", (PyCFunction)THNPModule_npu_set_thread_affinity, METH_VARARGS, nullptr}, - {"_npu_reset_thread_affinity", (PyCFunction)THNPModule_npu_reset_thread_affinity, METH_NOARGS, nullptr}, {"_npu_set_fft_plan_cache_max_size", (PyCFunction)THNPModule_npu_set_fft_plan_cache_max_size, METH_VARARGS, nullptr}, {"_npu_get_fft_plan_cache_max_size", (PyCFunction)THNPModule_npu_get_fft_plan_cache_max_size, METH_NOARGS, nullptr}, {"_npu_get_fft_plan_cache_size", (PyCFunction)THNPModule_npu_get_fft_plan_cache_size, METH_NOARGS, nullptr}, diff --git a/torch_npu/utils/__init__.py b/torch_npu/utils/__init__.py index 200325e821..29adb686a3 100644 --- a/torch_npu/utils/__init__.py +++ b/torch_npu/utils/__init__.py @@ -1,5 +1,5 @@ __all__ = ["save_async", "npu_combine_tensors", "get_part_combined_tensor", "is_combined_tensor_valid", "FlopsCounter", - "set_thread_affinity", "reset_thread_affinity"] + "set_thread_affinity"] from torch_npu import _C from ._module import _apply_module_patch @@ -19,7 +19,6 @@ from .clip_grad_norm_ import _apply_clip_grad_norm_patch from ._step import add_perf_dump_patch from .flops_count import _FlopsCounter as FlopsCounter from .affinity import _set_thread_affinity as set_thread_affinity -from .affinity import _reset_thread_affinity as reset_thread_affinity # init flopcount diff --git a/torch_npu/utils/_module.py b/torch_npu/utils/_module.py index 313736c3a1..fbe408739f 100644 --- a/torch_npu/utils/_module.py +++ b/torch_npu/utils/_module.py @@ -30,6 +30,8 @@ from torch_npu.utils.syncbatchnorm import SyncBatchNorm as sync_batch_norm from torch_npu.utils._error_code import ErrCode, pta_error origin_mpdl_iter_init = _MultiProcessingDataLoaderIter.__init__ +origin_worker_loop = worker._worker_loop +origin_pin_memory_loop = pin_memory._pin_memory_loop CONV3D_SUPPORT_FP32_SOC_PREFIX = ["Ascend910B", "Ascend910_93"] @@ -368,9 +370,17 @@ def _mpdl_iter_init(self, *args, **kwargs): torch_npu.npu.synchronize() except Exception as e: print(e) - torch_npu._C._npu_set_thread_affinity(-1, -1) origin_mpdl_iter_init(self, *args, **kwargs) - torch_npu._C._npu_reset_thread_affinity() + + +def _npu_worker_loop(*args, **kwargs): + torch_npu._C._npu_set_thread_affinity(-1, -1) + origin_worker_loop(*args, **kwargs) + + +def _npu_pin_memory_loop(*args, **kwargs): + torch_npu._C._npu_set_thread_affinity(-1, -1) + origin_pin_memory_loop(*args, **kwargs) def _parallel_apply( @@ -523,3 +533,5 @@ def _apply_module_patch(): torch.nn.parallel.DataParallel.parallel_apply = npu_parallel_apply torch.nn.parallel.data_parallel = npu_data_parallel 
torch.utils.data.dataloader._MultiProcessingDataLoaderIter.__init__ = _mpdl_iter_init + torch.utils.data._utils.worker._worker_loop = _npu_worker_loop + torch.utils.data._utils.pin_memory._pin_memory_loop = _npu_pin_memory_loop diff --git a/torch_npu/utils/affinity.py b/torch_npu/utils/affinity.py index 37973f5bc7..7728736baa 100644 --- a/torch_npu/utils/affinity.py +++ b/torch_npu/utils/affinity.py @@ -14,8 +14,4 @@ def _set_thread_affinity(core_range: List[int] = None): raise ValueError("Core range should be nonnegative." + pta_error(ErrCode.PARAM)) torch_npu._C._npu_set_thread_affinity(core_range[0], core_range[1]) else: - raise ValueError("The length of input list of set_thread_affinity should be 2." + pta_error(ErrCode.PARAM)) - - -def _reset_thread_affinity(): - torch_npu._C._npu_reset_thread_affinity() \ No newline at end of file + raise ValueError("The length of input list of set_thread_affinity should be 2." + pta_error(ErrCode.PARAM)) \ No newline at end of file -- Gitee From 910269319fe58e7d198b26e77ced7fd235672476 Mon Sep 17 00:00:00 2001 From: chen_liqing Date: Tue, 12 Aug 2025 14:41:09 +0800 Subject: [PATCH 3/7] Revert "!21067 Lazy getThreadAffinityInfo()" This reverts commit 679469d1571865ca604c787ee60e1c4f85e83ff6. --- torch_npu/__init__.py | 4 ++- .../csrc/core/npu/NPUAffinityController.cpp | 26 +++++-------------- .../csrc/core/npu/NPUAffinityController.h | 1 + torch_npu/csrc/npu/Module.cpp | 9 +++++++ torch_npu/utils/affinity.py | 2 +- 5 files changed, 20 insertions(+), 22 deletions(-) diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py index ff0450375f..2b8fc18809 100644 --- a/torch_npu/__init__.py +++ b/torch_npu/__init__.py @@ -272,4 +272,6 @@ if 'TORCH_NPU_SANITIZER' in os.environ: if hasattr(sys, 'ps1'): os.environ["TASK_QUEUE_ENABLE"] = '0' warnings.warn("On the interactive interface, the value of TASK_QUEUE_ENABLE is set to 0 by default. 
\ - Do not set it to 1 to prevent some unknown errors") \ No newline at end of file + Do not set it to 1 to prevent some unknown errors") + +torch_npu._C._npu_get_thread_affinity() \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.cpp b/torch_npu/csrc/core/npu/NPUAffinityController.cpp index a331439d9f..c2c6e1baee 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.cpp +++ b/torch_npu/csrc/core/npu/NPUAffinityController.cpp @@ -12,6 +12,7 @@ namespace c10_npu { +static bool has_set_affinity = false; static thread_local ThreadType local_thread = ThreadType::MAIN_THREAD; using ThreadCoreMap = std::unordered_map; @@ -158,32 +159,23 @@ void printCoreRanges(const uint32_t mode, const std::vector &ranges ASCEND_LOGD("Read CPU affinity config: %s", oss.str().c_str()); } -bool getThreadAffinityInfo() +void GetThreadAffinityInfo() { parseCPUAffinityConf(cpu_affinity_mode, device_ranges); printCoreRanges(cpu_affinity_mode, device_ranges); - if (cpu_affinity_mode == 0) { - return false; - } - cpu_set_t mask; pthread_getaffinity_np(pthread_self(), sizeof(mask), &mask); for (auto &range : device_ranges) { for (unsigned int i = range.start; i < range.end; i++) { if (!CPU_ISSET(i, &mask)) { ASCEND_LOGW("Thread affinity is already set."); - return false; + has_set_affinity = true; + return; } } } - return true; -} - -inline bool needToSetThreadAffinity() -{ - static bool need_to_set_affinity = getThreadAffinityInfo(); - return need_to_set_affinity; + has_set_affinity = false; } void SetThreadType(ThreadType type) @@ -244,7 +236,7 @@ ThreadCoreMap getCpuAffinityMap(c10::DeviceIndex device_id, const std::vector(device_index); @@ -286,9 +275,6 @@ void SetThreadAffinity(ThreadType type) void SetThreadAffinity(int core_start, int core_end) { - if (!needToSetThreadAffinity()) { - return; - } static int core_nums = sysconf(_SC_NPROCESSORS_ONLN); core_start = std::min(core_start, core_nums); core_end = std::min(core_end, core_nums); diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.h b/torch_npu/csrc/core/npu/NPUAffinityController.h index 0ec3c4d995..f4162517a7 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.h +++ b/torch_npu/csrc/core/npu/NPUAffinityController.h @@ -19,6 +19,7 @@ enum ThreadType { USER_THREAD = 5, // Thread responsible for user. 
}; +void GetThreadAffinityInfo(); void SetThreadType(ThreadType type); void SetThreadAffinity(c10::DeviceIndex device); diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index 1f06e2d2f7..a85d9a5366 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -1568,6 +1568,14 @@ PyObject* THNPModule_npu_get_silent_check_version(PyObject* self, PyObject* noar END_HANDLE_TH_ERRORS } +PyObject* THNPModule_npu_get_thread_affinity(PyObject* self, PyObject* noargs) +{ + HANDLE_TH_ERRORS + c10_npu::GetThreadAffinityInfo(); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + PyObject* THNPModule_npu_set_thread_affinity(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS @@ -1709,6 +1717,7 @@ static struct PyMethodDef THNPModule_methods[] = { {"_npu_set_call_state", (PyCFunction)THNPModule_npu_set_call_state, METH_O, nullptr}, {"_npu_set_module_train_state", (PyCFunction)THNPModule_npu_set_module_train_state, METH_O, nullptr}, {"_get_silent_check_version", (PyCFunction)THNPModule_npu_get_silent_check_version, METH_NOARGS, nullptr}, + {"_npu_get_thread_affinity", (PyCFunction)THNPModule_npu_get_thread_affinity, METH_NOARGS, nullptr}, {"_npu_set_thread_affinity", (PyCFunction)THNPModule_npu_set_thread_affinity, METH_VARARGS, nullptr}, {"_npu_set_fft_plan_cache_max_size", (PyCFunction)THNPModule_npu_set_fft_plan_cache_max_size, METH_VARARGS, nullptr}, {"_npu_get_fft_plan_cache_max_size", (PyCFunction)THNPModule_npu_get_fft_plan_cache_max_size, METH_NOARGS, nullptr}, diff --git a/torch_npu/utils/affinity.py b/torch_npu/utils/affinity.py index 7728736baa..bfa16f45dd 100644 --- a/torch_npu/utils/affinity.py +++ b/torch_npu/utils/affinity.py @@ -11,7 +11,7 @@ def _set_thread_affinity(core_range: List[int] = None): torch_npu._C._npu_set_thread_affinity(-1, -1) elif (len(core_range) == 2): if core_range[0] < 0 or core_range[1] < 0: - raise ValueError("Core range should be nonnegative." + pta_error(ErrCode.PARAM)) + raise ValueError("Expected core core_range should be nonnegative." + pta_error(ErrCode.PARAM)) torch_npu._C._npu_set_thread_affinity(core_range[0], core_range[1]) else: raise ValueError("The length of input list of set_thread_affinity should be 2." + pta_error(ErrCode.PARAM)) \ No newline at end of file -- Gitee From 1addf0eb7018a9445e32b166aaffe1b6253f300d Mon Sep 17 00:00:00 2001 From: chen_liqing Date: Tue, 12 Aug 2025 14:41:21 +0800 Subject: [PATCH 4/7] Revert "!21028 Change thread affinity while aclop init" This reverts commit 077eaa6d09e9dd7e3d78f172ddd79425a2a26fd8. 
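
Note: this revert drops the temporary OTHER_THREAD/MAIN_THREAD rebinding that the original commit wrapped around aclop initialization (LazyInitAclopsCore, InitAclopsCore and the jitCompile option hook). The mechanism behind those calls is an ordinary Linux CPU affinity mask on the calling thread, as seen in the hunks below; a minimal standalone sketch of that pattern, using a hypothetical core range rather than the controller's per-device mapping, is:

    #define _GNU_SOURCE
    #include <pthread.h>
    #include <sched.h>

    // Pin the calling thread to cores [start, end]; returns 0 on success,
    // or an errno-style code from pthread_setaffinity_np on failure.
    static int bind_current_thread(unsigned start, unsigned end)
    {
        cpu_set_t mask;
        CPU_ZERO(&mask);
        for (unsigned i = start; i <= end; ++i) {
            CPU_SET(i, &mask);
        }
        return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);
    }

    // Usage sketch: bind to a narrow range before the heavy init phase and
    // rebind to the main-thread range afterwards, mirroring the
    // SetThreadAffinity(OTHER_THREAD) / SetThreadAffinity(MAIN_THREAD) pair
    // that this patch removes.
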
--- torch_npu/csrc/core/npu/NPUAffinityController.cpp | 9 --------- torch_npu/csrc/core/npu/NPUAffinityController.h | 2 -- torch_npu/csrc/framework/LazyInitAclops.cpp | 9 --------- torch_npu/csrc/framework/interface/EnvVariables.cpp | 10 ---------- torch_npu/csrc/npu/Module.cpp | 6 +++++- 5 files changed, 5 insertions(+), 31 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.cpp b/torch_npu/csrc/core/npu/NPUAffinityController.cpp index c2c6e1baee..6f339e9177 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.cpp +++ b/torch_npu/csrc/core/npu/NPUAffinityController.cpp @@ -264,15 +264,6 @@ void SetThreadAffinity(c10::DeviceIndex device_id) } } -void SetThreadAffinity(ThreadType type) -{ - int device_index; - NPU_CHECK_ERROR_WITHOUT_UCE(GetDevice(&device_index)); - c10::DeviceIndex device = static_cast(device_index); - SetThreadType(type); - SetThreadAffinity(device); -} - void SetThreadAffinity(int core_start, int core_end) { static int core_nums = sysconf(_SC_NPROCESSORS_ONLN); diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.h b/torch_npu/csrc/core/npu/NPUAffinityController.h index f4162517a7..6c15e8c20e 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.h +++ b/torch_npu/csrc/core/npu/NPUAffinityController.h @@ -21,9 +21,7 @@ enum ThreadType { void GetThreadAffinityInfo(); void SetThreadType(ThreadType type); - void SetThreadAffinity(c10::DeviceIndex device); -void SetThreadAffinity(ThreadType type); void SetThreadAffinity(int core_start, int core_end); } // namespace c10_npu \ No newline at end of file diff --git a/torch_npu/csrc/framework/LazyInitAclops.cpp b/torch_npu/csrc/framework/LazyInitAclops.cpp index 8d12df0a31..5f51f9f0a5 100644 --- a/torch_npu/csrc/framework/LazyInitAclops.cpp +++ b/torch_npu/csrc/framework/LazyInitAclops.cpp @@ -4,7 +4,6 @@ #include "torch_npu/csrc/core/npu/NPUException.h" #include "torch_npu/csrc/core/npu/NpuVariables.h" -#include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "torch_npu/csrc/core/npu/register/OptionRegister.h" #include "torch_npu/csrc/framework/interface/AclOpCompileInterface.h" #include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" @@ -158,8 +157,6 @@ void SetPrecisionMode() void LazyInitAclopsCore() { - c10_npu::SetThreadAffinity(c10_npu::ThreadType::OTHER_THREAD); - #ifndef BUILD_LIBTORCH PyThreadState *gilState = nullptr; if (PyGILState_Check()) { @@ -175,8 +172,6 @@ void LazyInitAclopsCore() PyEval_RestoreThread(gilState); } #endif - - c10_npu::SetThreadAffinity(c10_npu::ThreadType::MAIN_THREAD); } void LazyInitAclops() @@ -198,14 +193,10 @@ void LazyInitAclops() void InitAclopsCore() { - SetThreadAffinity(c10_npu::ThreadType::OTHER_THREAD); - SetPrecisionMode(); MakeCompileCacheDirAndSetOption(); GetAndSetDefaultJitCompileByAcl(); SetHF32DefaultValue(); - - SetThreadAffinity(c10_npu::ThreadType::MAIN_THREAD); } void InitAclops() diff --git a/torch_npu/csrc/framework/interface/EnvVariables.cpp b/torch_npu/csrc/framework/interface/EnvVariables.cpp index d14bb46ae1..577f8ef58b 100644 --- a/torch_npu/csrc/framework/interface/EnvVariables.cpp +++ b/torch_npu/csrc/framework/interface/EnvVariables.cpp @@ -1,6 +1,5 @@ #include #include "torch_npu/csrc/core/npu/NPUException.h" -#include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "third_party/acl/inc/acl/acl_mdl.h" #include "torch_npu/csrc/framework/utils/ForceJitCompileList.h" @@ -48,8 +47,6 @@ REGISTER_OPTION_HOOK(mdldumpconfigpath, [](const std::string &val) { aclmdlSetDump(val.c_str()); }) -static bool 
acl_op_has_init = false; - REGISTER_OPTION_BOOL_FUNCTION(CheckJitDisableInner, jitCompile, "enable", "disable") REGISTER_OPTION_CACHE(bool, isJitDisable, CheckJitDisableInner) REGISTER_OPTION_HOOK(jitCompile, [](const std::string &val) { @@ -61,14 +58,7 @@ REGISTER_OPTION_HOOK(jitCompile, [](const std::string &val) { "Jit compile set is disabled! If you want to set, ", "please change the environment variable ACL_OP_INIT_MODE to 0 or 1.", PTA_ERROR(ErrCode::NOT_SUPPORT)); - if (!acl_op_has_init) { - c10_npu::SetThreadAffinity(c10_npu::ThreadType::OTHER_THREAD); - } NPU_CHECK_ERROR(AclSetCompileopt(aclCompileOpt::ACL_OP_JIT_COMPILE, val.c_str())); - if (!acl_op_has_init) { - c10_npu::SetThreadAffinity(c10_npu::ThreadType::MAIN_THREAD); - acl_op_has_init = true; - } } SET_OPTION_WITH_CACHE(isJitDisable, ("disable" == val) ? true : false); }) diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index a85d9a5366..f3563d1bdf 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -1586,7 +1586,11 @@ PyObject* THNPModule_npu_set_thread_affinity(PyObject* self, PyObject* args) } if (core_start == -1) { - c10_npu::SetThreadAffinity(c10_npu::ThreadType::OTHER_THREAD); + int device_index; + NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::GetDevice(&device_index)); + c10::DeviceIndex device = static_cast(device_index); + c10_npu::SetThreadType(c10_npu::ThreadType::OTHER_THREAD); + c10_npu::SetThreadAffinity(device); } else { c10_npu::SetThreadAffinity(core_start, core_end); } -- Gitee From 80e7beaf8a4463a459471bb8342123d10f85be63 Mon Sep 17 00:00:00 2001 From: chen_liqing Date: Tue, 12 Aug 2025 14:43:48 +0800 Subject: [PATCH 5/7] Revert "!20142 CPU Affinity Optimization" This reverts commit 65acf094ba8d63f51a87b2d5d95d5b208a4c364e. --- test/torch_npu_schema.json | 3 - torch_npu/__init__.py | 4 +- .../csrc/core/npu/NPUAffinityController.cpp | 480 +++++++++--------- .../csrc/core/npu/NPUAffinityController.h | 54 +- torch_npu/csrc/core/npu/NPUFunctions.cpp | 2 - torch_npu/csrc/core/npu/NPUQueue.cpp | 6 +- torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp | 1 + .../csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 4 + .../csrc/distributed/ProcessGroupHCCL.cpp | 4 +- torch_npu/csrc/npu/Module.cpp | 32 +- torch_npu/utils/__init__.py | 6 +- torch_npu/utils/_module.py | 19 +- torch_npu/utils/affinity.py | 17 - 13 files changed, 305 insertions(+), 327 deletions(-) delete mode 100644 torch_npu/utils/affinity.py diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json index a273a3d9a6..771e4ec984 100644 --- a/test/torch_npu_schema.json +++ b/test/torch_npu_schema.json @@ -2786,9 +2786,6 @@ "torch_npu.npu_all_gather_base_mm": { "signature": "(*args, **kwargs)" }, - "torch_npu.utils.set_thread_affinity": { - "signature": "(core_range: List[int] = None)" - }, "torch_npu.dynamo.torchair.scope.npu_stream_switch": { "signature": "(stream_tag: str, stream_priority: int = 0)" }, diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py index 2b8fc18809..ff0450375f 100644 --- a/torch_npu/__init__.py +++ b/torch_npu/__init__.py @@ -272,6 +272,4 @@ if 'TORCH_NPU_SANITIZER' in os.environ: if hasattr(sys, 'ps1'): os.environ["TASK_QUEUE_ENABLE"] = '0' warnings.warn("On the interactive interface, the value of TASK_QUEUE_ENABLE is set to 0 by default. 
\ - Do not set it to 1 to prevent some unknown errors") - -torch_npu._C._npu_get_thread_affinity() \ No newline at end of file + Do not set it to 1 to prevent some unknown errors") \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.cpp b/torch_npu/csrc/core/npu/NPUAffinityController.cpp index 6f339e9177..cb7d7a928e 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.cpp +++ b/torch_npu/csrc/core/npu/NPUAffinityController.cpp @@ -1,286 +1,294 @@ + #include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "torch_npu/csrc/core/npu/NPUFunctions.h" -#include "torch_npu/csrc/core/npu/GetAffinityCPUInfo.h" -#include "torch_npu/csrc/core/npu/NpuVariables.h" #include #include #include +#include #include #include #include +#include +#include namespace c10_npu { -static bool has_set_affinity = false; -static thread_local ThreadType local_thread = ThreadType::MAIN_THREAD; - -using ThreadCoreMap = std::unordered_map; - -static uint32_t cpu_affinity_mode; -static std::vector device_ranges; -static std::unordered_map device_thread_core_maps; - -const std::initializer_list threadTypeList = { - MAIN_THREAD, ACL_THREAD, RELEASE_THREAD, WATCHDOG_THREAD, OTHER_THREAD}; - -const std::unordered_map threadTypeToNameMap = { - {MAIN_THREAD, "main_thread"}, - {ACL_THREAD, "acl_thread"}, - {RELEASE_THREAD, "release_thread"}, - {WATCHDOG_THREAD, "hccl_watchdog_t"}, - {OTHER_THREAD, "other_thread"}, - {USER_THREAD, "user_thread"}}; - -CoreIdRange getCPUDefaultRange(c10::DeviceIndex device_id) -{ - static int core_nums = sysconf(_SC_NPROCESSORS_ONLN); - int device_nums = device_count_ensure_non_zero(); - int block_size = (core_nums > 0 && device_nums > 0) ? core_nums / device_nums : 0; - return CoreIdRange{static_cast(device_id * block_size), - static_cast((device_id + 1) * block_size - 1)}; -} - -inline bool isAllDigits(const std::string &str) -{ - if (str.empty()) { + static pthread_t mainthread_tid; + static bool has_set_affinity = false; + + const std::unordered_map threadTypeToNameMap = { + {releaseThread, "release_thread"}, + {aclThread, "acl_thread"}, + {mainThread, "main_thread"}, + {hcclCommWatchdogThread, "hcclComm_watchd"}, // thread name no more than 15 chars + {backwardThread, "backward_thread"}}; + + const std::unordered_map threadNameToTypeMap = { + {"release_thread", releaseThread}, + {"acl_thread", aclThread}, + {"main_thread", mainThread}, + {"hcclComm_watchd", hcclCommWatchdogThread}, + {"backward_thread", backwardThread}}; + + inline bool has_set_pthread_affinity() + { + unsigned int core_nums = static_cast(sysconf(_SC_NPROCESSORS_ONLN)); + + cpu_set_t mask; + pthread_getaffinity_np(pthread_self(), sizeof(mask), &mask); + for (unsigned int i = 0; i < core_nums; i++) { + if (!CPU_ISSET(i, &mask)) { + return true; + } + } return false; } - return std::all_of(str.begin(), str.end(), [](unsigned char c) { - return std::isdigit(c); - }); -} - -void parseCPUAffinityConf(uint32_t &mode, std::vector &ranges) -{ - // init - int device_nums = device_count_ensure_non_zero(); - ranges.clear(); - ranges.resize(device_nums); - for (int i = 0; i < device_nums; ++i) { - ranges[i] = getCPUDefaultRange(i); - } - mode = 0; - const char *input = c10_npu::option::OptionsManager::GetCpuAffinityConf(); - if (input == nullptr || strlen(input) == 0) { - return; + void GetAffinityInfo() + { + mainthread_tid = pthread_self(); + has_set_affinity = has_set_pthread_affinity(); } - std::string inputStr(input); - std::istringstream stream(inputStr); - std::string option; - - 
std::regex pattern("npu_affine:(\\d)"); - std::smatch match; - if (std::regex_search(inputStr, match, pattern)) { - int isAffinity = std::stoi(match[1].str()); - if (isAffinity != 0) { - for (int i = 0; i < device_nums; i++) { - CoreIdRange getRange = GetAssignAffinityCPU(i); - if (getRange.start == 0 && getRange.end == 0) { - break; - } - ranges[i] = getRange; + ThreadType getCurrentThreadType() + { + char thread_name[16]; + + if (prctl(PR_GET_NAME, thread_name, 0, 0, 0) == 0) { + std::string name(thread_name); + + auto it = threadNameToTypeMap.find(name); + if (it != threadNameToTypeMap.end()) { + return it->second; } } + return ThreadType::unknownThread; } - // Handle cases where only `mode` is provided, or `mode:` without value - if (isAllDigits(inputStr)) { - mode = static_cast(std::stoi(inputStr)); - return; // Return directly, `mode` has already been processed + aclError SetThreadAffinity(coreIdRange core_range, pthread_t thread) + { + cpu_set_t mask; + CPU_ZERO(&mask); + + for (auto i = core_range.start; i <= core_range.end; i++) { + CPU_SET(i, &mask); + } + if (!pthread_setaffinity_np(thread, sizeof(mask), &mask)) { + ASCEND_LOGD("Set Thread Affinity to %d-%d", core_range.start, core_range.end); + return ACL_ERROR_NONE; + } + return ACL_ERROR_FEATURE_UNSUPPORTED; + } + + coreIdRange GetCPUDefaultRange(c10::DeviceIndex device_id) + { + int core_nums = sysconf(_SC_NPROCESSORS_ONLN); + int device_nums = device_count_ensure_non_zero(); + int block_size = (core_nums > 0 && device_nums > 0) ? (core_nums + device_nums - 1) / device_nums : 0; + return coreIdRange{static_cast(device_id * block_size), + static_cast(std::min((device_id + 1) * block_size, core_nums) - 1)}; } - // Parse each option - while (std::getline(stream, option, ',')) { - // Split `option` based on colon - size_t colonPos = option.find(':'); - if (colonPos != std::string::npos) { - std::string key = option.substr(0, colonPos); - std::string value = option.substr(colonPos + 1); - - // Process `mode` - if (key == "mode") { - if (isAllDigits(value)) { - mode = static_cast(std::stoi(value)); + + std::string GetAffinityMapAsString(const std::unordered_map &threadToCoreidMap, c10::DeviceIndex device_id) + { + std::ostringstream oss; + oss << "threadToCoreidMap plan to bind device " << static_cast(device_id) << " to " + << " [" << threadToCoreidMap.at(unknownThread).start << "," << threadToCoreidMap.at(unknownThread).end << "]、" + << " [" << threadToCoreidMap.at(mainThread).start << "," << threadToCoreidMap.at(mainThread).end << "]、" + << " [" << threadToCoreidMap.at(backwardThread).start << "," << threadToCoreidMap.at(backwardThread).end << "]、" + << " [" << threadToCoreidMap.at(aclThread).start << "," << threadToCoreidMap.at(aclThread).end << "]、" + << " [" << threadToCoreidMap.at(releaseThread).start << "," << threadToCoreidMap.at(releaseThread).end << "]、" + << " [" << threadToCoreidMap.at(hcclCommWatchdogThread).start << "," << threadToCoreidMap.at(hcclCommWatchdogThread).end << "]"; + + return oss.str(); + } + + std::unordered_map GetCpuAffinityMap(c10::DeviceIndex device_id) + { + std::unordered_map threadToCoreidMap; + std::initializer_list thread_types = {unknownThread, mainThread, backwardThread, aclThread, + releaseThread, hcclCommWatchdogThread}; + + coreIdRange current_core_range = GetCPUDefaultRange(device_id); + coreId offset = current_core_range.start; + + // calculate env2 default map + coreId core_nums = current_core_range.end - current_core_range.start; + if (core_nums < thread_types.size()) { + 
ASCEND_LOGW("Available core numbers (%d) are insufficient for all %zu thread types. Binding available cores to all threads.", + core_nums, thread_types.size()); + for (auto thread_type : thread_types) { + threadToCoreidMap[thread_type] = current_core_range; + } + } else { + size_t remaining_type_count = thread_types.size() - 1; + int i = 0; + for (auto thread_type : thread_types) { + if (thread_type == ThreadType::unknownThread) { + threadToCoreidMap[ThreadType::unknownThread] = coreIdRange{current_core_range.start + remaining_type_count, current_core_range.end}; } else { - ASCEND_LOGW("mode is %s, should be all digits", value.c_str()); - } - } else if (key.rfind("npu", 0) == 0) { - // Handle NPU core binding range - // The key is like 'npu:0', so skip first 3 chars. - if (isAllDigits(key.substr(3))) { - int device_id = std::stoi(key.substr(3)); // Parse NPU device ID - if (device_id < device_nums) { - size_t dashPos = value.find('-'); - if (dashPos != std::string::npos) { - std::string startStr = value.substr(0, dashPos); - std::string endStr = value.substr(dashPos + 1); - if (isAllDigits(startStr) && isAllDigits(endStr)) { - CoreId start = static_cast(std::stoi(startStr)); - CoreId end = static_cast(std::stoi(endStr)); - ranges[device_id] = {start, end}; - } else { - ASCEND_LOGW("core range is %s-%s, should be all digits", startStr.c_str(), endStr.c_str()); - } - } else { - if (isAllDigits(value)) { - CoreId singleCore = static_cast(std::stoi(value)); - ranges[device_id] = {singleCore, singleCore}; - } else { - ASCEND_LOGW("core range is string : %s, should be all digits", value.c_str()); - } - } - } + threadToCoreidMap[thread_type] = coreIdRange{offset + i, offset + (i++)}; } } - } else if (isAllDigits(option)) { - // If no colon and the value is a number, use it directly as `mode` - mode = static_cast(std::stoi(option)); } - } -} -void printCoreRanges(const uint32_t mode, const std::vector &ranges) -{ - std::ostringstream oss; - oss << "Mode: " << mode << ". Core range for each device ID: "; + ASCEND_LOGD("Thread affinity map for device %d: %s", device_id, GetAffinityMapAsString(threadToCoreidMap, device_id).c_str()); - for (size_t i = 0; i < ranges.size(); ++i) { - oss << "Device " << i << ": [" << ranges[i].start << "," << ranges[i].end << "]"; - if (i != ranges.size() - 1) { - oss << "; "; - } else { - oss << "."; - } + return threadToCoreidMap; } - ASCEND_LOGD("Read CPU affinity config: %s", oss.str().c_str()); -} + aclError SetThreadAffinity(c10::DeviceIndex device_id) + { + return SetThreadAffinity(device_id, getCurrentThreadType()); + } -void GetThreadAffinityInfo() -{ - parseCPUAffinityConf(cpu_affinity_mode, device_ranges); - printCoreRanges(cpu_affinity_mode, device_ranges); + void printCoreRanges(const std::vector &ranges, uint32_t mode) + { + std::ostringstream oss; + oss << "Mode: " << mode << " "; - cpu_set_t mask; - pthread_getaffinity_np(pthread_self(), sizeof(mask), &mask); - for (auto &range : device_ranges) { - for (unsigned int i = range.start; i < range.end; i++) { - if (!CPU_ISSET(i, &mask)) { - ASCEND_LOGW("Thread affinity is already set."); - has_set_affinity = true; - return; - } + for (size_t i = 0; i < ranges.size(); ++i) { + oss << "Device " << i << " Core Range: " << ranges[i].start << " - " << ranges[i].end << " "; } + + ASCEND_LOGD("Core ranges: %s", oss.str().c_str()); } - has_set_affinity = false; -} - -void SetThreadType(ThreadType type) -{ - // Called at the start of the thread's execution to avoid frequent triggering of this function. 
- local_thread = type; - if (type == ThreadType::OTHER_THREAD || type == ThreadType::MAIN_THREAD) { - return; - } - if (prctl(PR_SET_NAME, threadTypeToNameMap.at(type).c_str()) != 0) { - ASCEND_LOGW("Set thread name of %s failed!", threadTypeToNameMap.at(type).c_str()); - } -} - -std::string getAffinityMapAsString(c10::DeviceIndex device_id, const ThreadCoreMap &threadCoreMap) -{ - std::ostringstream oss; - for (auto local_thread : threadTypeList) { - oss << threadTypeToNameMap.at(local_thread) << " : [" - << threadCoreMap.at(local_thread).start << "," - << threadCoreMap.at(local_thread).end << "]"; - if (local_thread != OTHER_THREAD) { - oss << "; "; - } else { - oss << "."; + + bool isAllDigits(const std::string &str) + { + if (str.empty()) { + return false; } + return std::all_of(str.begin(), str.end(), [](unsigned char c) { + return std::isdigit(c); + }); } - return oss.str(); -} - -ThreadCoreMap getCpuAffinityMap(c10::DeviceIndex device_id, const std::vector &device_ranges) -{ - ThreadCoreMap threadCoreMap; - CoreIdRange range = device_ranges[device_id]; - unsigned int core_nums = range.end - range.start + 1; - if (core_nums < threadTypeList.size()) { - ASCEND_LOGW("Device %d available core numbers (%d) are insufficient for all %zu thread types and will bind available cores to all threads.", - device_id, core_nums, threadTypeList.size()); - for (auto local_thread : threadTypeList) { - threadCoreMap[local_thread] = range; + + void parseCPUAffinityConf(uint32_t &mode, std::vector &ranges) + { + const char *input = c10_npu::option::OptionsManager::GetCpuAffinityConf(); + + if (input == nullptr || strlen(input) == 0) { + mode = 0; + return; } - return threadCoreMap; - } - CoreId now = range.start; - for (auto local_thread : threadTypeList) { - if (local_thread != ThreadType::OTHER_THREAD) { - threadCoreMap[local_thread] = CoreIdRange{now, now}; - } else { - threadCoreMap[ThreadType::OTHER_THREAD] = CoreIdRange{now, range.end}; + mode = 0; + int device_nums = device_count_ensure_non_zero(); + ranges.clear(); + ranges.resize(device_nums); + + // init + for (int i = 0; i < device_nums; ++i) { + ranges[i] = GetCPUDefaultRange(i); } - now++; - } - ASCEND_LOGD("Device %d thread affinity map: %s", device_id, getAffinityMapAsString(device_id, threadCoreMap).c_str()); - return threadCoreMap; -} + std::string inputStr(input); + std::istringstream stream(inputStr); + std::string option; -void SetThreadAffinity(c10::DeviceIndex device_id) -{ - if (has_set_affinity || cpu_affinity_mode == 0 || local_thread == ThreadType::USER_THREAD) { - return; - } + // Handle cases where only `mode` is provided, or `mode:` without value + if (isAllDigits(inputStr)) { + mode = static_cast(std::stoi(inputStr)); + return; // Return directly, `mode` has already been processed + } - CoreIdRange core_range; - if (cpu_affinity_mode == 1) { - core_range = device_ranges[device_id]; - } else { - if (device_thread_core_maps.find(device_id) == device_thread_core_maps.end()) { - device_thread_core_maps.emplace(device_id, getCpuAffinityMap(device_id, device_ranges)); + // Parse each option + while (std::getline(stream, option, ',')) { + // Split `option` based on colon + size_t colonPos = option.find(':'); + if (colonPos != std::string::npos) { + std::string key = option.substr(0, colonPos); + std::string value = option.substr(colonPos + 1); + + // Process `mode` + if (key == "mode") { + if (isAllDigits(value)) { + mode = static_cast(std::stoi(value)); + } else { + ASCEND_LOGW("mode is %s, should be all digits", value.c_str()); + 
} + } else if (key.rfind("npu", 0) == 0) { + // Handle NPU core binding range + if (isAllDigits(key.substr(3))) { + int device_id = std::stoi(key.substr(3)); // Parse NPU device ID + if (device_id < device_nums) { + size_t dashPos = value.find('-'); + if (dashPos != std::string::npos) { + std::string startStr = value.substr(0, dashPos); + std::string endStr = value.substr(dashPos + 1); + if (isAllDigits(startStr) && isAllDigits(endStr)) { + coreId start = static_cast(std::stoi(startStr)); + coreId end = static_cast(std::stoi(endStr)); + ranges[device_id] = {start, end}; + } else { + ASCEND_LOGW("core range is %s-%s, should be all digits", startStr.c_str(), endStr.c_str()); + } + } else { + if (isAllDigits(value)) { + coreId singleCore = static_cast(std::stoi(value)); + ranges[device_id] = {singleCore, singleCore}; + } else { + ASCEND_LOGW("core range is string : %s, should be all digits", value.c_str()); + } + } + } + } + } + } else if (isAllDigits(option)) { + // If no colon and the value is a number, use it directly as `mode` + mode = static_cast(std::stoi(option)); + } } - core_range = device_thread_core_maps.at(device_id).at(local_thread); } - cpu_set_t mask; - CPU_ZERO(&mask); - for (auto i = core_range.start; i <= core_range.end; i++) { - CPU_SET(i, &mask); - } - if (!pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask)) { - ASCEND_LOGD("Device %d set %s affinity to %d-%d success.", - device_id, threadTypeToNameMap.at(local_thread).c_str(), core_range.start, core_range.end); - } else { - ASCEND_LOGE("Device %d set %s affinity to %d-%d failed.", - device_id, threadTypeToNameMap.at(local_thread).c_str(), core_range.start, core_range.end); + aclError SetThreadAffinity(c10::DeviceIndex device_id, ThreadType current_thread_type) + { + if (has_set_affinity) { + ASCEND_LOGW("Thread affinity is already set."); + return ACL_ERROR_NONE; + } + uint32_t bind_conf; + std::vector ranges; + parseCPUAffinityConf(bind_conf, ranges); + printCoreRanges(ranges, bind_conf); + + // bind_conf=1, bind cores averagely based on device_id + if (bind_conf == 1) { + return SetThreadAffinity(ranges[device_id], pthread_self()); + } else if (bind_conf == 2) { + auto thread_core_map = GetCpuAffinityMap(device_id); + // Bind the main thread only when the dispatch phase begins (i.e., when ThreadType::backwardThread is set) + if (current_thread_type == ThreadType::backwardThread) { + SetThreadAffinity(thread_core_map.at(ThreadType::mainThread), mainthread_tid); + } + return SetThreadAffinity(thread_core_map.at(current_thread_type), pthread_self()); + } else { + ASCEND_LOGD("Thread affinity setting is disabled."); + } + return ACL_ERROR_NONE; } -} - -void SetThreadAffinity(int core_start, int core_end) -{ - static int core_nums = sysconf(_SC_NPROCESSORS_ONLN); - core_start = std::min(core_start, core_nums); - core_end = std::min(core_end, core_nums); - local_thread = ThreadType::USER_THREAD; - - cpu_set_t mask; - CPU_ZERO(&mask); - for (auto i = core_start; i <= core_end; i++) { - CPU_SET(i, &mask); + + void SetBackwardThreadName(c10::DeviceIndex device_id) + { + static thread_local bool seted = false; + if (!seted) { + seted = true; + if (syscall(SYS_gettid) != getpid()) { + SetThreadName(ThreadType::backwardThread); + SetThreadAffinity(device_id); + } + } } - if (!pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask)) { - ASCEND_LOGD("Set %s affinity to %d-%d success.", threadTypeToNameMap.at(local_thread).c_str(), core_start, core_end); - } else { - ASCEND_LOGE("Set %s affinity to %d-%d failed.", 
threadTypeToNameMap.at(local_thread).c_str(), core_start, core_end); + + void SetThreadName(ThreadType type) + { + // Ensure this is called at the start of the thread's execution to avoid frequent triggering of this function. + if (prctl(PR_SET_NAME, threadTypeToNameMap.at(type).c_str()) != 0) { + ASCEND_LOGW("set thread name failed!"); + } } -} -} // namespace c10_npu \ No newline at end of file +} \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.h b/torch_npu/csrc/core/npu/NPUAffinityController.h index 6c15e8c20e..f2e78b69b6 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.h +++ b/torch_npu/csrc/core/npu/NPUAffinityController.h @@ -1,27 +1,35 @@ #pragma once -#include +#include "torch_npu/csrc/core/npu/npu_log.h" namespace c10_npu { -using CoreId = unsigned int; -struct CoreIdRange { - CoreId start; - CoreId end; -}; - -enum ThreadType { - MAIN_THREAD = 0, // 1st performance hotspot, responsible for operator dispatching. - ACL_THREAD = 1, // 2rd performance hotspot in PTA, responsible for handling the task queue. - RELEASE_THREAD = 2, // Thread responsible for resource release. - WATCHDOG_THREAD = 3, // Thread responsible for HCCL communication monitoring. - OTHER_THREAD = 4, // Mostly refers to threads in PyTorch's motorized sleep thread pool, which - // are not considered in PTA. - USER_THREAD = 5, // Thread responsible for user. -}; - -void GetThreadAffinityInfo(); -void SetThreadType(ThreadType type); -void SetThreadAffinity(c10::DeviceIndex device); -void SetThreadAffinity(int core_start, int core_end); - -} // namespace c10_npu \ No newline at end of file + typedef unsigned int coreId; + + struct coreIdRange { + coreId start; + coreId end; + }; + + enum ThreadType { + unknownThread = 0, // Mostly refers to threads in PyTorch's motorized sleep thread pool, which are not considered in PTA. + mainThread = 1, // 1st performance hotspot, responsible for operator dispatching during the forward phase. + backwardThread = 2, // 2nd performance hotspot, responsible for operator dispatching during the backward phase. + aclThread = 3, // 3rd performance hotspot in PTA, responsible for handling the task queue. + releaseThread = 4, // Thread responsible for resource release. + hcclCommWatchdogThread = 5 // Thread responsible for HCCL communication monitoring. + }; + + aclError SetThreadAffinity(c10::DeviceIndex device); + aclError SetThreadAffinity(c10::DeviceIndex device, ThreadType current_thread_type); + void SetThreadName(ThreadType type); + + // The main thread of PTA, which is also the main thread of PyTorch, handles multiple phases of tasks + // (e.g., first parallel checkpoint data loading, then transitioning to forward training). + // Each phase may require different thread affinity settings. Therefore, we record the thread's TID + // to adjust its affinity later as needed. 
+ void GetAffinityInfo(); + + // Set backwardThread Name Once + void SetBackwardThreadName(c10::DeviceIndex device_id); + +} \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUFunctions.cpp b/torch_npu/csrc/core/npu/NPUFunctions.cpp index 91b301e043..1ba90030fb 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.cpp +++ b/torch_npu/csrc/core/npu/NPUFunctions.cpp @@ -3,7 +3,6 @@ #include #include "torch_npu/csrc/core/npu/NPUFunctions.h" #include "torch_npu/csrc/core/npu/NPUStream.h" -#include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "torch_npu/csrc/core/npu/register/OptionsManager.h" #ifndef BUILD_LIBTORCH #include "torch_npu/csrc/sanitizer/NPUTrace.h" @@ -103,7 +102,6 @@ aclError SetDevice(c10::DeviceIndex device) if (local_device == device) { return ACL_ERROR_NONE; } - c10_npu::SetThreadAffinity(device); aclError err = aclrtSetDevice(device); if (err == ACL_ERROR_NONE) { diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index 8d8a8da6a6..231e9a2244 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -706,7 +706,9 @@ bool Repository::CheckInit() const void StartConsume(Repository *repo, c10::DeviceIndex device_id) { - SetThreadType(ThreadType::ACL_THREAD); + SetThreadName(ThreadType::aclThread); + SetThreadAffinity(device_id); + aclError ret = c10_npu::SetDevice(device_id); if (ret != 0) { C10_NPU_SHOW_ERR_MSG(); @@ -818,7 +820,7 @@ void ReleaseQueue::PopFromReleaseQueue() void StartRelease(ReleaseQueue *releaseQue) { - SetThreadType(ThreadType::RELEASE_THREAD); + SetThreadName(ThreadType::releaseThread); SetThreadAffinity(releaseQue->GetDeviceID()); while (releaseQue->GetStatus() != RepoStatus::CAN_EXIT) { diff --git a/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp b/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp index 961022b302..6ce7a03e03 100644 --- a/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp +++ b/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp @@ -52,6 +52,7 @@ void NPUGuardImpl::setDevice(c10::Device d) const void NPUGuardImpl::uncheckedSetDevice(c10::Device d) const noexcept { + SetBackwardThreadName(d.index()); NPU_CHECK_WARN(c10_npu::SetDevice(d.index())); } diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index b3bf8185d4..1870de034c 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -190,6 +190,10 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) lazy_fn_.clear(); + GetAffinityInfo(); + + SetThreadAffinity(device_id_); + init_flag_ = true; ASCEND_LOGD("Npu sys ctrl initialize successfully."); diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 74df9e25bf..f8f3047927 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -1591,8 +1591,9 @@ void ProcessGroupHCCL::heartbeatMonitor() void ProcessGroupHCCL::hcclCommWatchdog() { - c10_npu::SetThreadType(c10_npu::ThreadType::WATCHDOG_THREAD); try { + c10_npu::SetThreadName(c10_npu::ThreadType::hcclCommWatchdogThread); + VLOG(2) << "[Rank " << rank_ << "] HCCL watchdog thread started!"; if (monitorThreadEnabled_.load()) { hcclHeartbeatMonitorThread_ = std::thread(&ProcessGroupHCCL::heartbeatMonitor, this); @@ -1718,6 +1719,7 @@ void ProcessGroupHCCL::workCleanupLoop() try { if (needSetDevice) { c10::DeviceIndex device = 
static_cast(work.devices_[0].index());
+        c10_npu::SetThreadAffinity(device);
         NPU_CHECK_ERROR(c10_npu::SetDevice(device));
         deviceId_ = static_cast(work.devices_[0].index());
         needSetDevice = false;
diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp
index f3563d1bdf..8bf1dab99a 100644
--- a/torch_npu/csrc/npu/Module.cpp
+++ b/torch_npu/csrc/npu/Module.cpp
@@ -1568,32 +1568,24 @@ PyObject* THNPModule_npu_get_silent_check_version(PyObject* self, PyObject* noar
     END_HANDLE_TH_ERRORS
 }
 
-PyObject* THNPModule_npu_get_thread_affinity(PyObject* self, PyObject* noargs)
+PyObject* THNPModule_npu_set_thread_affinity(PyObject* self, PyObject* noargs)
 {
     HANDLE_TH_ERRORS
-    c10_npu::GetThreadAffinityInfo();
+    int device_index;
+    NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::GetDevice(&device_index));
+    c10::DeviceIndex device = static_cast(device_index);
+    c10_npu::SetThreadAffinity(device, c10_npu::ThreadType::mainThread);
     Py_RETURN_NONE;
     END_HANDLE_TH_ERRORS
 }
 
-PyObject* THNPModule_npu_set_thread_affinity(PyObject* self, PyObject* args)
+PyObject* THNPModule_npu_reset_thread_affinity(PyObject* self, PyObject* noargs)
 {
     HANDLE_TH_ERRORS
-    int core_start;
-    int core_end;
-    if (!PyArg_ParseTuple(args, "ii", &core_start, &core_end)) {
-        throw torch::TypeError("Pybind failed to parse parameters." + PTA_ERROR(ErrCode::TYPE));
-    }
-
-    if (core_start == -1) {
-        int device_index;
-        NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::GetDevice(&device_index));
-        c10::DeviceIndex device = static_cast(device_index);
-        c10_npu::SetThreadType(c10_npu::ThreadType::OTHER_THREAD);
-        c10_npu::SetThreadAffinity(device);
-    } else {
-        c10_npu::SetThreadAffinity(core_start, core_end);
-    }
+    int device_index;
+    NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::GetDevice(&device_index));
+    c10::DeviceIndex device = static_cast(device_index);
+    c10_npu::SetThreadAffinity(device, c10_npu::ThreadType::unknownThread);
     Py_RETURN_NONE;
     END_HANDLE_TH_ERRORS
 }
@@ -1721,8 +1713,8 @@ static struct PyMethodDef THNPModule_methods[] = {
     {"_npu_set_call_state", (PyCFunction)THNPModule_npu_set_call_state, METH_O, nullptr},
     {"_npu_set_module_train_state", (PyCFunction)THNPModule_npu_set_module_train_state, METH_O, nullptr},
     {"_get_silent_check_version", (PyCFunction)THNPModule_npu_get_silent_check_version, METH_NOARGS, nullptr},
-    {"_npu_get_thread_affinity", (PyCFunction)THNPModule_npu_get_thread_affinity, METH_NOARGS, nullptr},
-    {"_npu_set_thread_affinity", (PyCFunction)THNPModule_npu_set_thread_affinity, METH_VARARGS, nullptr},
+    {"_npu_set_threads_affinity", (PyCFunction)THNPModule_npu_set_thread_affinity, METH_NOARGS, nullptr},
+    {"_npu_reset_threads_affinity", (PyCFunction)THNPModule_npu_reset_thread_affinity, METH_NOARGS, nullptr},
     {"_npu_set_fft_plan_cache_max_size", (PyCFunction)THNPModule_npu_set_fft_plan_cache_max_size, METH_VARARGS, nullptr},
     {"_npu_get_fft_plan_cache_max_size", (PyCFunction)THNPModule_npu_get_fft_plan_cache_max_size, METH_NOARGS, nullptr},
     {"_npu_get_fft_plan_cache_size", (PyCFunction)THNPModule_npu_get_fft_plan_cache_size, METH_NOARGS, nullptr},
diff --git a/torch_npu/utils/__init__.py b/torch_npu/utils/__init__.py
index 29adb686a3..8c33a26e4b 100644
--- a/torch_npu/utils/__init__.py
+++ b/torch_npu/utils/__init__.py
@@ -1,6 +1,3 @@
-__all__ = ["save_async", "npu_combine_tensors", "get_part_combined_tensor", "is_combined_tensor_valid", "FlopsCounter",
-           "set_thread_affinity"]
-
 from torch_npu import _C
 from ._module import _apply_module_patch
 from .tensor_methods import _add_tensor_methods
@@ -18,7 +15,8 @@ from .utils import _print_error_log, _print_warn_log, _print_info_log, _should_p
 from .clip_grad_norm_ import _apply_clip_grad_norm_patch
 from ._step import add_perf_dump_patch
 from .flops_count import _FlopsCounter as FlopsCounter
-from .affinity import _set_thread_affinity as set_thread_affinity
+
+__all__ = ["save_async", "npu_combine_tensors", "get_part_combined_tensor", "is_combined_tensor_valid", "FlopsCounter"]
 
 
 # init flopcount
diff --git a/torch_npu/utils/_module.py b/torch_npu/utils/_module.py
index fbe408739f..1e575cafa3 100644
--- a/torch_npu/utils/_module.py
+++ b/torch_npu/utils/_module.py
@@ -17,7 +17,6 @@ from torch.nn.modules.batchnorm import _NormBase, _LazyNormBase
 from torch.nn.modules.module import Module
 from torch.nn.parallel._functions import _streams
 from torch.utils.data.dataloader import _MultiProcessingDataLoaderIter
-from torch.utils.data._utils import worker, pin_memory
 from torch._utils import _get_device_index, _get_all_device_indices, _get_available_device_type, ExceptionWrapper
 from torch.nn.parallel.parallel_apply import get_a_var
 from torch.nn.parallel.scatter_gather import gather, scatter_kwargs
@@ -30,8 +29,6 @@ from torch_npu.utils.syncbatchnorm import SyncBatchNorm as sync_batch_norm
 from torch_npu.utils._error_code import ErrCode, pta_error
 
 origin_mpdl_iter_init = _MultiProcessingDataLoaderIter.__init__
-origin_worker_loop = worker._worker_loop
-origin_pin_memory_loop = pin_memory._pin_memory_loop
 
 
 CONV3D_SUPPORT_FP32_SOC_PREFIX = ["Ascend910B", "Ascend910_93"]
@@ -370,17 +367,9 @@ def _mpdl_iter_init(self, *args, **kwargs):
             torch_npu.npu.synchronize()
         except Exception as e:
             print(e)
+    torch_npu._C._npu_reset_threads_affinity()
     origin_mpdl_iter_init(self, *args, **kwargs)
-
-
-def _npu_worker_loop(*args, **kwargs):
-    torch_npu._C._npu_set_thread_affinity(-1, -1)
-    origin_worker_loop(*args, **kwargs)
-
-
-def _npu_pin_memory_loop(*args, **kwargs):
-    torch_npu._C._npu_set_thread_affinity(-1, -1)
-    origin_pin_memory_loop(*args, **kwargs)
+    torch_npu._C._npu_set_threads_affinity()
 
 
 def _parallel_apply(
@@ -530,8 +519,6 @@ def _apply_module_patch():
     torch.nn.Module.cast_weight = cast_weight
     torch.nn.modules.rnn.LSTM.forward = _lstm_forward
    torch.nn.modules.batchnorm.SyncBatchNorm.forward = _syncbn_forward
+    torch.utils.data.dataloader._MultiProcessingDataLoaderIter.__init__ = _mpdl_iter_init
     torch.nn.parallel.DataParallel.parallel_apply = npu_parallel_apply
     torch.nn.parallel.data_parallel = npu_data_parallel
-    torch.utils.data.dataloader._MultiProcessingDataLoaderIter.__init__ = _mpdl_iter_init
-    torch.utils.data._utils.worker._worker_loop = _npu_worker_loop
-    torch.utils.data._utils.pin_memory._pin_memory_loop = _npu_pin_memory_loop
diff --git a/torch_npu/utils/affinity.py b/torch_npu/utils/affinity.py
deleted file mode 100644
index bfa16f45dd..0000000000
--- a/torch_npu/utils/affinity.py
+++ /dev/null
@@ -1,17 +0,0 @@
-__all__ = []
-
-from typing import List
-
-import torch_npu
-from torch_npu.utils._error_code import ErrCode, pta_error
-
-
-def _set_thread_affinity(core_range: List[int] = None):
-    if core_range is None:
-        torch_npu._C._npu_set_thread_affinity(-1, -1)
-    elif (len(core_range) == 2):
-        if core_range[0] < 0 or core_range[1] < 0:
-            raise ValueError("Expected core core_range should be nonnegative." + pta_error(ErrCode.PARAM))
-        torch_npu._C._npu_set_thread_affinity(core_range[0], core_range[1])
-    else:
-        raise ValueError("The length of input list of set_thread_affinity should be 2." + pta_error(ErrCode.PARAM))
\ No newline at end of file
-- 
Gitee


From aff043b6bd00b2807eb309f81fa79e34f9874a29 Mon Sep 17 00:00:00 2001
From: chen_liqing
Date: Tue, 12 Aug 2025 14:53:41 +0800
Subject: [PATCH 6/7] Delete dcmi info

---
 third_party/dcmi/CMakeLists.txt               |   1 -
 third_party/dcmi/inc/dcmi_interface_api.h     |  26 ----
 .../csrc/core/npu/GetAffinityCPUInfo.cpp      | 115 ------------------
 torch_npu/csrc/core/npu/GetAffinityCPUInfo.h  |  10 --
 .../csrc/core/npu/interface/DcmiInterface.cpp |  72 -----------
 .../csrc/core/npu/interface/DcmiInterface.h   |  15 ---
 6 files changed, 239 deletions(-)
 delete mode 100644 third_party/dcmi/CMakeLists.txt
 delete mode 100644 third_party/dcmi/inc/dcmi_interface_api.h
 delete mode 100644 torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp
 delete mode 100644 torch_npu/csrc/core/npu/GetAffinityCPUInfo.h
 delete mode 100644 torch_npu/csrc/core/npu/interface/DcmiInterface.cpp
 delete mode 100644 torch_npu/csrc/core/npu/interface/DcmiInterface.h

diff --git a/third_party/dcmi/CMakeLists.txt b/third_party/dcmi/CMakeLists.txt
deleted file mode 100644
index 5a5b6ee02b..0000000000
--- a/third_party/dcmi/CMakeLists.txt
+++ /dev/null
@@ -1 +0,0 @@
-INSTALL(DIRECTORY inc/ DESTINATION include/third_party/dcmi/inc FILES_MATCHING PATTERN "*.h")
\ No newline at end of file
diff --git a/third_party/dcmi/inc/dcmi_interface_api.h b/third_party/dcmi/inc/dcmi_interface_api.h
deleted file mode 100644
index a55402f30d..0000000000
--- a/third_party/dcmi/inc/dcmi_interface_api.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright: Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
- * Author: huawei
- * Date: 2021-03-17 17:46:08
- * @LastEditors: huawei
- * @LastEditTime: 2022-11-03 11:17:04
- * Description: DCMI API Reference
- */
-
-/***************************************************************************************/
-
-#ifdef __linux
-#define DCMIDLLEXPORT
-#else
-#define DCMIDLLEXPORT _declspec(dllexport)
-#endif
-
-#define TOPO_INFO_MAX_LENTH 32 // topo info max length
-
-DCMIDLLEXPORT int dcmi_init(void);
-
-DCMIDLLEXPORT int dcmi_get_card_num_list(int *card_num, int *card_list, int list_len); // card_num is the number of device.
-
-DCMIDLLEXPORT int dcmi_get_affinity_cpu_info_by_device_id(int card_id, int device_id, char *affinity_cpu, int *length); // card_id is the ID of NPU card.
-
-DCMIDLLEXPORT int dcmi_get_device_id_in_card(int card_id, int *device_id_max, int *mcu_id, int *cpu_id);
diff --git a/torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp b/torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp
deleted file mode 100644
index a88e327560..0000000000
--- a/torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp
+++ /dev/null
@@ -1,115 +0,0 @@
-#include
-#include "torch_npu/csrc/core/npu/interface/DcmiInterface.h"
-#include "torch_npu/csrc/core/npu/NPUException.h"
-#include "torch_npu/csrc/core/npu/NPUAffinityController.h"
-
-constexpr int NPU_OK = 0;
-
-static int DcmiInit()
-{
-    int ret = c10_npu::dcmi::DcmiInit();
-    if (ret != NPU_OK) {
-        TORCH_CHECK(false, "Failed to init dcmi. ", PTA_ERROR(ErrCode::INTERNAL));
-    }
-    return ret;
-}
-
-std::string GetAffinityCPUBaseInfo(int card_id)
-{
-    int ret = DcmiInit();
-    int device_id = 0;
-    int device_id_max = 0;
-    int mcu_id = 0;
-    int cpu_id = 0;
-    ret = c10_npu::dcmi::DcmiGetDeviceIdInCard(card_id, &device_id_max, &mcu_id, &cpu_id);
-    if (ret != NPU_OK) {
-        TORCH_NPU_WARN_ONCE("dcmi_get_device_id_in_card is not supported. "
-            "The npu_affine configuration of CPU_AFFINITY_CONF will be disabled.");
-        return "";
-    }
-    device_id = std::max(0, device_id_max - 1);
-    char affinity_cpu[TOPO_INFO_MAX_LENTH] = {0};
-    int length = 0;
-    ret = c10_npu::dcmi::DcmiGetAffinityCpuInfoByDeviceId(card_id, device_id, affinity_cpu, &length);
-    if (ret == NPU_OK) {
-        return affinity_cpu;
-    }
-    TORCH_NPU_WARN_ONCE("dcmi_get_affinity_cpu_info_by_device_id is not supported. "
-        "The npu_affine configuration of CPU_AFFINITY_CONF will be disabled.");
-    return "";
-}
-
-std::unordered_map CardIdAffinityCPU;
-
-c10_npu::CoreIdRange parseAffinityCPU(const std::string cpuString)
-{
-    size_t pos = cpuString.find("-");
-    if (pos != std::string::npos) {
-        std::string start = cpuString.substr(0, pos);
-        std::string end = cpuString.substr(pos + 1);
-        int startNum = stoi(start);
-        int endNum = stoi(end);
-        if (startNum < endNum) {
-            return c10_npu::CoreIdRange{startNum, endNum};
-        }
-    }
-    TORCH_CHECK(false, "affinity cpu " + cpuString + " is error ", PTA_ERROR(ErrCode::VALUE));
-}
-
-void GetExclusiveAffinityCPU()
-{
-    int ret = DcmiInit();
-    int device_count = 0;
-    int card_id_list[16];
-    int list_len = 16;
-    ret = c10_npu::dcmi::DcmiGetCardNumList(&device_count, card_id_list, list_len);
-    std::unordered_map SameAffinityCpuNum;
-    std::map CardIdAffinityCpuDefault;
-    for (int i = 0; i < device_count; i++) {
-        std::string affinity_cpu = GetAffinityCPUBaseInfo(i);
-        if (affinity_cpu.empty()) {
-            return;
-        }
-        CardIdAffinityCpuDefault[i] = affinity_cpu;
-        auto it = SameAffinityCpuNum.find(affinity_cpu);
-        if (it != SameAffinityCpuNum.end()) {
-            SameAffinityCpuNum[affinity_cpu] = it->second + 1;
-        } else {
-            SameAffinityCpuNum[affinity_cpu] = 1;
-        }
-    }
-    std::unordered_map offsetMap;
-    for (const auto& it : CardIdAffinityCpuDefault) {
-        int card_id = it.first;
-        std::string affinity_cpu = it.second;
-        int same_num = 1;
-        auto find_same_affinity_cpu = SameAffinityCpuNum.find(affinity_cpu);
-        if (find_same_affinity_cpu != SameAffinityCpuNum.end()) {
-            same_num = find_same_affinity_cpu->second;
-        }
-        int offset = 0;
-        auto find_offset = offsetMap.find(affinity_cpu);
-        if (find_offset != offsetMap.end()) {
-            offset = find_offset->second;
-        }
-        c10_npu::CoreIdRange cpu_range = parseAffinityCPU(affinity_cpu);
-        unsigned int length = (cpu_range.end - cpu_range.start + 1) / static_cast(same_num);
-        c10_npu::CoreIdRange exclusiveAffinityCpu = {
-            cpu_range.start + static_cast(offset) * length,
-            (cpu_range.start + length - 1) + static_cast(offset) * length};
-        offsetMap[affinity_cpu] = offset + 1;
-        CardIdAffinityCPU[card_id] = exclusiveAffinityCpu;
-    }
-}
-
-c10_npu::CoreIdRange GetAssignAffinityCPU(int card_id)
-{
-    GetExclusiveAffinityCPU();
-    if (CardIdAffinityCPU.empty()) {
-        return {0, 0};
-    }
-    auto it = CardIdAffinityCPU.find(card_id);
-    if (it != CardIdAffinityCPU.end()) {
-        return it->second;
-    }
-}
diff --git a/torch_npu/csrc/core/npu/GetAffinityCPUInfo.h b/torch_npu/csrc/core/npu/GetAffinityCPUInfo.h
deleted file mode 100644
index a0ce50a201..0000000000
--- a/torch_npu/csrc/core/npu/GetAffinityCPUInfo.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef THNP_GETAFFINITY_INC
-#define THNP_GETAFFINITY_INC
-#include
-
-std::string GetAffinityCPUBaseInfo(int card_id);
-c10_npu::CoreIdRange parseAffinityCPU(const std::string cpuString);
-void GetExclusiveAffinityCPU();
-c10_npu::CoreIdRange GetAssignAffinityCPU(int card_id);
-
-#endif
\ No newline at end of file
diff --git a/torch_npu/csrc/core/npu/interface/DcmiInterface.cpp b/torch_npu/csrc/core/npu/interface/DcmiInterface.cpp
deleted file mode 100644
index ab66dae274..0000000000
--- a/torch_npu/csrc/core/npu/interface/DcmiInterface.cpp
+++ /dev/null
@@ -1,72 +0,0 @@
-#include
-#include "torch_npu/csrc/core/npu/register/FunctionLoader.h"
-#include "torch_npu/csrc/core/npu/NPUException.h"
-#include "torch_npu/csrc/core/npu/interface/DcmiInterface.h"
-
-namespace c10_npu {
-namespace dcmi {
-
-#undef LOAD_FUNCTION
-#define LOAD_FUNCTION(funcName) \
-    REGISTER_FUNCTION(libdcmi, funcName)
-#undef GET_FUNC
-#define GET_FUNC(funcName) \
-    GET_FUNCTION(libdcmi, funcName)
-
-REGISTER_LIBRARY(libdcmi)
-LOAD_FUNCTION(dcmi_get_affinity_cpu_info_by_device_id)
-LOAD_FUNCTION(dcmi_init)
-LOAD_FUNCTION(dcmi_get_device_id_in_card)
-LOAD_FUNCTION(dcmi_get_card_num_list)
-
-int DcmiInit(void)
-{
-    using dcmiInitFunc = int(*)(void);
-    static dcmiInitFunc func = nullptr;
-    func = (dcmiInitFunc)GET_FUNC(dcmi_init);
-    if (func == nullptr) {
-        TORCH_CHECK(false, "Failed to find function dcmi_init, "
-            " maybe your hdk version is too low, please upgrade it.", PTA_ERROR(ErrCode::NOT_FOUND))
-    }
-    return func();
-}
-
-int DcmiGetCardNumList(int *card_num, int *card_list, int list_len)
-{
-    using dcmiGetCardNumListFunc = int(*)(int *, int *, int);
-    static dcmiGetCardNumListFunc func = nullptr;
-    func = (dcmiGetCardNumListFunc)GET_FUNC(dcmi_get_card_num_list);
-    if (func == nullptr) {
-        TORCH_CHECK(false, "Failed to find function dcmi_get_card_num_list, "
-            " maybe your hdk version is too low, please upgrade it.", PTA_ERROR(ErrCode::NOT_FOUND))
-    }
-    return func(card_num, card_list, list_len);
-}
-
-int DcmiGetAffinityCpuInfoByDeviceId(int card_id, int device_id, char *affinity_cpu, int *length)
-{
-    using dcmiGetAffinityCpuInfoByDeviceIdFunc = int(*)(int, int, char *, int *);
-    static dcmiGetAffinityCpuInfoByDeviceIdFunc func = nullptr;
-    func = (dcmiGetAffinityCpuInfoByDeviceIdFunc)GET_FUNC(dcmi_get_affinity_cpu_info_by_device_id);
-    if (func == nullptr) {
-        TORCH_CHECK(false, "Failed to find function dcmi_get_affinity_cpu_info_by_device_id, "
-            " maybe your hdk version is too low, please upgrade it", PTA_ERROR(ErrCode::NOT_FOUND));
-    }
-    return func(card_id, device_id, affinity_cpu, length);
-}
-
-int DcmiGetDeviceIdInCard(int card_id, int *device_id_max, int *mcu_id, int *cpu_id)
-{
-    using dcmiGetDeviceIdInCardFunc = int(*)(int, int *, int *, int *);
-    static dcmiGetDeviceIdInCardFunc func = nullptr;
-    func = (dcmiGetDeviceIdInCardFunc)GET_FUNC(dcmi_get_device_id_in_card);
-    if (func == nullptr) {
-        TORCH_CHECK(false, "Failed to find function dcmi_get_device_id_in_card, "
-            " maybe your hdk version is too low, please upgrade it", PTA_ERROR(ErrCode::NOT_FOUND))
-    }
-    return func(card_id, device_id_max, mcu_id, cpu_id);
-}
-
-}
-
-}
\ No newline at end of file
diff --git a/torch_npu/csrc/core/npu/interface/DcmiInterface.h b/torch_npu/csrc/core/npu/interface/DcmiInterface.h
deleted file mode 100644
index 0388e32d32..0000000000
--- a/torch_npu/csrc/core/npu/interface/DcmiInterface.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#pragma once
-
-#include "third_party/dcmi/inc/dcmi_interface_api.h"
-
-namespace c10_npu {
-namespace dcmi {
-
-int DcmiInit(void);
-int DcmiGetCardNumList(int *card_num, int *card_list, int list_len);
-int DcmiGetAffinityCpuInfoByDeviceId(int card_id, int device_id, char *affinity_cpu, int *length);
-int DcmiGetDeviceIdInCard(int card_id, int *device_id_max, int *mcu_id, int *cpu_id);
-
-}
-
-}
\ No newline at end of file
-- 
Gitee


From d8b28eb8aa478d82aff3a65e9bbfcb970db57dfe Mon Sep 17 00:00:00 2001
From: chen_liqing
Date: Tue, 12 Aug 2025 15:15:12 +0800
Subject: [PATCH 7/7] Add debug info

---
 torch_npu/csrc/core/npu/NPUAffinityController.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.cpp b/torch_npu/csrc/core/npu/NPUAffinityController.cpp
index cb7d7a928e..d1a79ae2dc 100644
--- a/torch_npu/csrc/core/npu/NPUAffinityController.cpp
+++ b/torch_npu/csrc/core/npu/NPUAffinityController.cpp
@@ -257,6 +257,8 @@ namespace c10_npu {
 
     // bind_conf=1, bind cores averagely based on device_id
     if (bind_conf == 1) {
+        ASCEND_LOGD("Device %d set %s affinity to %d-%d success.",
+            device_id, threadTypeToNameMap.at(current_thread_type).c_str(), ranges[device_id].start, ranges[device_id].end);
         return SetThreadAffinity(ranges[device_id], pthread_self());
    } else if (bind_conf == 2) {
         auto thread_core_map = GetCpuAffinityMap(device_id);
-- 
Gitee
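
Note on verifying the debug output added in PATCH 7/7: the new ASCEND_LOGD line only reports the core range the thread was asked to bind to; the effective mask can be cross-checked from inside the process with the standard glibc affinity API. The snippet below is a minimal standalone sketch, not part of this patch set, and assumes a Linux/glibc build where pthread_getaffinity_np is available. It prints the first and last CPU currently allowed for the calling thread, which for a contiguous binding should match the "start-end" range in the log.

    // Standalone sketch for cross-checking thread affinity (not part of the patches).
    // Build with: g++ -pthread check_affinity.cpp -o check_affinity
    #ifndef _GNU_SOURCE
    #define _GNU_SOURCE
    #endif
    #include <pthread.h>
    #include <sched.h>
    #include <cstdio>

    int main()
    {
        cpu_set_t mask;
        CPU_ZERO(&mask);
        // Query the affinity mask of the calling thread.
        if (pthread_getaffinity_np(pthread_self(), sizeof(mask), &mask) != 0) {
            std::fprintf(stderr, "pthread_getaffinity_np failed\n");
            return 1;
        }
        int first = -1;
        int last = -1;
        for (int cpu = 0; cpu < CPU_SETSIZE; ++cpu) {
            if (CPU_ISSET(cpu, &mask)) {
                if (first < 0) {
                    first = cpu;
                }
                last = cpu;
            }
        }
        // For a contiguous binding this matches the range printed by ASCEND_LOGD.
        std::printf("calling thread is allowed on CPUs %d-%d\n", first, last);
        return 0;
    }

The same loop can be dropped into a worker or ACL callback thread to compare the live mask against what the log line claims after SetThreadAffinity runs.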