From 5f3c879bedf8cbc37a227ed1fd08ae26557193ae Mon Sep 17 00:00:00 2001 From: chen_liqing Date: Tue, 12 Aug 2025 14:39:32 +0800 Subject: [PATCH 1/7] Revert "!23019 fix bind conf" This reverts commit 26d11697e183925e35c779557f4c9306674f14a9. --- torch_npu/csrc/core/npu/NPUAffinityController.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.cpp b/torch_npu/csrc/core/npu/NPUAffinityController.cpp index 6c2d35fd95..5567c3e6e2 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.cpp +++ b/torch_npu/csrc/core/npu/NPUAffinityController.cpp @@ -9,7 +9,6 @@ #include #include #include -#include namespace c10_npu { @@ -17,7 +16,6 @@ static thread_local ThreadType local_thread = ThreadType::MAIN_THREAD; static pthread_t main_thread; static bool start_main_thread_bind = false; -static std::mutex core_map_mutex; using ThreadCoreMap = std::unordered_map; @@ -266,7 +264,6 @@ CoreIdRange getCoreRange(c10::DeviceIndex device_id, ThreadType type) if (cpu_affinity_mode == 0 || cpu_affinity_mode == 1) { core_range = device_ranges[device_id]; } else { - std::lock_guard lock(core_map_mutex); if (device_thread_core_maps.find(device_id) == device_thread_core_maps.end()) { device_thread_core_maps.emplace(device_id, getCpuAffinityMap(device_id, device_ranges)); } -- Gitee From 338a818af23ca2f65cb488dba134176ba866c23a Mon Sep 17 00:00:00 2001 From: chen_liqing Date: Tue, 12 Aug 2025 14:40:56 +0800 Subject: [PATCH 2/7] Revert "!22376 Set affinity optimization" This reverts commit 60f04f7fb3e065bdb09e305b550ab1a66c288b6f. --- test/torch_npu_schema.json | 3 - .../csrc/core/npu/NPUAffinityController.cpp | 124 +++++------------- .../csrc/core/npu/NPUAffinityController.h | 5 +- torch_npu/csrc/core/npu/NPUFunctions.cpp | 5 +- torch_npu/csrc/core/npu/NPUQueue.cpp | 2 - torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp | 1 - .../csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 2 - .../csrc/distributed/ProcessGroupHCCL.cpp | 1 - torch_npu/csrc/framework/LazyInitAclops.cpp | 9 ++ .../csrc/framework/interface/EnvVariables.cpp | 10 ++ torch_npu/csrc/npu/Module.cpp | 10 -- torch_npu/utils/__init__.py | 3 +- torch_npu/utils/_module.py | 16 ++- torch_npu/utils/affinity.py | 6 +- 14 files changed, 72 insertions(+), 125 deletions(-) diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json index 133cf492d7..a273a3d9a6 100644 --- a/test/torch_npu_schema.json +++ b/test/torch_npu_schema.json @@ -2789,9 +2789,6 @@ "torch_npu.utils.set_thread_affinity": { "signature": "(core_range: List[int] = None)" }, - "torch_npu.utils.reset_thread_affinity": { - "signature": "()" - }, "torch_npu.dynamo.torchair.scope.npu_stream_switch": { "signature": "(stream_tag: str, stream_priority: int = 0)" }, diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.cpp b/torch_npu/csrc/core/npu/NPUAffinityController.cpp index 5567c3e6e2..a331439d9f 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.cpp +++ b/torch_npu/csrc/core/npu/NPUAffinityController.cpp @@ -14,9 +14,6 @@ namespace c10_npu { static thread_local ThreadType local_thread = ThreadType::MAIN_THREAD; -static pthread_t main_thread; -static bool start_main_thread_bind = false; - using ThreadCoreMap = std::unordered_map; static uint32_t cpu_affinity_mode; @@ -31,7 +28,8 @@ const std::unordered_map threadTypeToNameMap = { {ACL_THREAD, "acl_thread"}, {RELEASE_THREAD, "release_thread"}, {WATCHDOG_THREAD, "hccl_watchdog_t"}, - {OTHER_THREAD, "other_thread"}}; + {OTHER_THREAD, "other_thread"}, + {USER_THREAD, "user_thread"}}; CoreIdRange 
getCPUDefaultRange(c10::DeviceIndex device_id) { @@ -149,7 +147,7 @@ void printCoreRanges(const uint32_t mode, const std::vector &ranges oss << "Mode: " << mode << ". Core range for each device ID: "; for (size_t i = 0; i < ranges.size(); ++i) { - oss << "Device " << i << ": [" << ranges[i].start << ", " << ranges[i].end << "]"; + oss << "Device " << i << ": [" << ranges[i].start << "," << ranges[i].end << "]"; if (i != ranges.size() - 1) { oss << "; "; } else { @@ -196,18 +194,18 @@ void SetThreadType(ThreadType type) return; } if (prctl(PR_SET_NAME, threadTypeToNameMap.at(type).c_str()) != 0) { - ASCEND_LOGW("Set thread name to %s failed!", threadTypeToNameMap.at(type).c_str()); + ASCEND_LOGW("Set thread name of %s failed!", threadTypeToNameMap.at(type).c_str()); } } std::string getAffinityMapAsString(c10::DeviceIndex device_id, const ThreadCoreMap &threadCoreMap) { std::ostringstream oss; - for (auto thread_type : threadTypeList) { - oss << threadTypeToNameMap.at(thread_type) << ": [" - << threadCoreMap.at(thread_type).start << ", " - << threadCoreMap.at(thread_type).end << "]"; - if (thread_type != OTHER_THREAD) { + for (auto local_thread : threadTypeList) { + oss << threadTypeToNameMap.at(local_thread) << " : [" + << threadCoreMap.at(local_thread).start << "," + << threadCoreMap.at(local_thread).end << "]"; + if (local_thread != OTHER_THREAD) { oss << "; "; } else { oss << "."; @@ -224,16 +222,16 @@ ThreadCoreMap getCpuAffinityMap(c10::DeviceIndex device_id, const std::vector(device_index); - local_thread = type; - if (local_thread == ThreadType::MAIN_THREAD) { - start_main_thread_bind = true; - } + SetThreadType(type); SetThreadAffinity(device); } @@ -308,55 +289,20 @@ void SetThreadAffinity(int core_start, int core_end) if (!needToSetThreadAffinity()) { return; } - static int core_nums = sysconf(_SC_NPROCESSORS_ONLN); - CoreIdRange core_range; - core_range.start = static_cast(std::min(core_start, core_nums)); - core_range.end = static_cast(std::min(core_end, core_nums)); + core_start = std::min(core_start, core_nums); + core_end = std::min(core_end, core_nums); local_thread = ThreadType::USER_THREAD; - if (setThreadAffinityImpl(pthread_self(), core_range)) { - ASCEND_LOGD("Set thread affinity to user-defined range %d-%d success.", core_range.start, core_range.end); - } else { - ASCEND_LOGE("Set thread affinity to user-defined range %d-%d failed.", core_range.start, core_range.end); - } -} - -void SetMainThread() -{ - main_thread = pthread_self(); -} - -bool NeedMainThreadBind() -{ - return start_main_thread_bind && (local_thread == ThreadType::MAIN_THREAD); -} - -void StartMainThreadBind(c10::DeviceIndex device_id) -{ - if (!needToSetThreadAffinity() || local_thread == ThreadType::USER_THREAD) { - return; + cpu_set_t mask; + CPU_ZERO(&mask); + for (auto i = core_start; i <= core_end; i++) { + CPU_SET(i, &mask); } - - static thread_local bool seted = false; - if (!seted) { - seted = true; - if (syscall(SYS_gettid) != getpid()) { - start_main_thread_bind = true; - - SetThreadAffinity(device_id); - - CoreIdRange core_range = getCoreRange(device_id, ThreadType::MAIN_THREAD); - if (setThreadAffinityImpl(main_thread, core_range)) { - ASCEND_LOGD("Device %d set %s affinity to %d-%d success.", - device_id, threadTypeToNameMap.at(ThreadType::MAIN_THREAD).c_str(), - core_range.start, core_range.end); - } else { - ASCEND_LOGE("Device %d set %s affinity to %d-%d failed.", - device_id, threadTypeToNameMap.at(ThreadType::MAIN_THREAD).c_str(), - core_range.start, core_range.end); - } - } + if 
(!pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask)) { + ASCEND_LOGD("Set %s affinity to %d-%d success.", threadTypeToNameMap.at(local_thread).c_str(), core_start, core_end); + } else { + ASCEND_LOGE("Set %s affinity to %d-%d failed.", threadTypeToNameMap.at(local_thread).c_str(), core_start, core_end); } } diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.h b/torch_npu/csrc/core/npu/NPUAffinityController.h index e850a47b67..0ec3c4d995 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.h +++ b/torch_npu/csrc/core/npu/NPUAffinityController.h @@ -20,12 +20,9 @@ enum ThreadType { }; void SetThreadType(ThreadType type); + void SetThreadAffinity(c10::DeviceIndex device); void SetThreadAffinity(ThreadType type); void SetThreadAffinity(int core_start, int core_end); -void SetMainThread(); -bool NeedMainThreadBind(); -void StartMainThreadBind(c10::DeviceIndex device_id); - } // namespace c10_npu \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUFunctions.cpp b/torch_npu/csrc/core/npu/NPUFunctions.cpp index 3d146783f0..91b301e043 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.cpp +++ b/torch_npu/csrc/core/npu/NPUFunctions.cpp @@ -103,10 +103,7 @@ aclError SetDevice(c10::DeviceIndex device) if (local_device == device) { return ACL_ERROR_NONE; } - - if (c10_npu::NeedMainThreadBind()) { - c10_npu::SetThreadAffinity(device); - } + c10_npu::SetThreadAffinity(device); aclError err = aclrtSetDevice(device); if (err == ACL_ERROR_NONE) { diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index 579514ab37..8d8a8da6a6 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -707,8 +707,6 @@ bool Repository::CheckInit() const void StartConsume(Repository *repo, c10::DeviceIndex device_id) { SetThreadType(ThreadType::ACL_THREAD); - SetThreadAffinity(device_id); - aclError ret = c10_npu::SetDevice(device_id); if (ret != 0) { C10_NPU_SHOW_ERR_MSG(); diff --git a/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp b/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp index 739056a7c4..961022b302 100644 --- a/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp +++ b/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp @@ -52,7 +52,6 @@ void NPUGuardImpl::setDevice(c10::Device d) const void NPUGuardImpl::uncheckedSetDevice(c10::Device d) const noexcept { - c10_npu::StartMainThreadBind(d.index()); NPU_CHECK_WARN(c10_npu::SetDevice(d.index())); } diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index d705c890ed..b3bf8185d4 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -190,8 +190,6 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) lazy_fn_.clear(); - SetMainThread(); - init_flag_ = true; ASCEND_LOGD("Npu sys ctrl initialize successfully."); diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 3403c954be..74df9e25bf 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -1718,7 +1718,6 @@ void ProcessGroupHCCL::workCleanupLoop() try { if (needSetDevice) { c10::DeviceIndex device = static_cast(work.devices_[0].index()); - c10_npu::SetThreadAffinity(device); NPU_CHECK_ERROR(c10_npu::SetDevice(device)); deviceId_ = static_cast(work.devices_[0].index()); needSetDevice = false; diff --git a/torch_npu/csrc/framework/LazyInitAclops.cpp 
b/torch_npu/csrc/framework/LazyInitAclops.cpp index 5f51f9f0a5..8d12df0a31 100644 --- a/torch_npu/csrc/framework/LazyInitAclops.cpp +++ b/torch_npu/csrc/framework/LazyInitAclops.cpp @@ -4,6 +4,7 @@ #include "torch_npu/csrc/core/npu/NPUException.h" #include "torch_npu/csrc/core/npu/NpuVariables.h" +#include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "torch_npu/csrc/core/npu/register/OptionRegister.h" #include "torch_npu/csrc/framework/interface/AclOpCompileInterface.h" #include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" @@ -157,6 +158,8 @@ void SetPrecisionMode() void LazyInitAclopsCore() { + c10_npu::SetThreadAffinity(c10_npu::ThreadType::OTHER_THREAD); + #ifndef BUILD_LIBTORCH PyThreadState *gilState = nullptr; if (PyGILState_Check()) { @@ -172,6 +175,8 @@ void LazyInitAclopsCore() PyEval_RestoreThread(gilState); } #endif + + c10_npu::SetThreadAffinity(c10_npu::ThreadType::MAIN_THREAD); } void LazyInitAclops() @@ -193,10 +198,14 @@ void LazyInitAclops() void InitAclopsCore() { + SetThreadAffinity(c10_npu::ThreadType::OTHER_THREAD); + SetPrecisionMode(); MakeCompileCacheDirAndSetOption(); GetAndSetDefaultJitCompileByAcl(); SetHF32DefaultValue(); + + SetThreadAffinity(c10_npu::ThreadType::MAIN_THREAD); } void InitAclops() diff --git a/torch_npu/csrc/framework/interface/EnvVariables.cpp b/torch_npu/csrc/framework/interface/EnvVariables.cpp index 577f8ef58b..d14bb46ae1 100644 --- a/torch_npu/csrc/framework/interface/EnvVariables.cpp +++ b/torch_npu/csrc/framework/interface/EnvVariables.cpp @@ -1,5 +1,6 @@ #include #include "torch_npu/csrc/core/npu/NPUException.h" +#include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "third_party/acl/inc/acl/acl_mdl.h" #include "torch_npu/csrc/framework/utils/ForceJitCompileList.h" @@ -47,6 +48,8 @@ REGISTER_OPTION_HOOK(mdldumpconfigpath, [](const std::string &val) { aclmdlSetDump(val.c_str()); }) +static bool acl_op_has_init = false; + REGISTER_OPTION_BOOL_FUNCTION(CheckJitDisableInner, jitCompile, "enable", "disable") REGISTER_OPTION_CACHE(bool, isJitDisable, CheckJitDisableInner) REGISTER_OPTION_HOOK(jitCompile, [](const std::string &val) { @@ -58,7 +61,14 @@ REGISTER_OPTION_HOOK(jitCompile, [](const std::string &val) { "Jit compile set is disabled! If you want to set, ", "please change the environment variable ACL_OP_INIT_MODE to 0 or 1.", PTA_ERROR(ErrCode::NOT_SUPPORT)); + if (!acl_op_has_init) { + c10_npu::SetThreadAffinity(c10_npu::ThreadType::OTHER_THREAD); + } NPU_CHECK_ERROR(AclSetCompileopt(aclCompileOpt::ACL_OP_JIT_COMPILE, val.c_str())); + if (!acl_op_has_init) { + c10_npu::SetThreadAffinity(c10_npu::ThreadType::MAIN_THREAD); + acl_op_has_init = true; + } } SET_OPTION_WITH_CACHE(isJitDisable, ("disable" == val) ? 
true : false); }) diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index 9282737cbf..1f06e2d2f7 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -1582,15 +1582,6 @@ PyObject* THNPModule_npu_set_thread_affinity(PyObject* self, PyObject* args) } else { c10_npu::SetThreadAffinity(core_start, core_end); } - - Py_RETURN_NONE; - END_HANDLE_TH_ERRORS -} - -PyObject* THNPModule_npu_reset_thread_affinity(PyObject* self, PyObject* noargs) -{ - HANDLE_TH_ERRORS - c10_npu::SetThreadAffinity(c10_npu::ThreadType::MAIN_THREAD); Py_RETURN_NONE; END_HANDLE_TH_ERRORS } @@ -1719,7 +1710,6 @@ static struct PyMethodDef THNPModule_methods[] = { {"_npu_set_module_train_state", (PyCFunction)THNPModule_npu_set_module_train_state, METH_O, nullptr}, {"_get_silent_check_version", (PyCFunction)THNPModule_npu_get_silent_check_version, METH_NOARGS, nullptr}, {"_npu_set_thread_affinity", (PyCFunction)THNPModule_npu_set_thread_affinity, METH_VARARGS, nullptr}, - {"_npu_reset_thread_affinity", (PyCFunction)THNPModule_npu_reset_thread_affinity, METH_NOARGS, nullptr}, {"_npu_set_fft_plan_cache_max_size", (PyCFunction)THNPModule_npu_set_fft_plan_cache_max_size, METH_VARARGS, nullptr}, {"_npu_get_fft_plan_cache_max_size", (PyCFunction)THNPModule_npu_get_fft_plan_cache_max_size, METH_NOARGS, nullptr}, {"_npu_get_fft_plan_cache_size", (PyCFunction)THNPModule_npu_get_fft_plan_cache_size, METH_NOARGS, nullptr}, diff --git a/torch_npu/utils/__init__.py b/torch_npu/utils/__init__.py index 200325e821..29adb686a3 100644 --- a/torch_npu/utils/__init__.py +++ b/torch_npu/utils/__init__.py @@ -1,5 +1,5 @@ __all__ = ["save_async", "npu_combine_tensors", "get_part_combined_tensor", "is_combined_tensor_valid", "FlopsCounter", - "set_thread_affinity", "reset_thread_affinity"] + "set_thread_affinity"] from torch_npu import _C from ._module import _apply_module_patch @@ -19,7 +19,6 @@ from .clip_grad_norm_ import _apply_clip_grad_norm_patch from ._step import add_perf_dump_patch from .flops_count import _FlopsCounter as FlopsCounter from .affinity import _set_thread_affinity as set_thread_affinity -from .affinity import _reset_thread_affinity as reset_thread_affinity # init flopcount diff --git a/torch_npu/utils/_module.py b/torch_npu/utils/_module.py index 313736c3a1..fbe408739f 100644 --- a/torch_npu/utils/_module.py +++ b/torch_npu/utils/_module.py @@ -30,6 +30,8 @@ from torch_npu.utils.syncbatchnorm import SyncBatchNorm as sync_batch_norm from torch_npu.utils._error_code import ErrCode, pta_error origin_mpdl_iter_init = _MultiProcessingDataLoaderIter.__init__ +origin_worker_loop = worker._worker_loop +origin_pin_memory_loop = pin_memory._pin_memory_loop CONV3D_SUPPORT_FP32_SOC_PREFIX = ["Ascend910B", "Ascend910_93"] @@ -368,9 +370,17 @@ def _mpdl_iter_init(self, *args, **kwargs): torch_npu.npu.synchronize() except Exception as e: print(e) - torch_npu._C._npu_set_thread_affinity(-1, -1) origin_mpdl_iter_init(self, *args, **kwargs) - torch_npu._C._npu_reset_thread_affinity() + + +def _npu_worker_loop(*args, **kwargs): + torch_npu._C._npu_set_thread_affinity(-1, -1) + origin_worker_loop(*args, **kwargs) + + +def _npu_pin_memory_loop(*args, **kwargs): + torch_npu._C._npu_set_thread_affinity(-1, -1) + origin_pin_memory_loop(*args, **kwargs) def _parallel_apply( @@ -523,3 +533,5 @@ def _apply_module_patch(): torch.nn.parallel.DataParallel.parallel_apply = npu_parallel_apply torch.nn.parallel.data_parallel = npu_data_parallel 
torch.utils.data.dataloader._MultiProcessingDataLoaderIter.__init__ = _mpdl_iter_init + torch.utils.data._utils.worker._worker_loop = _npu_worker_loop + torch.utils.data._utils.pin_memory._pin_memory_loop = _npu_pin_memory_loop diff --git a/torch_npu/utils/affinity.py b/torch_npu/utils/affinity.py index 37973f5bc7..7728736baa 100644 --- a/torch_npu/utils/affinity.py +++ b/torch_npu/utils/affinity.py @@ -14,8 +14,4 @@ def _set_thread_affinity(core_range: List[int] = None): raise ValueError("Core range should be nonnegative." + pta_error(ErrCode.PARAM)) torch_npu._C._npu_set_thread_affinity(core_range[0], core_range[1]) else: - raise ValueError("The length of input list of set_thread_affinity should be 2." + pta_error(ErrCode.PARAM)) - - -def _reset_thread_affinity(): - torch_npu._C._npu_reset_thread_affinity() \ No newline at end of file + raise ValueError("The length of input list of set_thread_affinity should be 2." + pta_error(ErrCode.PARAM)) \ No newline at end of file -- Gitee From 910269319fe58e7d198b26e77ced7fd235672476 Mon Sep 17 00:00:00 2001 From: chen_liqing Date: Tue, 12 Aug 2025 14:41:09 +0800 Subject: [PATCH 3/7] Revert "!21067 Lazy getThreadAffinityInfo()" This reverts commit 679469d1571865ca604c787ee60e1c4f85e83ff6. --- torch_npu/__init__.py | 4 ++- .../csrc/core/npu/NPUAffinityController.cpp | 26 +++++-------------- .../csrc/core/npu/NPUAffinityController.h | 1 + torch_npu/csrc/npu/Module.cpp | 9 +++++++ torch_npu/utils/affinity.py | 2 +- 5 files changed, 20 insertions(+), 22 deletions(-) diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py index ff0450375f..2b8fc18809 100644 --- a/torch_npu/__init__.py +++ b/torch_npu/__init__.py @@ -272,4 +272,6 @@ if 'TORCH_NPU_SANITIZER' in os.environ: if hasattr(sys, 'ps1'): os.environ["TASK_QUEUE_ENABLE"] = '0' warnings.warn("On the interactive interface, the value of TASK_QUEUE_ENABLE is set to 0 by default. 
\ - Do not set it to 1 to prevent some unknown errors") \ No newline at end of file + Do not set it to 1 to prevent some unknown errors") + +torch_npu._C._npu_get_thread_affinity() \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.cpp b/torch_npu/csrc/core/npu/NPUAffinityController.cpp index a331439d9f..c2c6e1baee 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.cpp +++ b/torch_npu/csrc/core/npu/NPUAffinityController.cpp @@ -12,6 +12,7 @@ namespace c10_npu { +static bool has_set_affinity = false; static thread_local ThreadType local_thread = ThreadType::MAIN_THREAD; using ThreadCoreMap = std::unordered_map; @@ -158,32 +159,23 @@ void printCoreRanges(const uint32_t mode, const std::vector &ranges ASCEND_LOGD("Read CPU affinity config: %s", oss.str().c_str()); } -bool getThreadAffinityInfo() +void GetThreadAffinityInfo() { parseCPUAffinityConf(cpu_affinity_mode, device_ranges); printCoreRanges(cpu_affinity_mode, device_ranges); - if (cpu_affinity_mode == 0) { - return false; - } - cpu_set_t mask; pthread_getaffinity_np(pthread_self(), sizeof(mask), &mask); for (auto &range : device_ranges) { for (unsigned int i = range.start; i < range.end; i++) { if (!CPU_ISSET(i, &mask)) { ASCEND_LOGW("Thread affinity is already set."); - return false; + has_set_affinity = true; + return; } } } - return true; -} - -inline bool needToSetThreadAffinity() -{ - static bool need_to_set_affinity = getThreadAffinityInfo(); - return need_to_set_affinity; + has_set_affinity = false; } void SetThreadType(ThreadType type) @@ -244,7 +236,7 @@ ThreadCoreMap getCpuAffinityMap(c10::DeviceIndex device_id, const std::vector(device_index); @@ -286,9 +275,6 @@ void SetThreadAffinity(ThreadType type) void SetThreadAffinity(int core_start, int core_end) { - if (!needToSetThreadAffinity()) { - return; - } static int core_nums = sysconf(_SC_NPROCESSORS_ONLN); core_start = std::min(core_start, core_nums); core_end = std::min(core_end, core_nums); diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.h b/torch_npu/csrc/core/npu/NPUAffinityController.h index 0ec3c4d995..f4162517a7 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.h +++ b/torch_npu/csrc/core/npu/NPUAffinityController.h @@ -19,6 +19,7 @@ enum ThreadType { USER_THREAD = 5, // Thread responsible for user. 
}; +void GetThreadAffinityInfo(); void SetThreadType(ThreadType type); void SetThreadAffinity(c10::DeviceIndex device); diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index 1f06e2d2f7..a85d9a5366 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -1568,6 +1568,14 @@ PyObject* THNPModule_npu_get_silent_check_version(PyObject* self, PyObject* noar END_HANDLE_TH_ERRORS } +PyObject* THNPModule_npu_get_thread_affinity(PyObject* self, PyObject* noargs) +{ + HANDLE_TH_ERRORS + c10_npu::GetThreadAffinityInfo(); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + PyObject* THNPModule_npu_set_thread_affinity(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS @@ -1709,6 +1717,7 @@ static struct PyMethodDef THNPModule_methods[] = { {"_npu_set_call_state", (PyCFunction)THNPModule_npu_set_call_state, METH_O, nullptr}, {"_npu_set_module_train_state", (PyCFunction)THNPModule_npu_set_module_train_state, METH_O, nullptr}, {"_get_silent_check_version", (PyCFunction)THNPModule_npu_get_silent_check_version, METH_NOARGS, nullptr}, + {"_npu_get_thread_affinity", (PyCFunction)THNPModule_npu_get_thread_affinity, METH_NOARGS, nullptr}, {"_npu_set_thread_affinity", (PyCFunction)THNPModule_npu_set_thread_affinity, METH_VARARGS, nullptr}, {"_npu_set_fft_plan_cache_max_size", (PyCFunction)THNPModule_npu_set_fft_plan_cache_max_size, METH_VARARGS, nullptr}, {"_npu_get_fft_plan_cache_max_size", (PyCFunction)THNPModule_npu_get_fft_plan_cache_max_size, METH_NOARGS, nullptr}, diff --git a/torch_npu/utils/affinity.py b/torch_npu/utils/affinity.py index 7728736baa..bfa16f45dd 100644 --- a/torch_npu/utils/affinity.py +++ b/torch_npu/utils/affinity.py @@ -11,7 +11,7 @@ def _set_thread_affinity(core_range: List[int] = None): torch_npu._C._npu_set_thread_affinity(-1, -1) elif (len(core_range) == 2): if core_range[0] < 0 or core_range[1] < 0: - raise ValueError("Core range should be nonnegative." + pta_error(ErrCode.PARAM)) + raise ValueError("Expected core core_range should be nonnegative." + pta_error(ErrCode.PARAM)) torch_npu._C._npu_set_thread_affinity(core_range[0], core_range[1]) else: raise ValueError("The length of input list of set_thread_affinity should be 2." + pta_error(ErrCode.PARAM)) \ No newline at end of file -- Gitee From 1addf0eb7018a9445e32b166aaffe1b6253f300d Mon Sep 17 00:00:00 2001 From: chen_liqing Date: Tue, 12 Aug 2025 14:41:21 +0800 Subject: [PATCH 4/7] Revert "!21028 Change thread affinity while aclop init" This reverts commit 077eaa6d09e9dd7e3d78f172ddd79425a2a26fd8. 
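
Note: this revert drops the temporary OTHER_THREAD/MAIN_THREAD rebinding that the original commit wrapped around aclop initialization (LazyInitAclopsCore, InitAclopsCore and the jitCompile option hook). The mechanism behind those calls is an ordinary Linux CPU affinity mask on the calling thread, as seen in the hunks below; a minimal standalone sketch of that pattern, using a hypothetical core range rather than the controller's per-device mapping, is:

    #define _GNU_SOURCE
    #include <pthread.h>
    #include <sched.h>

    // Pin the calling thread to cores [start, end]; returns 0 on success,
    // or an errno-style code from pthread_setaffinity_np on failure.
    static int bind_current_thread(unsigned start, unsigned end)
    {
        cpu_set_t mask;
        CPU_ZERO(&mask);
        for (unsigned i = start; i <= end; ++i) {
            CPU_SET(i, &mask);
        }
        return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);
    }

    // Usage sketch: bind to a narrow range before the heavy init phase and
    // rebind to the main-thread range afterwards, mirroring the
    // SetThreadAffinity(OTHER_THREAD) / SetThreadAffinity(MAIN_THREAD) pair
    // that this patch removes.
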
--- torch_npu/csrc/core/npu/NPUAffinityController.cpp | 9 --------- torch_npu/csrc/core/npu/NPUAffinityController.h | 2 -- torch_npu/csrc/framework/LazyInitAclops.cpp | 9 --------- torch_npu/csrc/framework/interface/EnvVariables.cpp | 10 ---------- torch_npu/csrc/npu/Module.cpp | 6 +++++- 5 files changed, 5 insertions(+), 31 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.cpp b/torch_npu/csrc/core/npu/NPUAffinityController.cpp index c2c6e1baee..6f339e9177 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.cpp +++ b/torch_npu/csrc/core/npu/NPUAffinityController.cpp @@ -264,15 +264,6 @@ void SetThreadAffinity(c10::DeviceIndex device_id) } } -void SetThreadAffinity(ThreadType type) -{ - int device_index; - NPU_CHECK_ERROR_WITHOUT_UCE(GetDevice(&device_index)); - c10::DeviceIndex device = static_cast(device_index); - SetThreadType(type); - SetThreadAffinity(device); -} - void SetThreadAffinity(int core_start, int core_end) { static int core_nums = sysconf(_SC_NPROCESSORS_ONLN); diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.h b/torch_npu/csrc/core/npu/NPUAffinityController.h index f4162517a7..6c15e8c20e 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.h +++ b/torch_npu/csrc/core/npu/NPUAffinityController.h @@ -21,9 +21,7 @@ enum ThreadType { void GetThreadAffinityInfo(); void SetThreadType(ThreadType type); - void SetThreadAffinity(c10::DeviceIndex device); -void SetThreadAffinity(ThreadType type); void SetThreadAffinity(int core_start, int core_end); } // namespace c10_npu \ No newline at end of file diff --git a/torch_npu/csrc/framework/LazyInitAclops.cpp b/torch_npu/csrc/framework/LazyInitAclops.cpp index 8d12df0a31..5f51f9f0a5 100644 --- a/torch_npu/csrc/framework/LazyInitAclops.cpp +++ b/torch_npu/csrc/framework/LazyInitAclops.cpp @@ -4,7 +4,6 @@ #include "torch_npu/csrc/core/npu/NPUException.h" #include "torch_npu/csrc/core/npu/NpuVariables.h" -#include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "torch_npu/csrc/core/npu/register/OptionRegister.h" #include "torch_npu/csrc/framework/interface/AclOpCompileInterface.h" #include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" @@ -158,8 +157,6 @@ void SetPrecisionMode() void LazyInitAclopsCore() { - c10_npu::SetThreadAffinity(c10_npu::ThreadType::OTHER_THREAD); - #ifndef BUILD_LIBTORCH PyThreadState *gilState = nullptr; if (PyGILState_Check()) { @@ -175,8 +172,6 @@ void LazyInitAclopsCore() PyEval_RestoreThread(gilState); } #endif - - c10_npu::SetThreadAffinity(c10_npu::ThreadType::MAIN_THREAD); } void LazyInitAclops() @@ -198,14 +193,10 @@ void LazyInitAclops() void InitAclopsCore() { - SetThreadAffinity(c10_npu::ThreadType::OTHER_THREAD); - SetPrecisionMode(); MakeCompileCacheDirAndSetOption(); GetAndSetDefaultJitCompileByAcl(); SetHF32DefaultValue(); - - SetThreadAffinity(c10_npu::ThreadType::MAIN_THREAD); } void InitAclops() diff --git a/torch_npu/csrc/framework/interface/EnvVariables.cpp b/torch_npu/csrc/framework/interface/EnvVariables.cpp index d14bb46ae1..577f8ef58b 100644 --- a/torch_npu/csrc/framework/interface/EnvVariables.cpp +++ b/torch_npu/csrc/framework/interface/EnvVariables.cpp @@ -1,6 +1,5 @@ #include #include "torch_npu/csrc/core/npu/NPUException.h" -#include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "third_party/acl/inc/acl/acl_mdl.h" #include "torch_npu/csrc/framework/utils/ForceJitCompileList.h" @@ -48,8 +47,6 @@ REGISTER_OPTION_HOOK(mdldumpconfigpath, [](const std::string &val) { aclmdlSetDump(val.c_str()); }) -static bool 
acl_op_has_init = false; - REGISTER_OPTION_BOOL_FUNCTION(CheckJitDisableInner, jitCompile, "enable", "disable") REGISTER_OPTION_CACHE(bool, isJitDisable, CheckJitDisableInner) REGISTER_OPTION_HOOK(jitCompile, [](const std::string &val) { @@ -61,14 +58,7 @@ REGISTER_OPTION_HOOK(jitCompile, [](const std::string &val) { "Jit compile set is disabled! If you want to set, ", "please change the environment variable ACL_OP_INIT_MODE to 0 or 1.", PTA_ERROR(ErrCode::NOT_SUPPORT)); - if (!acl_op_has_init) { - c10_npu::SetThreadAffinity(c10_npu::ThreadType::OTHER_THREAD); - } NPU_CHECK_ERROR(AclSetCompileopt(aclCompileOpt::ACL_OP_JIT_COMPILE, val.c_str())); - if (!acl_op_has_init) { - c10_npu::SetThreadAffinity(c10_npu::ThreadType::MAIN_THREAD); - acl_op_has_init = true; - } } SET_OPTION_WITH_CACHE(isJitDisable, ("disable" == val) ? true : false); }) diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index a85d9a5366..f3563d1bdf 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -1586,7 +1586,11 @@ PyObject* THNPModule_npu_set_thread_affinity(PyObject* self, PyObject* args) } if (core_start == -1) { - c10_npu::SetThreadAffinity(c10_npu::ThreadType::OTHER_THREAD); + int device_index; + NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::GetDevice(&device_index)); + c10::DeviceIndex device = static_cast(device_index); + c10_npu::SetThreadType(c10_npu::ThreadType::OTHER_THREAD); + c10_npu::SetThreadAffinity(device); } else { c10_npu::SetThreadAffinity(core_start, core_end); } -- Gitee From 80e7beaf8a4463a459471bb8342123d10f85be63 Mon Sep 17 00:00:00 2001 From: chen_liqing Date: Tue, 12 Aug 2025 14:43:48 +0800 Subject: [PATCH 5/7] Revert "!20142 CPU Affinity Optimization" This reverts commit 65acf094ba8d63f51a87b2d5d95d5b208a4c364e. --- test/torch_npu_schema.json | 3 - torch_npu/__init__.py | 4 +- .../csrc/core/npu/NPUAffinityController.cpp | 480 +++++++++--------- .../csrc/core/npu/NPUAffinityController.h | 54 +- torch_npu/csrc/core/npu/NPUFunctions.cpp | 2 - torch_npu/csrc/core/npu/NPUQueue.cpp | 6 +- torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp | 1 + .../csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 4 + .../csrc/distributed/ProcessGroupHCCL.cpp | 4 +- torch_npu/csrc/npu/Module.cpp | 32 +- torch_npu/utils/__init__.py | 6 +- torch_npu/utils/_module.py | 19 +- torch_npu/utils/affinity.py | 17 - 13 files changed, 305 insertions(+), 327 deletions(-) delete mode 100644 torch_npu/utils/affinity.py diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json index a273a3d9a6..771e4ec984 100644 --- a/test/torch_npu_schema.json +++ b/test/torch_npu_schema.json @@ -2786,9 +2786,6 @@ "torch_npu.npu_all_gather_base_mm": { "signature": "(*args, **kwargs)" }, - "torch_npu.utils.set_thread_affinity": { - "signature": "(core_range: List[int] = None)" - }, "torch_npu.dynamo.torchair.scope.npu_stream_switch": { "signature": "(stream_tag: str, stream_priority: int = 0)" }, diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py index 2b8fc18809..ff0450375f 100644 --- a/torch_npu/__init__.py +++ b/torch_npu/__init__.py @@ -272,6 +272,4 @@ if 'TORCH_NPU_SANITIZER' in os.environ: if hasattr(sys, 'ps1'): os.environ["TASK_QUEUE_ENABLE"] = '0' warnings.warn("On the interactive interface, the value of TASK_QUEUE_ENABLE is set to 0 by default. 
\ - Do not set it to 1 to prevent some unknown errors") - -torch_npu._C._npu_get_thread_affinity() \ No newline at end of file + Do not set it to 1 to prevent some unknown errors") \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.cpp b/torch_npu/csrc/core/npu/NPUAffinityController.cpp index 6f339e9177..cb7d7a928e 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.cpp +++ b/torch_npu/csrc/core/npu/NPUAffinityController.cpp @@ -1,286 +1,294 @@ + #include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "torch_npu/csrc/core/npu/NPUFunctions.h" -#include "torch_npu/csrc/core/npu/GetAffinityCPUInfo.h" -#include "torch_npu/csrc/core/npu/NpuVariables.h" #include #include #include +#include #include #include #include +#include +#include namespace c10_npu { -static bool has_set_affinity = false; -static thread_local ThreadType local_thread = ThreadType::MAIN_THREAD; - -using ThreadCoreMap = std::unordered_map; - -static uint32_t cpu_affinity_mode; -static std::vector device_ranges; -static std::unordered_map device_thread_core_maps; - -const std::initializer_list threadTypeList = { - MAIN_THREAD, ACL_THREAD, RELEASE_THREAD, WATCHDOG_THREAD, OTHER_THREAD}; - -const std::unordered_map threadTypeToNameMap = { - {MAIN_THREAD, "main_thread"}, - {ACL_THREAD, "acl_thread"}, - {RELEASE_THREAD, "release_thread"}, - {WATCHDOG_THREAD, "hccl_watchdog_t"}, - {OTHER_THREAD, "other_thread"}, - {USER_THREAD, "user_thread"}}; - -CoreIdRange getCPUDefaultRange(c10::DeviceIndex device_id) -{ - static int core_nums = sysconf(_SC_NPROCESSORS_ONLN); - int device_nums = device_count_ensure_non_zero(); - int block_size = (core_nums > 0 && device_nums > 0) ? core_nums / device_nums : 0; - return CoreIdRange{static_cast(device_id * block_size), - static_cast((device_id + 1) * block_size - 1)}; -} - -inline bool isAllDigits(const std::string &str) -{ - if (str.empty()) { + static pthread_t mainthread_tid; + static bool has_set_affinity = false; + + const std::unordered_map threadTypeToNameMap = { + {releaseThread, "release_thread"}, + {aclThread, "acl_thread"}, + {mainThread, "main_thread"}, + {hcclCommWatchdogThread, "hcclComm_watchd"}, // thread name no more than 15 chars + {backwardThread, "backward_thread"}}; + + const std::unordered_map threadNameToTypeMap = { + {"release_thread", releaseThread}, + {"acl_thread", aclThread}, + {"main_thread", mainThread}, + {"hcclComm_watchd", hcclCommWatchdogThread}, + {"backward_thread", backwardThread}}; + + inline bool has_set_pthread_affinity() + { + unsigned int core_nums = static_cast(sysconf(_SC_NPROCESSORS_ONLN)); + + cpu_set_t mask; + pthread_getaffinity_np(pthread_self(), sizeof(mask), &mask); + for (unsigned int i = 0; i < core_nums; i++) { + if (!CPU_ISSET(i, &mask)) { + return true; + } + } return false; } - return std::all_of(str.begin(), str.end(), [](unsigned char c) { - return std::isdigit(c); - }); -} - -void parseCPUAffinityConf(uint32_t &mode, std::vector &ranges) -{ - // init - int device_nums = device_count_ensure_non_zero(); - ranges.clear(); - ranges.resize(device_nums); - for (int i = 0; i < device_nums; ++i) { - ranges[i] = getCPUDefaultRange(i); - } - mode = 0; - const char *input = c10_npu::option::OptionsManager::GetCpuAffinityConf(); - if (input == nullptr || strlen(input) == 0) { - return; + void GetAffinityInfo() + { + mainthread_tid = pthread_self(); + has_set_affinity = has_set_pthread_affinity(); } - std::string inputStr(input); - std::istringstream stream(inputStr); - std::string option; - - 
std::regex pattern("npu_affine:(\\d)"); - std::smatch match; - if (std::regex_search(inputStr, match, pattern)) { - int isAffinity = std::stoi(match[1].str()); - if (isAffinity != 0) { - for (int i = 0; i < device_nums; i++) { - CoreIdRange getRange = GetAssignAffinityCPU(i); - if (getRange.start == 0 && getRange.end == 0) { - break; - } - ranges[i] = getRange; + ThreadType getCurrentThreadType() + { + char thread_name[16]; + + if (prctl(PR_GET_NAME, thread_name, 0, 0, 0) == 0) { + std::string name(thread_name); + + auto it = threadNameToTypeMap.find(name); + if (it != threadNameToTypeMap.end()) { + return it->second; } } + return ThreadType::unknownThread; } - // Handle cases where only `mode` is provided, or `mode:` without value - if (isAllDigits(inputStr)) { - mode = static_cast(std::stoi(inputStr)); - return; // Return directly, `mode` has already been processed + aclError SetThreadAffinity(coreIdRange core_range, pthread_t thread) + { + cpu_set_t mask; + CPU_ZERO(&mask); + + for (auto i = core_range.start; i <= core_range.end; i++) { + CPU_SET(i, &mask); + } + if (!pthread_setaffinity_np(thread, sizeof(mask), &mask)) { + ASCEND_LOGD("Set Thread Affinity to %d-%d", core_range.start, core_range.end); + return ACL_ERROR_NONE; + } + return ACL_ERROR_FEATURE_UNSUPPORTED; + } + + coreIdRange GetCPUDefaultRange(c10::DeviceIndex device_id) + { + int core_nums = sysconf(_SC_NPROCESSORS_ONLN); + int device_nums = device_count_ensure_non_zero(); + int block_size = (core_nums > 0 && device_nums > 0) ? (core_nums + device_nums - 1) / device_nums : 0; + return coreIdRange{static_cast(device_id * block_size), + static_cast(std::min((device_id + 1) * block_size, core_nums) - 1)}; } - // Parse each option - while (std::getline(stream, option, ',')) { - // Split `option` based on colon - size_t colonPos = option.find(':'); - if (colonPos != std::string::npos) { - std::string key = option.substr(0, colonPos); - std::string value = option.substr(colonPos + 1); - - // Process `mode` - if (key == "mode") { - if (isAllDigits(value)) { - mode = static_cast(std::stoi(value)); + + std::string GetAffinityMapAsString(const std::unordered_map &threadToCoreidMap, c10::DeviceIndex device_id) + { + std::ostringstream oss; + oss << "threadToCoreidMap plan to bind device " << static_cast(device_id) << " to " + << " [" << threadToCoreidMap.at(unknownThread).start << "," << threadToCoreidMap.at(unknownThread).end << "]、" + << " [" << threadToCoreidMap.at(mainThread).start << "," << threadToCoreidMap.at(mainThread).end << "]、" + << " [" << threadToCoreidMap.at(backwardThread).start << "," << threadToCoreidMap.at(backwardThread).end << "]、" + << " [" << threadToCoreidMap.at(aclThread).start << "," << threadToCoreidMap.at(aclThread).end << "]、" + << " [" << threadToCoreidMap.at(releaseThread).start << "," << threadToCoreidMap.at(releaseThread).end << "]、" + << " [" << threadToCoreidMap.at(hcclCommWatchdogThread).start << "," << threadToCoreidMap.at(hcclCommWatchdogThread).end << "]"; + + return oss.str(); + } + + std::unordered_map GetCpuAffinityMap(c10::DeviceIndex device_id) + { + std::unordered_map threadToCoreidMap; + std::initializer_list thread_types = {unknownThread, mainThread, backwardThread, aclThread, + releaseThread, hcclCommWatchdogThread}; + + coreIdRange current_core_range = GetCPUDefaultRange(device_id); + coreId offset = current_core_range.start; + + // calculate env2 default map + coreId core_nums = current_core_range.end - current_core_range.start; + if (core_nums < thread_types.size()) { + 
ASCEND_LOGW("Available core numbers (%d) are insufficient for all %zu thread types. Binding available cores to all threads.", + core_nums, thread_types.size()); + for (auto thread_type : thread_types) { + threadToCoreidMap[thread_type] = current_core_range; + } + } else { + size_t remaining_type_count = thread_types.size() - 1; + int i = 0; + for (auto thread_type : thread_types) { + if (thread_type == ThreadType::unknownThread) { + threadToCoreidMap[ThreadType::unknownThread] = coreIdRange{current_core_range.start + remaining_type_count, current_core_range.end}; } else { - ASCEND_LOGW("mode is %s, should be all digits", value.c_str()); - } - } else if (key.rfind("npu", 0) == 0) { - // Handle NPU core binding range - // The key is like 'npu:0', so skip first 3 chars. - if (isAllDigits(key.substr(3))) { - int device_id = std::stoi(key.substr(3)); // Parse NPU device ID - if (device_id < device_nums) { - size_t dashPos = value.find('-'); - if (dashPos != std::string::npos) { - std::string startStr = value.substr(0, dashPos); - std::string endStr = value.substr(dashPos + 1); - if (isAllDigits(startStr) && isAllDigits(endStr)) { - CoreId start = static_cast(std::stoi(startStr)); - CoreId end = static_cast(std::stoi(endStr)); - ranges[device_id] = {start, end}; - } else { - ASCEND_LOGW("core range is %s-%s, should be all digits", startStr.c_str(), endStr.c_str()); - } - } else { - if (isAllDigits(value)) { - CoreId singleCore = static_cast(std::stoi(value)); - ranges[device_id] = {singleCore, singleCore}; - } else { - ASCEND_LOGW("core range is string : %s, should be all digits", value.c_str()); - } - } - } + threadToCoreidMap[thread_type] = coreIdRange{offset + i, offset + (i++)}; } } - } else if (isAllDigits(option)) { - // If no colon and the value is a number, use it directly as `mode` - mode = static_cast(std::stoi(option)); } - } -} -void printCoreRanges(const uint32_t mode, const std::vector &ranges) -{ - std::ostringstream oss; - oss << "Mode: " << mode << ". Core range for each device ID: "; + ASCEND_LOGD("Thread affinity map for device %d: %s", device_id, GetAffinityMapAsString(threadToCoreidMap, device_id).c_str()); - for (size_t i = 0; i < ranges.size(); ++i) { - oss << "Device " << i << ": [" << ranges[i].start << "," << ranges[i].end << "]"; - if (i != ranges.size() - 1) { - oss << "; "; - } else { - oss << "."; - } + return threadToCoreidMap; } - ASCEND_LOGD("Read CPU affinity config: %s", oss.str().c_str()); -} + aclError SetThreadAffinity(c10::DeviceIndex device_id) + { + return SetThreadAffinity(device_id, getCurrentThreadType()); + } -void GetThreadAffinityInfo() -{ - parseCPUAffinityConf(cpu_affinity_mode, device_ranges); - printCoreRanges(cpu_affinity_mode, device_ranges); + void printCoreRanges(const std::vector &ranges, uint32_t mode) + { + std::ostringstream oss; + oss << "Mode: " << mode << " "; - cpu_set_t mask; - pthread_getaffinity_np(pthread_self(), sizeof(mask), &mask); - for (auto &range : device_ranges) { - for (unsigned int i = range.start; i < range.end; i++) { - if (!CPU_ISSET(i, &mask)) { - ASCEND_LOGW("Thread affinity is already set."); - has_set_affinity = true; - return; - } + for (size_t i = 0; i < ranges.size(); ++i) { + oss << "Device " << i << " Core Range: " << ranges[i].start << " - " << ranges[i].end << " "; } + + ASCEND_LOGD("Core ranges: %s", oss.str().c_str()); } - has_set_affinity = false; -} - -void SetThreadType(ThreadType type) -{ - // Called at the start of the thread's execution to avoid frequent triggering of this function. 
- local_thread = type; - if (type == ThreadType::OTHER_THREAD || type == ThreadType::MAIN_THREAD) { - return; - } - if (prctl(PR_SET_NAME, threadTypeToNameMap.at(type).c_str()) != 0) { - ASCEND_LOGW("Set thread name of %s failed!", threadTypeToNameMap.at(type).c_str()); - } -} - -std::string getAffinityMapAsString(c10::DeviceIndex device_id, const ThreadCoreMap &threadCoreMap) -{ - std::ostringstream oss; - for (auto local_thread : threadTypeList) { - oss << threadTypeToNameMap.at(local_thread) << " : [" - << threadCoreMap.at(local_thread).start << "," - << threadCoreMap.at(local_thread).end << "]"; - if (local_thread != OTHER_THREAD) { - oss << "; "; - } else { - oss << "."; + + bool isAllDigits(const std::string &str) + { + if (str.empty()) { + return false; } + return std::all_of(str.begin(), str.end(), [](unsigned char c) { + return std::isdigit(c); + }); } - return oss.str(); -} - -ThreadCoreMap getCpuAffinityMap(c10::DeviceIndex device_id, const std::vector &device_ranges) -{ - ThreadCoreMap threadCoreMap; - CoreIdRange range = device_ranges[device_id]; - unsigned int core_nums = range.end - range.start + 1; - if (core_nums < threadTypeList.size()) { - ASCEND_LOGW("Device %d available core numbers (%d) are insufficient for all %zu thread types and will bind available cores to all threads.", - device_id, core_nums, threadTypeList.size()); - for (auto local_thread : threadTypeList) { - threadCoreMap[local_thread] = range; + + void parseCPUAffinityConf(uint32_t &mode, std::vector &ranges) + { + const char *input = c10_npu::option::OptionsManager::GetCpuAffinityConf(); + + if (input == nullptr || strlen(input) == 0) { + mode = 0; + return; } - return threadCoreMap; - } - CoreId now = range.start; - for (auto local_thread : threadTypeList) { - if (local_thread != ThreadType::OTHER_THREAD) { - threadCoreMap[local_thread] = CoreIdRange{now, now}; - } else { - threadCoreMap[ThreadType::OTHER_THREAD] = CoreIdRange{now, range.end}; + mode = 0; + int device_nums = device_count_ensure_non_zero(); + ranges.clear(); + ranges.resize(device_nums); + + // init + for (int i = 0; i < device_nums; ++i) { + ranges[i] = GetCPUDefaultRange(i); } - now++; - } - ASCEND_LOGD("Device %d thread affinity map: %s", device_id, getAffinityMapAsString(device_id, threadCoreMap).c_str()); - return threadCoreMap; -} + std::string inputStr(input); + std::istringstream stream(inputStr); + std::string option; -void SetThreadAffinity(c10::DeviceIndex device_id) -{ - if (has_set_affinity || cpu_affinity_mode == 0 || local_thread == ThreadType::USER_THREAD) { - return; - } + // Handle cases where only `mode` is provided, or `mode:` without value + if (isAllDigits(inputStr)) { + mode = static_cast(std::stoi(inputStr)); + return; // Return directly, `mode` has already been processed + } - CoreIdRange core_range; - if (cpu_affinity_mode == 1) { - core_range = device_ranges[device_id]; - } else { - if (device_thread_core_maps.find(device_id) == device_thread_core_maps.end()) { - device_thread_core_maps.emplace(device_id, getCpuAffinityMap(device_id, device_ranges)); + // Parse each option + while (std::getline(stream, option, ',')) { + // Split `option` based on colon + size_t colonPos = option.find(':'); + if (colonPos != std::string::npos) { + std::string key = option.substr(0, colonPos); + std::string value = option.substr(colonPos + 1); + + // Process `mode` + if (key == "mode") { + if (isAllDigits(value)) { + mode = static_cast(std::stoi(value)); + } else { + ASCEND_LOGW("mode is %s, should be all digits", value.c_str()); + 
} + } else if (key.rfind("npu", 0) == 0) { + // Handle NPU core binding range + if (isAllDigits(key.substr(3))) { + int device_id = std::stoi(key.substr(3)); // Parse NPU device ID + if (device_id < device_nums) { + size_t dashPos = value.find('-'); + if (dashPos != std::string::npos) { + std::string startStr = value.substr(0, dashPos); + std::string endStr = value.substr(dashPos + 1); + if (isAllDigits(startStr) && isAllDigits(endStr)) { + coreId start = static_cast(std::stoi(startStr)); + coreId end = static_cast(std::stoi(endStr)); + ranges[device_id] = {start, end}; + } else { + ASCEND_LOGW("core range is %s-%s, should be all digits", startStr.c_str(), endStr.c_str()); + } + } else { + if (isAllDigits(value)) { + coreId singleCore = static_cast(std::stoi(value)); + ranges[device_id] = {singleCore, singleCore}; + } else { + ASCEND_LOGW("core range is string : %s, should be all digits", value.c_str()); + } + } + } + } + } + } else if (isAllDigits(option)) { + // If no colon and the value is a number, use it directly as `mode` + mode = static_cast(std::stoi(option)); + } } - core_range = device_thread_core_maps.at(device_id).at(local_thread); } - cpu_set_t mask; - CPU_ZERO(&mask); - for (auto i = core_range.start; i <= core_range.end; i++) { - CPU_SET(i, &mask); - } - if (!pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask)) { - ASCEND_LOGD("Device %d set %s affinity to %d-%d success.", - device_id, threadTypeToNameMap.at(local_thread).c_str(), core_range.start, core_range.end); - } else { - ASCEND_LOGE("Device %d set %s affinity to %d-%d failed.", - device_id, threadTypeToNameMap.at(local_thread).c_str(), core_range.start, core_range.end); + aclError SetThreadAffinity(c10::DeviceIndex device_id, ThreadType current_thread_type) + { + if (has_set_affinity) { + ASCEND_LOGW("Thread affinity is already set."); + return ACL_ERROR_NONE; + } + uint32_t bind_conf; + std::vector ranges; + parseCPUAffinityConf(bind_conf, ranges); + printCoreRanges(ranges, bind_conf); + + // bind_conf=1, bind cores averagely based on device_id + if (bind_conf == 1) { + return SetThreadAffinity(ranges[device_id], pthread_self()); + } else if (bind_conf == 2) { + auto thread_core_map = GetCpuAffinityMap(device_id); + // Bind the main thread only when the dispatch phase begins (i.e., when ThreadType::backwardThread is set) + if (current_thread_type == ThreadType::backwardThread) { + SetThreadAffinity(thread_core_map.at(ThreadType::mainThread), mainthread_tid); + } + return SetThreadAffinity(thread_core_map.at(current_thread_type), pthread_self()); + } else { + ASCEND_LOGD("Thread affinity setting is disabled."); + } + return ACL_ERROR_NONE; } -} - -void SetThreadAffinity(int core_start, int core_end) -{ - static int core_nums = sysconf(_SC_NPROCESSORS_ONLN); - core_start = std::min(core_start, core_nums); - core_end = std::min(core_end, core_nums); - local_thread = ThreadType::USER_THREAD; - - cpu_set_t mask; - CPU_ZERO(&mask); - for (auto i = core_start; i <= core_end; i++) { - CPU_SET(i, &mask); + + void SetBackwardThreadName(c10::DeviceIndex device_id) + { + static thread_local bool seted = false; + if (!seted) { + seted = true; + if (syscall(SYS_gettid) != getpid()) { + SetThreadName(ThreadType::backwardThread); + SetThreadAffinity(device_id); + } + } } - if (!pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask)) { - ASCEND_LOGD("Set %s affinity to %d-%d success.", threadTypeToNameMap.at(local_thread).c_str(), core_start, core_end); - } else { - ASCEND_LOGE("Set %s affinity to %d-%d failed.", 
threadTypeToNameMap.at(local_thread).c_str(), core_start, core_end); + + void SetThreadName(ThreadType type) + { + // Ensure this is called at the start of the thread's execution to avoid frequent triggering of this function. + if (prctl(PR_SET_NAME, threadTypeToNameMap.at(type).c_str()) != 0) { + ASCEND_LOGW("set thread name failed!"); + } } -} -} // namespace c10_npu \ No newline at end of file +} \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.h b/torch_npu/csrc/core/npu/NPUAffinityController.h index 6c15e8c20e..f2e78b69b6 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.h +++ b/torch_npu/csrc/core/npu/NPUAffinityController.h @@ -1,27 +1,35 @@ #pragma once -#include +#include "torch_npu/csrc/core/npu/npu_log.h" namespace c10_npu { -using CoreId = unsigned int; -struct CoreIdRange { - CoreId start; - CoreId end; -}; - -enum ThreadType { - MAIN_THREAD = 0, // 1st performance hotspot, responsible for operator dispatching. - ACL_THREAD = 1, // 2rd performance hotspot in PTA, responsible for handling the task queue. - RELEASE_THREAD = 2, // Thread responsible for resource release. - WATCHDOG_THREAD = 3, // Thread responsible for HCCL communication monitoring. - OTHER_THREAD = 4, // Mostly refers to threads in PyTorch's motorized sleep thread pool, which - // are not considered in PTA. - USER_THREAD = 5, // Thread responsible for user. -}; - -void GetThreadAffinityInfo(); -void SetThreadType(ThreadType type); -void SetThreadAffinity(c10::DeviceIndex device); -void SetThreadAffinity(int core_start, int core_end); - -} // namespace c10_npu \ No newline at end of file + typedef unsigned int coreId; + + struct coreIdRange { + coreId start; + coreId end; + }; + + enum ThreadType { + unknownThread = 0, // Mostly refers to threads in PyTorch's motorized sleep thread pool, which are not considered in PTA. + mainThread = 1, // 1st performance hotspot, responsible for operator dispatching during the forward phase. + backwardThread = 2, // 2nd performance hotspot, responsible for operator dispatching during the backward phase. + aclThread = 3, // 3rd performance hotspot in PTA, responsible for handling the task queue. + releaseThread = 4, // Thread responsible for resource release. + hcclCommWatchdogThread = 5 // Thread responsible for HCCL communication monitoring. + }; + + aclError SetThreadAffinity(c10::DeviceIndex device); + aclError SetThreadAffinity(c10::DeviceIndex device, ThreadType current_thread_type); + void SetThreadName(ThreadType type); + + // The main thread of PTA, which is also the main thread of PyTorch, handles multiple phases of tasks + // (e.g., first parallel checkpoint data loading, then transitioning to forward training). + // Each phase may require different thread affinity settings. Therefore, we record the thread's TID + // to adjust its affinity later as needed. 
+ void GetAffinityInfo(); + + // Set backwardThread Name Once + void SetBackwardThreadName(c10::DeviceIndex device_id); + +} \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUFunctions.cpp b/torch_npu/csrc/core/npu/NPUFunctions.cpp index 91b301e043..1ba90030fb 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.cpp +++ b/torch_npu/csrc/core/npu/NPUFunctions.cpp @@ -3,7 +3,6 @@ #include #include "torch_npu/csrc/core/npu/NPUFunctions.h" #include "torch_npu/csrc/core/npu/NPUStream.h" -#include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "torch_npu/csrc/core/npu/register/OptionsManager.h" #ifndef BUILD_LIBTORCH #include "torch_npu/csrc/sanitizer/NPUTrace.h" @@ -103,7 +102,6 @@ aclError SetDevice(c10::DeviceIndex device) if (local_device == device) { return ACL_ERROR_NONE; } - c10_npu::SetThreadAffinity(device); aclError err = aclrtSetDevice(device); if (err == ACL_ERROR_NONE) { diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index 8d8a8da6a6..231e9a2244 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -706,7 +706,9 @@ bool Repository::CheckInit() const void StartConsume(Repository *repo, c10::DeviceIndex device_id) { - SetThreadType(ThreadType::ACL_THREAD); + SetThreadName(ThreadType::aclThread); + SetThreadAffinity(device_id); + aclError ret = c10_npu::SetDevice(device_id); if (ret != 0) { C10_NPU_SHOW_ERR_MSG(); @@ -818,7 +820,7 @@ void ReleaseQueue::PopFromReleaseQueue() void StartRelease(ReleaseQueue *releaseQue) { - SetThreadType(ThreadType::RELEASE_THREAD); + SetThreadName(ThreadType::releaseThread); SetThreadAffinity(releaseQue->GetDeviceID()); while (releaseQue->GetStatus() != RepoStatus::CAN_EXIT) { diff --git a/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp b/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp index 961022b302..6ce7a03e03 100644 --- a/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp +++ b/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp @@ -52,6 +52,7 @@ void NPUGuardImpl::setDevice(c10::Device d) const void NPUGuardImpl::uncheckedSetDevice(c10::Device d) const noexcept { + SetBackwardThreadName(d.index()); NPU_CHECK_WARN(c10_npu::SetDevice(d.index())); } diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index b3bf8185d4..1870de034c 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -190,6 +190,10 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) lazy_fn_.clear(); + GetAffinityInfo(); + + SetThreadAffinity(device_id_); + init_flag_ = true; ASCEND_LOGD("Npu sys ctrl initialize successfully."); diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 74df9e25bf..f8f3047927 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -1591,8 +1591,9 @@ void ProcessGroupHCCL::heartbeatMonitor() void ProcessGroupHCCL::hcclCommWatchdog() { - c10_npu::SetThreadType(c10_npu::ThreadType::WATCHDOG_THREAD); try { + c10_npu::SetThreadName(c10_npu::ThreadType::hcclCommWatchdogThread); + VLOG(2) << "[Rank " << rank_ << "] HCCL watchdog thread started!"; if (monitorThreadEnabled_.load()) { hcclHeartbeatMonitorThread_ = std::thread(&ProcessGroupHCCL::heartbeatMonitor, this); @@ -1718,6 +1719,7 @@ void ProcessGroupHCCL::workCleanupLoop() try { if (needSetDevice) { c10::DeviceIndex device = 
static_cast(work.devices_[0].index());
+        c10_npu::SetThreadAffinity(device);
         NPU_CHECK_ERROR(c10_npu::SetDevice(device));
         deviceId_ = static_cast(work.devices_[0].index());
         needSetDevice = false;
diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp
index f3563d1bdf..8bf1dab99a 100644
--- a/torch_npu/csrc/npu/Module.cpp
+++ b/torch_npu/csrc/npu/Module.cpp
@@ -1568,32 +1568,24 @@ PyObject* THNPModule_npu_get_silent_check_version(PyObject* self, PyObject* noar
     END_HANDLE_TH_ERRORS
 }
 
-PyObject* THNPModule_npu_get_thread_affinity(PyObject* self, PyObject* noargs)
+PyObject* THNPModule_npu_set_thread_affinity(PyObject* self, PyObject* noargs)
 {
     HANDLE_TH_ERRORS
-    c10_npu::GetThreadAffinityInfo();
+    int device_index;
+    NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::GetDevice(&device_index));
+    c10::DeviceIndex device = static_cast(device_index);
+    c10_npu::SetThreadAffinity(device, c10_npu::ThreadType::mainThread);
     Py_RETURN_NONE;
     END_HANDLE_TH_ERRORS
 }
 
-PyObject* THNPModule_npu_set_thread_affinity(PyObject* self, PyObject* args)
+PyObject* THNPModule_npu_reset_thread_affinity(PyObject* self, PyObject* noargs)
 {
     HANDLE_TH_ERRORS
-    int core_start;
-    int core_end;
-    if (!PyArg_ParseTuple(args, "ii", &core_start, &core_end)) {
-        throw torch::TypeError("Pybind failed to parse parameters." + PTA_ERROR(ErrCode::TYPE));
-    }
-
-    if (core_start == -1) {
-        int device_index;
-        NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::GetDevice(&device_index));
-        c10::DeviceIndex device = static_cast(device_index);
-        c10_npu::SetThreadType(c10_npu::ThreadType::OTHER_THREAD);
-        c10_npu::SetThreadAffinity(device);
-    } else {
-        c10_npu::SetThreadAffinity(core_start, core_end);
-    }
+    int device_index;
+    NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::GetDevice(&device_index));
+    c10::DeviceIndex device = static_cast(device_index);
+    c10_npu::SetThreadAffinity(device, c10_npu::ThreadType::unknownThread);
     Py_RETURN_NONE;
     END_HANDLE_TH_ERRORS
 }
@@ -1721,8 +1713,8 @@ static struct PyMethodDef THNPModule_methods[] = {
     {"_npu_set_call_state", (PyCFunction)THNPModule_npu_set_call_state, METH_O, nullptr},
     {"_npu_set_module_train_state", (PyCFunction)THNPModule_npu_set_module_train_state, METH_O, nullptr},
     {"_get_silent_check_version", (PyCFunction)THNPModule_npu_get_silent_check_version, METH_NOARGS, nullptr},
-    {"_npu_get_thread_affinity", (PyCFunction)THNPModule_npu_get_thread_affinity, METH_NOARGS, nullptr},
-    {"_npu_set_thread_affinity", (PyCFunction)THNPModule_npu_set_thread_affinity, METH_VARARGS, nullptr},
+    {"_npu_set_threads_affinity", (PyCFunction)THNPModule_npu_set_thread_affinity, METH_NOARGS, nullptr},
+    {"_npu_reset_threads_affinity", (PyCFunction)THNPModule_npu_reset_thread_affinity, METH_NOARGS, nullptr},
     {"_npu_set_fft_plan_cache_max_size", (PyCFunction)THNPModule_npu_set_fft_plan_cache_max_size, METH_VARARGS, nullptr},
     {"_npu_get_fft_plan_cache_max_size", (PyCFunction)THNPModule_npu_get_fft_plan_cache_max_size, METH_NOARGS, nullptr},
     {"_npu_get_fft_plan_cache_size", (PyCFunction)THNPModule_npu_get_fft_plan_cache_size, METH_NOARGS, nullptr},
diff --git a/torch_npu/utils/__init__.py b/torch_npu/utils/__init__.py
index 29adb686a3..8c33a26e4b 100644
--- a/torch_npu/utils/__init__.py
+++ b/torch_npu/utils/__init__.py
@@ -1,6 +1,3 @@
-__all__ = ["save_async", "npu_combine_tensors", "get_part_combined_tensor", "is_combined_tensor_valid", "FlopsCounter",
-           "set_thread_affinity"]
-
 from torch_npu import _C
 from ._module import _apply_module_patch
 from .tensor_methods import _add_tensor_methods
@@ -18,7 +15,8 @@ from .utils import _print_error_log, _print_warn_log, _print_info_log, _should_p
 from .clip_grad_norm_ import _apply_clip_grad_norm_patch
 from ._step import add_perf_dump_patch
 from .flops_count import _FlopsCounter as FlopsCounter
-from .affinity import _set_thread_affinity as set_thread_affinity
+
+__all__ = ["save_async", "npu_combine_tensors", "get_part_combined_tensor", "is_combined_tensor_valid", "FlopsCounter"]
 
 
 # init flopcount
diff --git a/torch_npu/utils/_module.py b/torch_npu/utils/_module.py
index fbe408739f..1e575cafa3 100644
--- a/torch_npu/utils/_module.py
+++ b/torch_npu/utils/_module.py
@@ -17,7 +17,6 @@ from torch.nn.modules.batchnorm import _NormBase, _LazyNormBase
 from torch.nn.modules.module import Module
 from torch.nn.parallel._functions import _streams
 from torch.utils.data.dataloader import _MultiProcessingDataLoaderIter
-from torch.utils.data._utils import worker, pin_memory
 from torch._utils import _get_device_index, _get_all_device_indices, _get_available_device_type, ExceptionWrapper
 from torch.nn.parallel.parallel_apply import get_a_var
 from torch.nn.parallel.scatter_gather import gather, scatter_kwargs
@@ -30,8 +29,6 @@ from torch_npu.utils.syncbatchnorm import SyncBatchNorm as sync_batch_norm
 from torch_npu.utils._error_code import ErrCode, pta_error
 
 origin_mpdl_iter_init = _MultiProcessingDataLoaderIter.__init__
-origin_worker_loop = worker._worker_loop
-origin_pin_memory_loop = pin_memory._pin_memory_loop
 
 
 CONV3D_SUPPORT_FP32_SOC_PREFIX = ["Ascend910B", "Ascend910_93"]
@@ -370,17 +367,9 @@ def _mpdl_iter_init(self, *args, **kwargs):
             torch_npu.npu.synchronize()
         except Exception as e:
             print(e)
+    torch_npu._C._npu_reset_threads_affinity()
     origin_mpdl_iter_init(self, *args, **kwargs)
-
-
-def _npu_worker_loop(*args, **kwargs):
-    torch_npu._C._npu_set_thread_affinity(-1, -1)
-    origin_worker_loop(*args, **kwargs)
-
-
-def _npu_pin_memory_loop(*args, **kwargs):
-    torch_npu._C._npu_set_thread_affinity(-1, -1)
-    origin_pin_memory_loop(*args, **kwargs)
+    torch_npu._C._npu_set_threads_affinity()
 
 
 def _parallel_apply(
@@ -530,8 +519,6 @@ def _apply_module_patch():
     torch.nn.Module.cast_weight = cast_weight
     torch.nn.modules.rnn.LSTM.forward = _lstm_forward
    torch.nn.modules.batchnorm.SyncBatchNorm.forward = _syncbn_forward
+    torch.utils.data.dataloader._MultiProcessingDataLoaderIter.__init__ = _mpdl_iter_init
     torch.nn.parallel.DataParallel.parallel_apply = npu_parallel_apply
     torch.nn.parallel.data_parallel = npu_data_parallel
-    torch.utils.data.dataloader._MultiProcessingDataLoaderIter.__init__ = _mpdl_iter_init
-    torch.utils.data._utils.worker._worker_loop = _npu_worker_loop
-    torch.utils.data._utils.pin_memory._pin_memory_loop = _npu_pin_memory_loop
diff --git a/torch_npu/utils/affinity.py b/torch_npu/utils/affinity.py
deleted file mode 100644
index bfa16f45dd..0000000000
--- a/torch_npu/utils/affinity.py
+++ /dev/null
@@ -1,17 +0,0 @@
-__all__ = []
-
-from typing import List
-
-import torch_npu
-from torch_npu.utils._error_code import ErrCode, pta_error
-
-
-def _set_thread_affinity(core_range: List[int] = None):
-    if core_range is None:
-        torch_npu._C._npu_set_thread_affinity(-1, -1)
-    elif (len(core_range) == 2):
-        if core_range[0] < 0 or core_range[1] < 0:
-            raise ValueError("Expected core core_range should be nonnegative." + pta_error(ErrCode.PARAM))
-        torch_npu._C._npu_set_thread_affinity(core_range[0], core_range[1])
-    else:
-        raise ValueError("The length of input list of set_thread_affinity should be 2." + pta_error(ErrCode.PARAM))
\ No newline at end of file
-- 
Gitee


From aff043b6bd00b2807eb309f81fa79e34f9874a29 Mon Sep 17 00:00:00 2001
From: chen_liqing
Date: Tue, 12 Aug 2025 14:53:41 +0800
Subject: [PATCH 6/7] Delete dcmi info

---
 third_party/dcmi/CMakeLists.txt               |   1 -
 third_party/dcmi/inc/dcmi_interface_api.h     |  26 ----
 .../csrc/core/npu/GetAffinityCPUInfo.cpp      | 115 ------------------
 torch_npu/csrc/core/npu/GetAffinityCPUInfo.h  |  10 --
 .../csrc/core/npu/interface/DcmiInterface.cpp |  72 -----------
 .../csrc/core/npu/interface/DcmiInterface.h   |  15 ---
 6 files changed, 239 deletions(-)
 delete mode 100644 third_party/dcmi/CMakeLists.txt
 delete mode 100644 third_party/dcmi/inc/dcmi_interface_api.h
 delete mode 100644 torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp
 delete mode 100644 torch_npu/csrc/core/npu/GetAffinityCPUInfo.h
 delete mode 100644 torch_npu/csrc/core/npu/interface/DcmiInterface.cpp
 delete mode 100644 torch_npu/csrc/core/npu/interface/DcmiInterface.h

diff --git a/third_party/dcmi/CMakeLists.txt b/third_party/dcmi/CMakeLists.txt
deleted file mode 100644
index 5a5b6ee02b..0000000000
--- a/third_party/dcmi/CMakeLists.txt
+++ /dev/null
@@ -1 +0,0 @@
-INSTALL(DIRECTORY inc/ DESTINATION include/third_party/dcmi/inc FILES_MATCHING PATTERN "*.h")
\ No newline at end of file
diff --git a/third_party/dcmi/inc/dcmi_interface_api.h b/third_party/dcmi/inc/dcmi_interface_api.h
deleted file mode 100644
index a55402f30d..0000000000
--- a/third_party/dcmi/inc/dcmi_interface_api.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright: Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
- * Author: huawei
- * Date: 2021-03-17 17:46:08
- * @LastEditors: huawei
- * @LastEditTime: 2022-11-03 11:17:04
- * Description: DCMI API Reference
- */
-
-/***************************************************************************************/
-
-#ifdef __linux
-#define DCMIDLLEXPORT
-#else
-#define DCMIDLLEXPORT _declspec(dllexport)
-#endif
-
-#define TOPO_INFO_MAX_LENTH 32 // topo info max length
-
-DCMIDLLEXPORT int dcmi_init(void);
-
-DCMIDLLEXPORT int dcmi_get_card_num_list(int *card_num, int *card_list, int list_len); // card_num is the number of device.
-
-DCMIDLLEXPORT int dcmi_get_affinity_cpu_info_by_device_id(int card_id, int device_id, char *affinity_cpu, int *length); // card_id is the ID of NPU card.
-
-DCMIDLLEXPORT int dcmi_get_device_id_in_card(int card_id, int *device_id_max, int *mcu_id, int *cpu_id);
diff --git a/torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp b/torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp
deleted file mode 100644
index a88e327560..0000000000
--- a/torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp
+++ /dev/null
@@ -1,115 +0,0 @@
-#include
-#include "torch_npu/csrc/core/npu/interface/DcmiInterface.h"
-#include "torch_npu/csrc/core/npu/NPUException.h"
-#include "torch_npu/csrc/core/npu/NPUAffinityController.h"
-
-constexpr int NPU_OK = 0;
-
-static int DcmiInit()
-{
-    int ret = c10_npu::dcmi::DcmiInit();
-    if (ret != NPU_OK) {
-        TORCH_CHECK(false, "Failed to init dcmi. ", PTA_ERROR(ErrCode::INTERNAL));
-    }
-    return ret;
-}
-
-std::string GetAffinityCPUBaseInfo(int card_id)
-{
-    int ret = DcmiInit();
-    int device_id = 0;
-    int device_id_max = 0;
-    int mcu_id = 0;
-    int cpu_id = 0;
-    ret = c10_npu::dcmi::DcmiGetDeviceIdInCard(card_id, &device_id_max, &mcu_id, &cpu_id);
-    if (ret != NPU_OK) {
-        TORCH_NPU_WARN_ONCE("dcmi_get_device_id_in_card is not supported. "
-            "The npu_affine configuration of CPU_AFFINITY_CONF will be disabled.");
-        return "";
-    }
-    device_id = std::max(0, device_id_max - 1);
-    char affinity_cpu[TOPO_INFO_MAX_LENTH] = {0};
-    int length = 0;
-    ret = c10_npu::dcmi::DcmiGetAffinityCpuInfoByDeviceId(card_id, device_id, affinity_cpu, &length);
-    if (ret == NPU_OK) {
-        return affinity_cpu;
-    }
-    TORCH_NPU_WARN_ONCE("dcmi_get_affinity_cpu_info_by_device_id is not supported. "
-        "The npu_affine configuration of CPU_AFFINITY_CONF will be disabled.");
-    return "";
-}
-
-std::unordered_map CardIdAffinityCPU;
-
-c10_npu::CoreIdRange parseAffinityCPU(const std::string cpuString)
-{
-    size_t pos = cpuString.find("-");
-    if (pos != std::string::npos) {
-        std::string start = cpuString.substr(0, pos);
-        std::string end = cpuString.substr(pos + 1);
-        int startNum = stoi(start);
-        int endNum = stoi(end);
-        if (startNum < endNum) {
-            return c10_npu::CoreIdRange{startNum, endNum};
-        }
-    }
-    TORCH_CHECK(false, "affinity cpu " + cpuString + " is error ", PTA_ERROR(ErrCode::VALUE));
-}
-
-void GetExclusiveAffinityCPU()
-{
-    int ret = DcmiInit();
-    int device_count = 0;
-    int card_id_list[16];
-    int list_len = 16;
-    ret = c10_npu::dcmi::DcmiGetCardNumList(&device_count, card_id_list, list_len);
-    std::unordered_map SameAffinityCpuNum;
-    std::map CardIdAffinityCpuDefault;
-    for (int i = 0; i < device_count; i++) {
-        std::string affinity_cpu = GetAffinityCPUBaseInfo(i);
-        if (affinity_cpu.empty()) {
-            return;
-        }
-        CardIdAffinityCpuDefault[i] = affinity_cpu;
-        auto it = SameAffinityCpuNum.find(affinity_cpu);
-        if (it != SameAffinityCpuNum.end()) {
-            SameAffinityCpuNum[affinity_cpu] = it->second + 1;
-        } else {
-            SameAffinityCpuNum[affinity_cpu] = 1;
-        }
-    }
-    std::unordered_map offsetMap;
-    for (const auto& it : CardIdAffinityCpuDefault) {
-        int card_id = it.first;
-        std::string affinity_cpu = it.second;
-        int same_num = 1;
-        auto find_same_affinity_cpu = SameAffinityCpuNum.find(affinity_cpu);
-        if (find_same_affinity_cpu != SameAffinityCpuNum.end()) {
-            same_num = find_same_affinity_cpu->second;
-        }
-        int offset = 0;
-        auto find_offset = offsetMap.find(affinity_cpu);
-        if (find_offset != offsetMap.end()) {
-            offset = find_offset->second;
-        }
-        c10_npu::CoreIdRange cpu_range = parseAffinityCPU(affinity_cpu);
-        unsigned int length = (cpu_range.end - cpu_range.start + 1) / static_cast(same_num);
-        c10_npu::CoreIdRange exclusiveAffinityCpu = {
-            cpu_range.start + static_cast(offset) * length,
-            (cpu_range.start + length - 1) + static_cast(offset) * length};
-        offsetMap[affinity_cpu] = offset + 1;
-        CardIdAffinityCPU[card_id] = exclusiveAffinityCpu;
-    }
-}
-
-c10_npu::CoreIdRange GetAssignAffinityCPU(int card_id)
-{
-    GetExclusiveAffinityCPU();
-    if (CardIdAffinityCPU.empty()) {
-        return {0, 0};
-    }
-    auto it = CardIdAffinityCPU.find(card_id);
-    if (it != CardIdAffinityCPU.end()) {
-        return it->second;
-    }
-}
diff --git a/torch_npu/csrc/core/npu/GetAffinityCPUInfo.h b/torch_npu/csrc/core/npu/GetAffinityCPUInfo.h
deleted file mode 100644
index a0ce50a201..0000000000
--- a/torch_npu/csrc/core/npu/GetAffinityCPUInfo.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef THNP_GETAFFINITY_INC
-#define THNP_GETAFFINITY_INC
-#include
-
-std::string GetAffinityCPUBaseInfo(int card_id);
-c10_npu::CoreIdRange parseAffinityCPU(const std::string cpuString);
-void GetExclusiveAffinityCPU();
-c10_npu::CoreIdRange GetAssignAffinityCPU(int card_id);
-
-#endif
\ No newline at end of file
diff --git a/torch_npu/csrc/core/npu/interface/DcmiInterface.cpp b/torch_npu/csrc/core/npu/interface/DcmiInterface.cpp
deleted file mode 100644
index ab66dae274..0000000000
--- a/torch_npu/csrc/core/npu/interface/DcmiInterface.cpp
+++ /dev/null
@@ -1,72 +0,0 @@
-#include
-#include "torch_npu/csrc/core/npu/register/FunctionLoader.h"
-#include "torch_npu/csrc/core/npu/NPUException.h"
-#include "torch_npu/csrc/core/npu/interface/DcmiInterface.h"
-
-namespace c10_npu {
-namespace dcmi {
-
-#undef LOAD_FUNCTION
-#define LOAD_FUNCTION(funcName) \
-    REGISTER_FUNCTION(libdcmi, funcName)
-#undef GET_FUNC
-#define GET_FUNC(funcName) \
-    GET_FUNCTION(libdcmi, funcName)
-
-REGISTER_LIBRARY(libdcmi)
-LOAD_FUNCTION(dcmi_get_affinity_cpu_info_by_device_id)
-LOAD_FUNCTION(dcmi_init)
-LOAD_FUNCTION(dcmi_get_device_id_in_card)
-LOAD_FUNCTION(dcmi_get_card_num_list)
-
-int DcmiInit(void)
-{
-    using dcmiInitFunc = int(*)(void);
-    static dcmiInitFunc func = nullptr;
-    func = (dcmiInitFunc)GET_FUNC(dcmi_init);
-    if (func == nullptr) {
-        TORCH_CHECK(false, "Failed to find function dcmi_init, "
-            " maybe your hdk version is too low, please upgrade it.", PTA_ERROR(ErrCode::NOT_FOUND))
-    }
-    return func();
-}
-
-int DcmiGetCardNumList(int *card_num, int *card_list, int list_len)
-{
-    using dcmiGetCardNumListFunc = int(*)(int *, int *, int);
-    static dcmiGetCardNumListFunc func = nullptr;
-    func = (dcmiGetCardNumListFunc)GET_FUNC(dcmi_get_card_num_list);
-    if (func == nullptr) {
-        TORCH_CHECK(false, "Failed to find function dcmi_get_card_num_list, "
-            " maybe your hdk version is too low, please upgrade it.", PTA_ERROR(ErrCode::NOT_FOUND))
-    }
-    return func(card_num, card_list, list_len);
-}
-
-int DcmiGetAffinityCpuInfoByDeviceId(int card_id, int device_id, char *affinity_cpu, int *length)
-{
-    using dcmiGetAffinityCpuInfoByDeviceIdFunc = int(*)(int, int, char *, int *);
-    static dcmiGetAffinityCpuInfoByDeviceIdFunc func = nullptr;
-    func = (dcmiGetAffinityCpuInfoByDeviceIdFunc)GET_FUNC(dcmi_get_affinity_cpu_info_by_device_id);
-    if (func == nullptr) {
-        TORCH_CHECK(false, "Failed to find function dcmi_get_affinity_cpu_info_by_device_id, "
-            " maybe your hdk version is too low, please upgrade it", PTA_ERROR(ErrCode::NOT_FOUND));
-    }
-    return func(card_id, device_id, affinity_cpu, length);
-}
-
-int DcmiGetDeviceIdInCard(int card_id, int *device_id_max, int *mcu_id, int *cpu_id)
-{
-    using dcmiGetDeviceIdInCardFunc = int(*)(int, int *, int *, int *);
-    static dcmiGetDeviceIdInCardFunc func = nullptr;
-    func = (dcmiGetDeviceIdInCardFunc)GET_FUNC(dcmi_get_device_id_in_card);
-    if (func == nullptr) {
-        TORCH_CHECK(false, "Failed to find function dcmi_get_device_id_in_card, "
-            " maybe your hdk version is too low, please upgrade it", PTA_ERROR(ErrCode::NOT_FOUND))
-    }
-    return func(card_id, device_id_max, mcu_id, cpu_id);
-}
-
-}
-
-}
\ No newline at end of file
diff --git a/torch_npu/csrc/core/npu/interface/DcmiInterface.h b/torch_npu/csrc/core/npu/interface/DcmiInterface.h
deleted file mode 100644
index 0388e32d32..0000000000
--- a/torch_npu/csrc/core/npu/interface/DcmiInterface.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#pragma once
-
-#include "third_party/dcmi/inc/dcmi_interface_api.h"
-
-namespace c10_npu {
-namespace dcmi {
-
-int DcmiInit(void);
-int DcmiGetCardNumList(int *card_num, int *card_list, int list_len);
-int DcmiGetAffinityCpuInfoByDeviceId(int card_id, int device_id, char *affinity_cpu, int *length);
-int DcmiGetDeviceIdInCard(int card_id, int *device_id_max, int *mcu_id, int *cpu_id);
-
-}
-
-}
\ No newline at end of file
-- 
Gitee


From d8b28eb8aa478d82aff3a65e9bbfcb970db57dfe Mon Sep 17 00:00:00 2001
From: chen_liqing
Date: Tue, 12 Aug 2025 15:15:12 +0800
Subject: [PATCH 7/7] Add debug info

---
 torch_npu/csrc/core/npu/NPUAffinityController.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.cpp b/torch_npu/csrc/core/npu/NPUAffinityController.cpp
index cb7d7a928e..d1a79ae2dc 100644
--- a/torch_npu/csrc/core/npu/NPUAffinityController.cpp
+++ b/torch_npu/csrc/core/npu/NPUAffinityController.cpp
@@ -257,6 +257,8 @@ namespace c10_npu {
 
     // bind_conf=1, bind cores averagely based on device_id
     if (bind_conf == 1) {
+        ASCEND_LOGD("Device %d set %s affinity to %d-%d success.",
+            device_id, threadTypeToNameMap.at(current_thread_type).c_str(), ranges[device_id].start, ranges[device_id].end);
         return SetThreadAffinity(ranges[device_id], pthread_self());
    } else if (bind_conf == 2) {
         auto thread_core_map = GetCpuAffinityMap(device_id);
-- 
Gitee
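
Note on verifying the debug output added in PATCH 7/7: the new ASCEND_LOGD line only reports the core range the thread was asked to bind to; the effective mask can be cross-checked from inside the process with the standard glibc affinity API. The snippet below is a minimal standalone sketch, not part of this patch set, and assumes a Linux/glibc build where pthread_getaffinity_np is available. It prints the first and last CPU currently allowed for the calling thread, which for a contiguous binding should match the "start-end" range in the log.

    // Standalone sketch for cross-checking thread affinity (not part of the patches).
    // Build with: g++ -pthread check_affinity.cpp -o check_affinity
    #ifndef _GNU_SOURCE
    #define _GNU_SOURCE
    #endif
    #include <pthread.h>
    #include <sched.h>
    #include <cstdio>

    int main()
    {
        cpu_set_t mask;
        CPU_ZERO(&mask);
        // Query the affinity mask of the calling thread.
        if (pthread_getaffinity_np(pthread_self(), sizeof(mask), &mask) != 0) {
            std::fprintf(stderr, "pthread_getaffinity_np failed\n");
            return 1;
        }
        int first = -1;
        int last = -1;
        for (int cpu = 0; cpu < CPU_SETSIZE; ++cpu) {
            if (CPU_ISSET(cpu, &mask)) {
                if (first < 0) {
                    first = cpu;
                }
                last = cpu;
            }
        }
        // For a contiguous binding this matches the range printed by ASCEND_LOGD.
        std::printf("calling thread is allowed on CPUs %d-%d\n", first, last);
        return 0;
    }

The same loop can be dropped into a worker or ACL callback thread to compare the live mask against what the log line claims after SetThreadAffinity runs.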