diff --git a/third_party/acl/inc/acl/acl_rt.h b/third_party/acl/inc/acl/acl_rt.h index 6d97252ba45482cc4592ff707264a74fab44f67f..d8e24edf23fcd3b52beeebd701c5e50c1b8a9a8a 100755 --- a/third_party/acl/inc/acl/acl_rt.h +++ b/third_party/acl/inc/acl/acl_rt.h @@ -30,6 +30,13 @@ extern "C" { #define ACL_CONTINUE_ON_FAILURE 0x00000000u #define ACL_STOP_ON_FAILURE 0x00000001u +#define ACL_RT_IPC_MEM_FLAG_DEFAULT 0x0UL +#define ACL_RT_IPC_MEM_FLAG_DISABLE_PID_VALIDATION 0x1UL +#define ACL_RT_IPC_MEM_FLAG_ENABLE_PEER_ACCESS 0x2UL + +#define ACL_RT_VMM_FLAG_DEFAULT 0x0UL +#define ACL_RT_VMM_FLAG_DISABLE_PID_VALIDATION 0x1UL + #define MAX_MEM_UCE_INFO_ARRAY_SIZE 128 constexpr int32_t DEVICE_UTILIZATION_NOT_SUPPORT = -1; diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index b64552b804c45fa761ab7b1fc25357b611853db0..fab09b77fc6dbfbce2f0e6e2793c75aa409d39b5 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -24,7 +24,6 @@ #include "torch_npu/csrc/core/npu/GetCANNInfo.h" #include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" #include "torch_npu/csrc/core/npu/NPUEvent.h" -#include "torch_npu/csrc/core/npu/NPUIPCPidManager.h" #include "torch_npu/csrc/profiler/npu_profiler.h" #ifndef BUILD_LIBTORCH #include "torch_npu/csrc/sanitizer/NPUTrace.h" @@ -477,10 +476,7 @@ struct ExpandableSegment { if (!handle.shareableHandle) { uint64_t shareableHandle = 0; NPU_CHECK_ERROR(c10_npu::acl::AclrtMemExportToShareableHandle( - handle.handle, ACL_MEM_HANDLE_TYPE_NONE, 0, &shareableHandle)); - int32_t* pids = nullptr; - int pid_num = torch_npu::ipc::getPids(&pids); - NPU_CHECK_ERROR(c10_npu::acl::AclrtMemSetPidToShareableHandle(shareableHandle, pids, pid_num)); + handle.handle, ACL_MEM_HANDLE_TYPE_NONE, ACL_RT_VMM_FLAG_DISABLE_PID_VALIDATION, &shareableHandle)); handle.shareableHandle = shareableHandle; } uint64_t shandle = *handle.shareableHandle; @@ -572,9 +568,7 @@ private: // cannot call c10::npu::stream_synchronize because // it might grab the GIL which can lead to a deadlock // Locking order must be GIL -> Allocator Lock - if (stream_) { - NPU_CHECK_ERROR(aclrtSynchronizeStream(*stream_)); - } else { + { c10_npu::NPUGuard device_guard(device_); c10_npu::npuSynchronizeDevice(true); } @@ -1673,10 +1667,7 @@ public: auto it = ipc_handle_map.find(base_ptr); if (it == ipc_handle_map.end()) { NPU_CHECK_ERROR(c10_npu::acl::AclrtIpcMemGetExportKey( - base_ptr, base_size, handle.data, ACL_IPC_HANDLE_SIZE, 0)); - int32_t* pids = nullptr; - size_t pid_num = torch_npu::ipc::getPids(&pids); - NPU_CHECK_ERROR(c10_npu::acl::AclrtIpcMemSetImportPid(handle.data, pids, pid_num)); + base_ptr, base_size, handle.data, ACL_IPC_HANDLE_SIZE, ACL_RT_IPC_MEM_FLAG_DISABLE_PID_VALIDATION)); ipc_handle_map[base_ptr] = handle; } else { handle = it->second; @@ -3327,15 +3318,6 @@ public: ASCEND_LOGD("End empty cache with check_error = %d", check_error); } - void clearIpcHandles() override - { - std::lock_guard lock(ipcHandleMutex); - for (auto &handle : ipcHandles) { - NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(handle)); - } - ipcHandles.clear(); - } - void *getBaseAllocation(void *ptr, size_t *outSize) override { Block *block = get_allocated_block(ptr); @@ -3628,7 +3610,8 @@ public: if (type == SHAREABLE_NPU_MALLOC) { handle_str handle_r; ss.read(handle_r.data, ACL_IPC_HANDLE_SIZE); - NPU_CHECK_ERROR(c10_npu::acl::AclrtIpcMemImportByKey(&npu_ipc_ptr_, handle_r.data, 0)); + NPU_CHECK_ERROR(c10_npu::acl::AclrtIpcMemImportByKey( + &npu_ipc_ptr_, handle_r.data, ACL_RT_IPC_MEM_FLAG_ENABLE_PEER_ACCESS)); handle_s.assign(handle_r.data, ACL_IPC_HANDLE_SIZE); } else if (type == SHAREABLE_NPU_EXPANDABLE_SEGMENT) { expandable_segment_ = @@ -3648,7 +3631,10 @@ public: void clear() { if (npu_ipc_ptr_) { - c10_npu::NPUGuard device_guard(device_); + { + c10_npu::NPUGuard device_guard(device_); + c10_npu::npuSynchronizeDevice(true); + } NPU_CHECK_ERROR(c10_npu::acl::AclrtIpcMemClose(handle_s.c_str())); npu_ipc_ptr_ = nullptr; } diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.h b/torch_npu/csrc/core/npu/NPUCachingAllocator.h index 30a42035ec1914d44e408e23520e3b593b2b37b6..a5368b705da1cf59e9ed40532283ad66872fec88 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.h +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.h @@ -203,7 +203,6 @@ public: virtual bool initialized() = 0; virtual void setMemoryFraction(double fraction, int device) = 0; virtual void emptyCache(bool check_error) = 0; - virtual void clearIpcHandles() = 0; virtual void cacheInfo(int dev_id, size_t* cachedAndFree, size_t* largestBlock) = 0; virtual void* getBaseAllocation(void* ptr, size_t* size) = 0; virtual void recordStream(const c10::DataPtr& ptr, c10_npu::NPUStream stream) = 0; @@ -311,11 +310,6 @@ C10_NPU_API inline void emptyCache(bool check_error = true) return get()->emptyCache(check_error); } -inline void clearIpcHandles() -{ - return get()->clearIpcHandles(); -} - inline void cacheInfo(int dev_id, size_t* cachedAndFree, size_t* largestBlock) { return get()->cacheInfo(dev_id, cachedAndFree, largestBlock); diff --git a/torch_npu/csrc/core/npu/NPUIPCPidManager.cpp b/torch_npu/csrc/core/npu/NPUIPCPidManager.cpp deleted file mode 100644 index 393b4706c60decfb6171dfb50d8670d92f74b102..0000000000000000000000000000000000000000 --- a/torch_npu/csrc/core/npu/NPUIPCPidManager.cpp +++ /dev/null @@ -1,36 +0,0 @@ -#include "torch_npu/csrc/core/npu/NPUIPCPidManager.h" -namespace torch_npu { -namespace ipc { - -int32_t* pids = nullptr; -size_t pid_num = 0; -size_t capacity = 0; - -void addPid(int pid) -{ - const size_t requiredCapacity = pid_num + 1; - - if (requiredCapacity > capacity) { - size_t newCapacity = capacity + 10; - - int32_t* newArray = new int32_t[newCapacity]; - for (int i = 0; i < pid_num; ++i) { - newArray[i] = pids[i]; - } - - delete[] pids; - pids = newArray; - capacity = newCapacity; - } - - pids[pid_num++] = static_cast(pid); -} - -size_t getPids(int32_t** ret_pids) -{ - *ret_pids = pids; - return pid_num; -} - -} // namespace ipc -} // namespace torch_npu \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUIPCPidManager.h b/torch_npu/csrc/core/npu/NPUIPCPidManager.h deleted file mode 100644 index f27cd240d15723f743fbcefe7204c81588ca60b3..0000000000000000000000000000000000000000 --- a/torch_npu/csrc/core/npu/NPUIPCPidManager.h +++ /dev/null @@ -1,12 +0,0 @@ -#pragma once -#include -#include - -namespace torch_npu { -namespace ipc { - -void addPid(int pid); -size_t getPids(int32_t** pids); - -} // namespace ipc -} // namespace torch_npu \ No newline at end of file diff --git a/torch_npu/csrc/ipc/StorageSharing.cpp b/torch_npu/csrc/ipc/StorageSharing.cpp index 18fdd4c5e0722bcde2133239e3ccf9c0f9ad6ba0..1de9d10dee47eab1625c89cae11b133442eb3c68 100644 --- a/torch_npu/csrc/ipc/StorageSharing.cpp +++ b/torch_npu/csrc/ipc/StorageSharing.cpp @@ -47,6 +47,7 @@ static PyObject* THNPStorage_shareNpu(PyObject* self, PyObject* args) } at::DeviceGuard device_guard(storage.device()); + c10_npu::LazySetDevice(storage.device().index()); THPObjectPtr tuple(PyTuple_New(8)); THPObjectPtr device(THPUtils_packInt32(storage.device().index())); THPObjectPtr _handle(Py_None); @@ -193,6 +194,7 @@ static PyObject* THNPStorage_newSharedNpu(PyObject* _unused, PyObject* args) const auto device = c10::checked_convert( THPUtils_unpackLong(_device), "c10::DeviceIndex"); c10_npu::NPUGuard device_guard(device); + c10_npu::LazySetDevice(device); if (PyObject_IsTrue(_event_sync_required)) { // TO BE DONE diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index 9217bc30bcd5d2885926edaa15ffcc4619aceac8..d8ce4805e03596d3c67b1dd69af34903d94dd631 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -28,7 +28,6 @@ #include "torch_npu/csrc/core/npu/NPUQueue.h" #include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "torch_npu/csrc/core/npu/NPUPeerToPeerAccess.h" -#include "torch_npu/csrc/core/npu/NPUIPCPidManager.h" #include "torch_npu/csrc/core/npu/NPUGuard.h" #include "torch_npu/csrc/core/npu/NpuVariables.h" #include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" @@ -1740,43 +1739,6 @@ static PyObject* THNPModule_is_gte_cann_version(PyObject* self, PyObject *args) END_HANDLE_TH_ERRORS } -static PyObject* THNPModule_add_ipc_pid(PyObject* self, PyObject *args) -{ - HANDLE_TH_ERRORS - int pid; - if (!PyArg_ParseTuple(args, "i", &pid)) { - throw torch::TypeError("Pybind failed to parse parameters." + PTA_ERROR(ErrCode::TYPE)); - } - torch_npu::ipc::addPid(pid); - - Py_RETURN_NONE; - END_HANDLE_TH_ERRORS -} - -static PyObject* THNPModule_get_ipc_pid(PyObject* self, PyObject *noargs) -{ - HANDLE_TH_ERRORS - int32_t pid; - NPU_CHECK_ERROR(c10_npu::acl::AclrtDeviceGetBareTgid(&pid)); - return THPUtils_packInt32(pid); - END_HANDLE_TH_ERRORS -} - -static PyObject* THNPModule_add_p2p_access(PyObject* self, PyObject *args) -{ - HANDLE_TH_ERRORS - int src_dev; - int dst_dev; - if (!PyArg_ParseTuple(args, "ii", &src_dev, &dst_dev)) { - throw torch::TypeError("Pybind failed to parse parameters." + PTA_ERROR(ErrCode::TYPE)); - } - bool warning_flag = false; - at_npu::native::NpuP2pCtrl::get_instance().get_p2p_access(src_dev, dst_dev, warning_flag); - - Py_RETURN_NONE; - END_HANDLE_TH_ERRORS -} - static PyObject* THNPModule_set_device_res_limit(PyObject* self, PyObject *args) { HANDLE_TH_ERRORS @@ -1884,9 +1846,6 @@ static struct PyMethodDef THNPModule_methods[] = { {"_npu_clear_fft_plan_cache", (PyCFunction)THNPModule_npu_clear_fft_plan_cache, METH_NOARGS, nullptr}, {"_get_cann_version", (PyCFunction)THNPModule_get_cann_version, METH_O, nullptr}, {"_is_gte_cann_version", (PyCFunction)THNPModule_is_gte_cann_version, METH_VARARGS, nullptr}, - {"_add_ipc_pid", (PyCFunction)THNPModule_add_ipc_pid, METH_VARARGS, nullptr}, - {"_get_ipc_pid", (PyCFunction)THNPModule_get_ipc_pid, METH_NOARGS, nullptr}, - {"_add_p2p_access", (PyCFunction)THNPModule_add_p2p_access, METH_VARARGS, nullptr}, {"_npu_get_device_res_limit", (PyCFunction)THNPModule_get_device_res_limit, METH_VARARGS, nullptr}, {"_npu_set_device_res_limit", (PyCFunction)THNPModule_set_device_res_limit, METH_VARARGS, nullptr}, {"_npu_reset_device_res_limit", (PyCFunction)THNPModule_reset_device_res_limit, METH_O, nullptr}, diff --git a/torch_npu/csrc/npu/NPUPluggableAllocator.cpp b/torch_npu/csrc/npu/NPUPluggableAllocator.cpp index 7610374a3ba35297c97eac5d17dbd11cc3bba0b9..14ea0ce7e73dbe0b18c255b8678c3a23ad44c5bc 100644 --- a/torch_npu/csrc/npu/NPUPluggableAllocator.cpp +++ b/torch_npu/csrc/npu/NPUPluggableAllocator.cpp @@ -189,12 +189,6 @@ void NPUPluggableAllocator::emptyCache(bool check_error) } } -void NPUPluggableAllocator::clearIpcHandles() -{ - TORCH_NPU_WARN("NPUPluggableAllocator does not yet support clearIpcHandles. " - "If you need it, please file an issue describing your use case."); -} - void NPUPluggableAllocator::cacheInfo(int dev_id, size_t* cachedAndFree, size_t* largestBlock) { TORCH_NPU_WARN("NPUPluggableAllocator does not yet support cacheInfo. " diff --git a/torch_npu/csrc/npu/NPUPluggableAllocator.h b/torch_npu/csrc/npu/NPUPluggableAllocator.h index 266db02a604c906f0e5a4abf6e07d0f407504613..a3691d48eefbaf3743f5ce29a304a0dab3560151 100644 --- a/torch_npu/csrc/npu/NPUPluggableAllocator.h +++ b/torch_npu/csrc/npu/NPUPluggableAllocator.h @@ -60,7 +60,6 @@ struct NPUPluggableAllocator bool initialized() override; void setMemoryFraction(double fraction, int device) override; void emptyCache(bool check_error) override; - void clearIpcHandles() override; void cacheInfo(int dev_id, size_t* cachedAndFree, size_t* largestBlock) override; void* getBaseAllocation(void* ptr, size_t* size) override; void recordStream(const c10::DataPtr&, streamType stream) override;