From c3a1c9651668f8f33006aa01f9e3f1a4108722cf Mon Sep 17 00:00:00 2001
From: wangyu0516
Date: Wed, 6 Apr 2022 10:40:52 +0800
Subject: [PATCH] release resource before shutdown

---
 torch_npu/__init__.py                           | 5 +++++
 torch_npu/csrc/distributed/Init.cpp             | 7 ++++++-
 torch_npu/csrc/distributed/ProcessGroupHCCL.cpp | 6 ++++++
 torch_npu/csrc/distributed/ProcessGroupHCCL.hpp | 2 ++
 torch_npu/distributed/distributed_c10d.py       | 6 +++++-
 5 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py
index 64554f1081..63da6c4dcd 100644
--- a/torch_npu/__init__.py
+++ b/torch_npu/__init__.py
@@ -98,6 +98,11 @@ apply_class_patches()
 
 
 # NPU exit, need to synchronize devices
 def _npu_shutdown():
+    if torch.npu.is_available() and \
+        torch.npu.is_initialized() and \
+        torch.distributed.is_available() and \
+        torch.distributed.is_initialized():
+        torch.distributed.release_process_group()
     torch_npu._C._npu_shutdown()
 
diff --git a/torch_npu/csrc/distributed/Init.cpp b/torch_npu/csrc/distributed/Init.cpp
index e8717648c5..2f16f54fc9 100644
--- a/torch_npu/csrc/distributed/Init.cpp
+++ b/torch_npu/csrc/distributed/Init.cpp
@@ -299,7 +299,12 @@ PyObject* c10d_init(PyObject* _unused, PyObject* noargs) {
           py::arg("rank"),
           py::arg("size"),
           py::arg("timeout") = std::chrono::milliseconds(
-              ::c10d_npu::ProcessGroupHCCL::kProcessGroupHCCLOpTimeoutMillis));
+              ::c10d_npu::ProcessGroupHCCL::kProcessGroupHCCLOpTimeoutMillis))
+      .def("release_resource",
+          [](::c10d_npu::ProcessGroupHCCL& pg) {
+            pg.release_resource();
+          },
+          py::call_guard<py::gil_scoped_release>());
 
   intrusive_ptr_class_<::c10d_npu::ProcessGroupHCCL::Options>(
       processGroupHCCL, "Options")
diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
index 31d394afcc..a4ae31b4f5 100644
--- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
+++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
@@ -723,4 +723,10 @@ c10::intrusive_ptr<c10d::ProcessGroup::Work> ProcessGroupHCCL::recvAnysource(
     int /* unused */) {
   throw std::runtime_error("ProcessGroupHCCL does not support recv");
 }
+
+void ProcessGroupHCCL::release_resource() {
+  c10::npu::npuSynchronizeDevice();
+  this->hcclEvents_.clear();
+  this->devHCCLCommMap_.clear();
+}
 } // namespace c10d_npu
diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp
index 13b1da5eb7..720dcfba91 100644
--- a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp
+++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp
@@ -248,6 +248,8 @@ public:
 
   static const int64_t kProcessGroupHCCLOpTimeoutMillis;
 
+  void release_resource();
+
 protected:
   // Helper that broadcasts HCCL Master ID to all ranks through the store
   void broadcastMasterID(HcclRootInfo* hcclID);
diff --git a/torch_npu/distributed/distributed_c10d.py b/torch_npu/distributed/distributed_c10d.py
index 7eb05691fa..7b7d713dbe 100644
--- a/torch_npu/distributed/distributed_c10d.py
+++ b/torch_npu/distributed/distributed_c10d.py
@@ -72,7 +72,7 @@ __all__ = [
     "isend", "irecv", "send", "recv", "P2POp", "batch_isend_irecv", "broadcast", "all_reduce",
     "all_reduce_coalesced", "reduce", "all_gather", "all_gather_coalesced", "gather", "scatter",
     "reduce_scatter", "all_to_all_single", "all_to_all", "barrier", "new_group", "ProcessGroupHCCL",
-    "_get_default_group"
+    "_get_default_group", "release_process_group"
 ]
 
 # Some reduce ops are not supported by complex numbers and will result in an error.
@@ -1827,3 +1827,7 @@ def new_group(ranks=None, timeout=default_pg_timeout, backend=None):
         _store_based_barrier(global_rank, default_store, timeout)
 
     return pg
+
+def release_process_group():
+    if _default_pg is not None and is_hccl_available():
+        _default_pg.release_resource()
-- 
Gitee
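
Usage sketch (not part of the patch): with this change applied, HCCL communicators and cached events of the default process group are released inside _npu_shutdown() before torch_npu._C._npu_shutdown() runs, and release_process_group() is also exposed for explicit use. The single-rank script below is only an illustration; the device index, rendezvous address/port, and tensor shape are placeholder assumptions, and it presumes an NPU host where the HCCL backend is available.

    import os

    import torch
    import torch_npu  # registers the NPU device and the HCCL backend

    # Placeholder rendezvous settings for a single-process illustration.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")

    torch.npu.set_device(0)
    torch.distributed.init_process_group("hccl", rank=0, world_size=1)

    x = torch.ones(4).npu()
    torch.distributed.all_reduce(x)  # trivial collective on a single rank

    # Explicitly drop HCCL communicators and cached events while the NPU
    # runtime is still alive; with this patch, _npu_shutdown() would
    # otherwise perform the same release automatically at process exit.
    torch.distributed.release_process_group()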