From c3a1c9651668f8f33006aa01f9e3f1a4108722cf Mon Sep 17 00:00:00 2001
From: wangyu0516
Date: Wed, 6 Apr 2022 10:40:52 +0800
Subject: [PATCH] release resource before shutdown

---
 torch_npu/__init__.py                           | 5 +++++
 torch_npu/csrc/distributed/Init.cpp             | 7 ++++++-
 torch_npu/csrc/distributed/ProcessGroupHCCL.cpp | 6 ++++++
 torch_npu/csrc/distributed/ProcessGroupHCCL.hpp | 2 ++
 torch_npu/distributed/distributed_c10d.py       | 6 +++++-
 5 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py
index 64554f1081..63da6c4dcd 100644
--- a/torch_npu/__init__.py
+++ b/torch_npu/__init__.py
@@ -98,6 +98,11 @@ apply_class_patches()
 
 
 # NPU exit, need to synchronize devices
 def _npu_shutdown():
+    if torch.npu.is_available() and \
+        torch.npu.is_initialized() and \
+        torch.distributed.is_available() and \
+        torch.distributed.is_initialized():
+        torch.distributed.release_process_group()
     torch_npu._C._npu_shutdown()
 
diff --git a/torch_npu/csrc/distributed/Init.cpp b/torch_npu/csrc/distributed/Init.cpp
index e8717648c5..2f16f54fc9 100644
--- a/torch_npu/csrc/distributed/Init.cpp
+++ b/torch_npu/csrc/distributed/Init.cpp
@@ -299,7 +299,12 @@ PyObject* c10d_init(PyObject* _unused, PyObject* noargs) {
           py::arg("rank"),
           py::arg("size"),
           py::arg("timeout") = std::chrono::milliseconds(
-              ::c10d_npu::ProcessGroupHCCL::kProcessGroupHCCLOpTimeoutMillis));
+              ::c10d_npu::ProcessGroupHCCL::kProcessGroupHCCLOpTimeoutMillis))
+      .def("release_resource",
+          [](::c10d_npu::ProcessGroupHCCL& pg) {
+            pg.release_resource();
+          },
+          py::call_guard<py::gil_scoped_release>());
 
   intrusive_ptr_class_<::c10d_npu::ProcessGroupHCCL::Options>(
       processGroupHCCL, "Options")
diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
index 31d394afcc..a4ae31b4f5 100644
--- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
+++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
@@ -723,4 +723,10 @@ c10::intrusive_ptr<c10d::ProcessGroup::Work> ProcessGroupHCCL::recvAnysource(
     int /* unused */) {
   throw std::runtime_error("ProcessGroupHCCL does not support recv");
 }
+
+void ProcessGroupHCCL::release_resource() {
+  c10::npu::npuSynchronizeDevice();
+  this->hcclEvents_.clear();
+  this->devHCCLCommMap_.clear();
+}
 } // namespace c10d_npu
diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp
index 13b1da5eb7..720dcfba91 100644
--- a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp
+++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp
@@ -248,6 +248,8 @@ public:
 
   static const int64_t kProcessGroupHCCLOpTimeoutMillis;
 
+  void release_resource();
+
 protected:
   // Helper that broadcasts HCCL Master ID to all ranks through the store
   void broadcastMasterID(HcclRootInfo* hcclID);
diff --git a/torch_npu/distributed/distributed_c10d.py b/torch_npu/distributed/distributed_c10d.py
index 7eb05691fa..7b7d713dbe 100644
--- a/torch_npu/distributed/distributed_c10d.py
+++ b/torch_npu/distributed/distributed_c10d.py
@@ -72,7 +72,7 @@ __all__ = [
     "isend", "irecv", "send", "recv", "P2POp", "batch_isend_irecv", "broadcast", "all_reduce",
     "all_reduce_coalesced", "reduce", "all_gather", "all_gather_coalesced", "gather", "scatter",
     "reduce_scatter", "all_to_all_single", "all_to_all", "barrier", "new_group", "ProcessGroupHCCL",
-    "_get_default_group"
+    "_get_default_group", "release_process_group"
 ]
 
 # Some reduce ops are not supported by complex numbers and will result in an error.
@@ -1827,3 +1827,7 @@ def new_group(ranks=None, timeout=default_pg_timeout, backend=None):
         _store_based_barrier(global_rank, default_store, timeout)
 
     return pg
+
+def release_process_group():
+    if _default_pg is not None and is_hccl_available():
+        _default_pg.release_resource()
-- 
Gitee
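
Usage sketch (not part of the patch): with this change applied, HCCL communicators and cached events of the default process group are released inside _npu_shutdown() before torch_npu._C._npu_shutdown() runs, and release_process_group() is also exposed for explicit use. The single-rank script below is only an illustration; the device index, rendezvous address/port, and tensor shape are placeholder assumptions, and it presumes an NPU host where the HCCL backend is available.

    import os

    import torch
    import torch_npu  # registers the NPU device and the HCCL backend

    # Placeholder rendezvous settings for a single-process illustration.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")

    torch.npu.set_device(0)
    torch.distributed.init_process_group("hccl", rank=0, world_size=1)

    x = torch.ones(4).npu()
    torch.distributed.all_reduce(x)  # trivial collective on a single rank

    # Explicitly drop HCCL communicators and cached events while the NPU
    # runtime is still alive; with this patch, _npu_shutdown() would
    # otherwise perform the same release automatically at process exit.
    torch.distributed.release_process_group()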