From 7dc5cedb31b6465ef3428c9af9488e6be3e83e47 Mon Sep 17 00:00:00 2001
From: wangyunhang
Date: Sun, 4 Feb 2024 17:08:53 +0800
Subject: [PATCH] Provide interfaces such as getDeviceStats and resetPeakStats for NPUPluggableAllocator

---
 .../test_pluggable_allocator_extensions.py   | 43 ++++++++++++++++++-
 .../pluggable_allocator_extensions.cpp       | 20 +++++++++
 torch_npu/csrc/npu/Module.cpp                | 27 ++++++++++++
 torch_npu/csrc/npu/NPUPluggableAllocator.cpp | 39 ++++++++++++++---
 torch_npu/csrc/npu/NPUPluggableAllocator.h   |  6 +++
 5 files changed, 127 insertions(+), 8 deletions(-)

diff --git a/test/allocator/test_pluggable_allocator_extensions.py b/test/allocator/test_pluggable_allocator_extensions.py
index 99cc499a93c..7f6ef658cd3 100644
--- a/test/allocator/test_pluggable_allocator_extensions.py
+++ b/test/allocator/test_pluggable_allocator_extensions.py
@@ -2,6 +2,7 @@
 import os
 import sys
 import shutil
 import subprocess
+import ctypes
 import torch
 import torch.utils.cpp_extension
@@ -27,6 +28,7 @@ def build_stub(base_dir):
 
 class TestPluggableAllocator(TestCase):
     module = None
+    new_alloc = None
     build_directory = "allocator/build"
 
     @classmethod
@@ -59,9 +61,9 @@ class TestPluggableAllocator(TestCase):
     def test_pluggable_allocator(self):
         os_path = os.path.join(TestPluggableAllocator.build_directory, 'pluggable_allocator_extensions.so')
         # Load the allocator
-        new_alloc = torch_npu.npu.memory.NPUPluggableAllocator(os_path, 'my_malloc', 'my_free')
+        TestPluggableAllocator.new_alloc = torch_npu.npu.memory.NPUPluggableAllocator(os_path, 'my_malloc', 'my_free')
         # Swap the current allocator
-        torch_npu.npu.memory.change_current_allocator(new_alloc)
+        torch_npu.npu.memory.change_current_allocator(TestPluggableAllocator.new_alloc)
         # This will allocate memory in the device using the new allocator
         self.assertFalse(self.module.check_custom_allocator_used())
         npu_tensor = torch.zeros(10, device='npu')
@@ -69,6 +71,43 @@ class TestPluggableAllocator(TestCase):
         self.assertRtolEqual(npu_tensor.cpu().numpy(), cpu_tensor.numpy())
         self.assertTrue(self.module.check_custom_allocator_used())
 
+    def test_set_get_device_stats_fn(self):
+        os_path = os.path.join(TestPluggableAllocator.build_directory, 'pluggable_allocator_extensions.so')
+        myallocator = ctypes.CDLL(os_path)
+        get_device_stats_fn = ctypes.cast(getattr(myallocator, "my_get_device_stats"), ctypes.c_void_p).value
+
+        msg = "get_device_stats_fn_ is not defined, please set it via set_get_device_stats_fn"
+        with self.assertRaisesRegex(RuntimeError, msg):
+            torch.npu.memory_stats_as_nested_dict()
+
+        TestPluggableAllocator.new_alloc.allocator().set_get_device_stats_fn(get_device_stats_fn)
+        self.assertEqual(torch.npu.memory_stats_as_nested_dict()["num_alloc_retries"], 0)
+
+    def test_set_reset_peak_status_fn(self):
+        os_path = os.path.join(TestPluggableAllocator.build_directory, 'pluggable_allocator_extensions.so')
+        myallocator = ctypes.CDLL(os_path)
+        reset_peak_status_fn = ctypes.cast(getattr(myallocator, "my_reset_peak_status"), ctypes.c_void_p).value
+
+        msg = "reset_peak_status_fn_ is not defined, please set it via set_reset_peak_status_fn"
+        with self.assertRaisesRegex(RuntimeError, msg):
+            torch.npu.reset_peak_memory_stats()
+
+        TestPluggableAllocator.new_alloc.allocator().set_reset_peak_status_fn(reset_peak_status_fn)
+        torch.npu.reset_peak_memory_stats()
+        self.assertEqual(torch.npu.max_memory_allocated(), 0)
+
+    def test_set_snapshot_fn(self):
+        os_path = os.path.join(TestPluggableAllocator.build_directory, 'pluggable_allocator_extensions.so')
+        myallocator = ctypes.CDLL(os_path)
+        snapshot_fn = ctypes.cast(getattr(myallocator, "my_snapshot"), ctypes.c_void_p).value
+
+        msg = "snapshot_fn_ is not defined, please set it via set_snapshot_fn"
+        with self.assertRaisesRegex(RuntimeError, msg):
+            torch.npu.memory_snapshot()
+
+        TestPluggableAllocator.new_alloc.allocator().set_snapshot_fn(snapshot_fn)
+        self.assertEqual(torch.npu.memory_snapshot(), [])
+
     def test_pluggable_allocator_after_init(self):
         os_path = os.path.join(TestPluggableAllocator.build_directory, 'pluggable_allocator_extensions.so')
         # Do an initial memory allocator
diff --git a/test/cpp_extensions/pluggable_allocator_extensions.cpp b/test/cpp_extensions/pluggable_allocator_extensions.cpp
index 3ed2606b021..2abb3241efe 100644
--- a/test/cpp_extensions/pluggable_allocator_extensions.cpp
+++ b/test/cpp_extensions/pluggable_allocator_extensions.cpp
@@ -4,8 +4,11 @@
 
 #include "third_party/acl/inc/acl/acl_base.h"
 #include "third_party/acl/inc/acl/acl_rt.h"
+#include "torch_npu/csrc/core/npu/NPUCachingAllocator.h"
 
 extern "C" {
+using c10_npu::NPUCachingAllocator::DeviceStats;
+using c10_npu::NPUCachingAllocator::SegmentInfo;
 static bool useflag = false;
 
 void* my_malloc(ssize_t size, int device, aclrtStream stream)
@@ -27,6 +30,23 @@ bool check_custom_allocator_used()
 {
     return useflag;
 }
+
+DeviceStats my_get_device_stats(int device)
+{
+    DeviceStats stats;
+    return stats;
+}
+
+void my_reset_peak_status(int device)
+{
+    std::cout << "resetPeakStatus success!" << std::endl;
+}
+
+std::vector<SegmentInfo> my_snapshot()
+{
+    std::vector<SegmentInfo> result;
+    return result;
+}
 }
 
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp
index 9acd640b7b2..6a82962f6ba 100644
--- a/torch_npu/csrc/npu/Module.cpp
+++ b/torch_npu/csrc/npu/Module.cpp
@@ -189,6 +189,33 @@ void RegisterNpuPluggableAllocator(PyObject* module)
                 std::function<FuncType> func =
                     reinterpret_cast<FuncType*>(func_ptr);
                 self.set_erase_stream_fn(func);
+            })
+        .def(
+            "set_get_device_stats_fn",
+            [](torch::npu::NPUPluggableAllocator::NPUPluggableAllocator& self,
+               uint64_t func_ptr) {
+                using FuncType = c10_npu::NPUCachingAllocator::DeviceStats(int);
+                std::function<FuncType> func =
+                    reinterpret_cast<FuncType*>(func_ptr);
+                self.set_get_device_stats_fn(func);
+            })
+        .def(
+            "set_reset_peak_status_fn",
+            [](torch::npu::NPUPluggableAllocator::NPUPluggableAllocator& self,
+               uint64_t func_ptr) {
+                using FuncType = void(int);
+                std::function<FuncType> func =
+                    reinterpret_cast<FuncType*>(func_ptr);
+                self.set_reset_peak_status_fn(func);
+            })
+        .def(
+            "set_snapshot_fn",
+            [](torch::npu::NPUPluggableAllocator::NPUPluggableAllocator& self,
+               uint64_t func_ptr) {
+                using FuncType = std::vector<c10_npu::NPUCachingAllocator::SegmentInfo>();
+                std::function<FuncType> func =
+                    reinterpret_cast<FuncType*>(func_ptr);
+                self.set_snapshot_fn(func);
             });
     m.def("_npu_customAllocator", [](uint64_t malloc_ptr, uint64_t free_ptr) {
         using MallocFuncType = void*(size_t, int, aclrtStream);
diff --git a/torch_npu/csrc/npu/NPUPluggableAllocator.cpp b/torch_npu/csrc/npu/NPUPluggableAllocator.cpp
index 8cf406af5be..5f1f782ecb1 100644
--- a/torch_npu/csrc/npu/NPUPluggableAllocator.cpp
+++ b/torch_npu/csrc/npu/NPUPluggableAllocator.cpp
@@ -74,6 +74,24 @@ void NPUPluggableAllocator::set_erase_stream_fn(
     erase_stream_fn_ = std::move(erase_stream_fn);
 }
 
+void NPUPluggableAllocator::set_get_device_stats_fn(
+    std::function<c10_npu::NPUCachingAllocator::DeviceStats(int)> get_device_stats_fn)
+{
+    get_device_stats_fn_ = std::move(get_device_stats_fn);
+}
+
+void NPUPluggableAllocator::set_reset_peak_status_fn(
+    std::function<void(int)> reset_peak_status_fn)
+{
+    reset_peak_status_fn_ = std::move(reset_peak_status_fn);
+}
+
+void NPUPluggableAllocator::set_snapshot_fn(
+    std::function<std::vector<c10_npu::NPUCachingAllocator::SegmentInfo>()> snapshot_fn)
+{
+    snapshot_fn_ = std::move(snapshot_fn);
+}
+
 void* NPUPluggableAllocator::malloc(
     size_t size,
     int device,
@@ -206,8 +224,11 @@ void NPUPluggableAllocator::eraseStream(
 
 c10_npu::NPUCachingAllocator::DeviceStats NPUPluggableAllocator::getDeviceStats(int device)
 {
-    TORCH_NPU_WARN("NPUPluggableAllocator does not yet support getDeviceStats. "
-        "If you need it, please file an issue describing your use case.");
+    if (get_device_stats_fn_) {
+        return get_device_stats_fn_(device);
+    } else {
+        TORCH_CHECK(false, "get_device_stats_fn_ is not defined, please set it via set_get_device_stats_fn");
+    }
 }
 
 void NPUPluggableAllocator::resetAccumulatedStats(int device)
@@ -218,14 +239,20 @@ void NPUPluggableAllocator::resetAccumulatedStats(int device)
 
 void NPUPluggableAllocator::resetPeakStats(int device)
 {
-    TORCH_NPU_WARN("NPUPluggableAllocator does not yet support resetPeakStats. "
-        "If you need it, please file an issue describing your use case.");
+    if (reset_peak_status_fn_) {
+        reset_peak_status_fn_(device);
+    } else {
+        TORCH_CHECK(false, "reset_peak_status_fn_ is not defined, please set it via set_reset_peak_status_fn");
+    }
 }
 
 std::vector<c10_npu::NPUCachingAllocator::SegmentInfo> NPUPluggableAllocator::snapshot()
 {
-    TORCH_NPU_WARN("NPUPluggableAllocator does not yet support snapshot. "
-        "If you need it, please file an issue describing your use case.");
+    if (snapshot_fn_) {
+        return snapshot_fn_();
+    } else {
+        TORCH_CHECK(false, "snapshot_fn_ is not defined, please set it via set_snapshot_fn");
+    }
 }
 
 void NPUPluggableAllocator::FreeDeviceCachedMemory(int device)
diff --git a/torch_npu/csrc/npu/NPUPluggableAllocator.h b/torch_npu/csrc/npu/NPUPluggableAllocator.h
index 94738078c7c..8e143c8775f 100644
--- a/torch_npu/csrc/npu/NPUPluggableAllocator.h
+++ b/torch_npu/csrc/npu/NPUPluggableAllocator.h
@@ -45,6 +45,9 @@ struct NPUPluggableAllocator
         std::function record_stream_fn);
     void set_erase_stream_fn(
         std::function erase_stream_fn);
+    void set_get_device_stats_fn(std::function<c10_npu::NPUCachingAllocator::DeviceStats(int)> get_device_stats_fn);
+    void set_reset_peak_status_fn(std::function<void(int)> reset_peak_status_fn);
+    void set_snapshot_fn(std::function<std::vector<c10_npu::NPUCachingAllocator::SegmentInfo>()> snapshot_fn);
 
     void* malloc(size_t size, int device, aclrtStream stream);
     c10::DataPtr allocate(size_t size) const override;
@@ -79,6 +82,9 @@ protected:
     std::function base_alloc_fn_;
     std::function record_stream_fn_;
     std::function erase_stream_fn_;
+    std::function<c10_npu::NPUCachingAllocator::DeviceStats(int)> get_device_stats_fn_;
+    std::function<void(int)> reset_peak_status_fn_;
+    std::function<std::vector<c10_npu::NPUCachingAllocator::SegmentInfo>()> snapshot_fn_;
     std::mutex allocator_mutex_;
     // We do the bookeeping here in order to simplify custom allocators
     std::unordered_map allocation_metadata_;
--
Gitee
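
Usage sketch, mirroring the test added in this patch. It is illustrative only: the 'pluggable_allocator.so' path is a placeholder for a library built like test/cpp_extensions/pluggable_allocator_extensions.cpp, which exports my_malloc, my_free, my_get_device_stats, my_reset_peak_status and my_snapshot.

    import ctypes
    import torch
    import torch_npu

    # Load and activate the custom allocator before the first NPU allocation.
    so_path = 'pluggable_allocator.so'  # placeholder path
    alloc = torch_npu.npu.memory.NPUPluggableAllocator(so_path, 'my_malloc', 'my_free')
    torch_npu.npu.memory.change_current_allocator(alloc)

    # Register the optional statistics hooks introduced by this patch; without
    # them the corresponding torch.npu memory APIs raise a RuntimeError.
    lib = ctypes.CDLL(so_path)
    alloc.allocator().set_get_device_stats_fn(
        ctypes.cast(lib.my_get_device_stats, ctypes.c_void_p).value)
    alloc.allocator().set_reset_peak_status_fn(
        ctypes.cast(lib.my_reset_peak_status, ctypes.c_void_p).value)
    alloc.allocator().set_snapshot_fn(
        ctypes.cast(lib.my_snapshot, ctypes.c_void_p).value)

    # These now dispatch to the plugged-in callbacks.
    stats = torch.npu.memory_stats_as_nested_dict()
    torch.npu.reset_peak_memory_stats()
    snapshot = torch.npu.memory_snapshot()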