From 7dc5cedb31b6465ef3428c9af9488e6be3e83e47 Mon Sep 17 00:00:00 2001
From: wangyunhang
Date: Sun, 4 Feb 2024 17:08:53 +0800
Subject: [PATCH] Provide interfaces such as getDeviceStats and resetPeakStats for NPUPluggableAllocator

---
 .../test_pluggable_allocator_extensions.py   | 43 ++++++++++++++++++-
 .../pluggable_allocator_extensions.cpp       | 20 +++++++++
 torch_npu/csrc/npu/Module.cpp                | 27 ++++++++++++
 torch_npu/csrc/npu/NPUPluggableAllocator.cpp | 39 ++++++++++++++---
 torch_npu/csrc/npu/NPUPluggableAllocator.h   |  6 +++
 5 files changed, 127 insertions(+), 8 deletions(-)

diff --git a/test/allocator/test_pluggable_allocator_extensions.py b/test/allocator/test_pluggable_allocator_extensions.py
index 99cc499a93c..7f6ef658cd3 100644
--- a/test/allocator/test_pluggable_allocator_extensions.py
+++ b/test/allocator/test_pluggable_allocator_extensions.py
@@ -2,6 +2,7 @@
 import os
 import sys
 import shutil
 import subprocess
+import ctypes
 import torch
 import torch.utils.cpp_extension
@@ -27,6 +28,7 @@ def build_stub(base_dir):
 
 class TestPluggableAllocator(TestCase):
     module = None
+    new_alloc = None
     build_directory = "allocator/build"
 
     @classmethod
@@ -59,9 +61,9 @@ class TestPluggableAllocator(TestCase):
     def test_pluggable_allocator(self):
         os_path = os.path.join(TestPluggableAllocator.build_directory, 'pluggable_allocator_extensions.so')
         # Load the allocator
-        new_alloc = torch_npu.npu.memory.NPUPluggableAllocator(os_path, 'my_malloc', 'my_free')
+        TestPluggableAllocator.new_alloc = torch_npu.npu.memory.NPUPluggableAllocator(os_path, 'my_malloc', 'my_free')
         # Swap the current allocator
-        torch_npu.npu.memory.change_current_allocator(new_alloc)
+        torch_npu.npu.memory.change_current_allocator(TestPluggableAllocator.new_alloc)
         # This will allocate memory in the device using the new allocator
         self.assertFalse(self.module.check_custom_allocator_used())
         npu_tensor = torch.zeros(10, device='npu')
@@ -69,6 +71,43 @@ class TestPluggableAllocator(TestCase):
         self.assertRtolEqual(npu_tensor.cpu().numpy(), cpu_tensor.numpy())
         self.assertTrue(self.module.check_custom_allocator_used())
 
+    def test_set_get_device_stats_fn(self):
+        os_path = os.path.join(TestPluggableAllocator.build_directory, 'pluggable_allocator_extensions.so')
+        myallocator = ctypes.CDLL(os_path)
+        get_device_stats_fn = ctypes.cast(getattr(myallocator, "my_get_device_stats"), ctypes.c_void_p).value
+
+        msg = "get_device_stats_fn_ is not defined, please set it via set_get_device_stats_fn"
+        with self.assertRaisesRegex(RuntimeError, msg):
+            torch.npu.memory_stats_as_nested_dict()
+
+        TestPluggableAllocator.new_alloc.allocator().set_get_device_stats_fn(get_device_stats_fn)
+        self.assertEqual(torch.npu.memory_stats_as_nested_dict()["num_alloc_retries"], 0)
+
+    def test_set_reset_peak_status_fn(self):
+        os_path = os.path.join(TestPluggableAllocator.build_directory, 'pluggable_allocator_extensions.so')
+        myallocator = ctypes.CDLL(os_path)
+        reset_peak_status_fn = ctypes.cast(getattr(myallocator, "my_reset_peak_status"), ctypes.c_void_p).value
+
+        msg = "reset_peak_status_fn_ is not defined, please set it via set_reset_peak_status_fn"
+        with self.assertRaisesRegex(RuntimeError, msg):
+            torch.npu.reset_peak_memory_stats()
+
+        TestPluggableAllocator.new_alloc.allocator().set_reset_peak_status_fn(reset_peak_status_fn)
+        torch.npu.reset_peak_memory_stats()
+        self.assertEqual(torch.npu.max_memory_allocated(), 0)
+
+    def test_set_snapshot_fn(self):
+        os_path = os.path.join(TestPluggableAllocator.build_directory, 'pluggable_allocator_extensions.so')
+        myallocator = ctypes.CDLL(os_path)
+        snapshot_fn = ctypes.cast(getattr(myallocator, "my_snapshot"), ctypes.c_void_p).value
+
+        msg = "snapshot_fn_ is not defined, please set it via set_snapshot_fn"
+        with self.assertRaisesRegex(RuntimeError, msg):
+            torch.npu.memory_snapshot()
+
+        TestPluggableAllocator.new_alloc.allocator().set_snapshot_fn(snapshot_fn)
+        self.assertEqual(torch.npu.memory_snapshot(), [])
+
     def test_pluggable_allocator_after_init(self):
         os_path = os.path.join(TestPluggableAllocator.build_directory, 'pluggable_allocator_extensions.so')
         # Do an initial memory allocator
diff --git a/test/cpp_extensions/pluggable_allocator_extensions.cpp b/test/cpp_extensions/pluggable_allocator_extensions.cpp
index 3ed2606b021..2abb3241efe 100644
--- a/test/cpp_extensions/pluggable_allocator_extensions.cpp
+++ b/test/cpp_extensions/pluggable_allocator_extensions.cpp
@@ -4,8 +4,11 @@
 
 #include "third_party/acl/inc/acl/acl_base.h"
 #include "third_party/acl/inc/acl/acl_rt.h"
+#include "torch_npu/csrc/core/npu/NPUCachingAllocator.h"
 
 extern "C" {
+using c10_npu::NPUCachingAllocator::DeviceStats;
+using c10_npu::NPUCachingAllocator::SegmentInfo;
 static bool useflag = false;
 
 void* my_malloc(ssize_t size, int device, aclrtStream stream)
@@ -27,6 +30,23 @@ bool check_custom_allocator_used()
 {
     return useflag;
 }
+
+DeviceStats my_get_device_stats(int device)
+{
+    DeviceStats stats;
+    return stats;
+}
+
+void my_reset_peak_status(int device)
+{
+    std::cout << "resetPeakStatus success!" << std::endl;
+}
+
+std::vector<SegmentInfo> my_snapshot()
+{
+    std::vector<SegmentInfo> result;
+    return result;
+}
 }
 
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp
index 9acd640b7b2..6a82962f6ba 100644
--- a/torch_npu/csrc/npu/Module.cpp
+++ b/torch_npu/csrc/npu/Module.cpp
@@ -189,6 +189,33 @@ void RegisterNpuPluggableAllocator(PyObject* module)
                 std::function<FuncType> func =
                     reinterpret_cast<FuncType*>(func_ptr);
                 self.set_erase_stream_fn(func);
+            })
+        .def(
+            "set_get_device_stats_fn",
+            [](torch::npu::NPUPluggableAllocator::NPUPluggableAllocator& self,
+               uint64_t func_ptr) {
+                using FuncType = c10_npu::NPUCachingAllocator::DeviceStats(int);
+                std::function<FuncType> func =
+                    reinterpret_cast<FuncType*>(func_ptr);
+                self.set_get_device_stats_fn(func);
+            })
+        .def(
+            "set_reset_peak_status_fn",
+            [](torch::npu::NPUPluggableAllocator::NPUPluggableAllocator& self,
+               uint64_t func_ptr) {
+                using FuncType = void(int);
+                std::function<FuncType> func =
+                    reinterpret_cast<FuncType*>(func_ptr);
+                self.set_reset_peak_status_fn(func);
+            })
+        .def(
+            "set_snapshot_fn",
+            [](torch::npu::NPUPluggableAllocator::NPUPluggableAllocator& self,
+               uint64_t func_ptr) {
+                using FuncType = std::vector<c10_npu::NPUCachingAllocator::SegmentInfo>();
+                std::function<FuncType> func =
+                    reinterpret_cast<FuncType*>(func_ptr);
+                self.set_snapshot_fn(func);
             });
     m.def("_npu_customAllocator", [](uint64_t malloc_ptr, uint64_t free_ptr) {
         using MallocFuncType = void*(size_t, int, aclrtStream);
diff --git a/torch_npu/csrc/npu/NPUPluggableAllocator.cpp b/torch_npu/csrc/npu/NPUPluggableAllocator.cpp
index 8cf406af5be..5f1f782ecb1 100644
--- a/torch_npu/csrc/npu/NPUPluggableAllocator.cpp
+++ b/torch_npu/csrc/npu/NPUPluggableAllocator.cpp
@@ -74,6 +74,24 @@ void NPUPluggableAllocator::set_erase_stream_fn(
     erase_stream_fn_ = std::move(erase_stream_fn);
 }
 
+void NPUPluggableAllocator::set_get_device_stats_fn(
+    std::function<c10_npu::NPUCachingAllocator::DeviceStats(int)> get_device_stats_fn)
+{
+    get_device_stats_fn_ = std::move(get_device_stats_fn);
+}
+
+void NPUPluggableAllocator::set_reset_peak_status_fn(
+    std::function<void(int)> reset_peak_status_fn)
+{
+    reset_peak_status_fn_ = std::move(reset_peak_status_fn);
+}
+
+void NPUPluggableAllocator::set_snapshot_fn(
+    std::function<std::vector<c10_npu::NPUCachingAllocator::SegmentInfo>()> snapshot_fn)
+{
+    snapshot_fn_ = std::move(snapshot_fn);
+}
+
 void* NPUPluggableAllocator::malloc(
     size_t size,
     int device,
@@ -206,8 +224,11 @@ void NPUPluggableAllocator::eraseStream(
 
 c10_npu::NPUCachingAllocator::DeviceStats NPUPluggableAllocator::getDeviceStats(int device)
 {
-    TORCH_NPU_WARN("NPUPluggableAllocator does not yet support getDeviceStats. "
-        "If you need it, please file an issue describing your use case.");
+    if (get_device_stats_fn_) {
+        return get_device_stats_fn_(device);
+    } else {
+        TORCH_CHECK(false, "get_device_stats_fn_ is not defined, please set it via set_get_device_stats_fn");
+    }
 }
 
 void NPUPluggableAllocator::resetAccumulatedStats(int device)
@@ -218,14 +239,20 @@ void NPUPluggableAllocator::resetAccumulatedStats(int device)
 
 void NPUPluggableAllocator::resetPeakStats(int device)
 {
-    TORCH_NPU_WARN("NPUPluggableAllocator does not yet support resetPeakStats. "
-        "If you need it, please file an issue describing your use case.");
+    if (reset_peak_status_fn_) {
+        reset_peak_status_fn_(device);
+    } else {
+        TORCH_CHECK(false, "reset_peak_status_fn_ is not defined, please set it via set_reset_peak_status_fn");
+    }
 }
 
 std::vector<c10_npu::NPUCachingAllocator::SegmentInfo> NPUPluggableAllocator::snapshot()
 {
-    TORCH_NPU_WARN("NPUPluggableAllocator does not yet support snapshot. "
-        "If you need it, please file an issue describing your use case.");
+    if (snapshot_fn_) {
+        return snapshot_fn_();
+    } else {
+        TORCH_CHECK(false, "snapshot_fn_ is not defined, please set it via set_snapshot_fn");
+    }
 }
 
 void NPUPluggableAllocator::FreeDeviceCachedMemory(int device)
diff --git a/torch_npu/csrc/npu/NPUPluggableAllocator.h b/torch_npu/csrc/npu/NPUPluggableAllocator.h
index 94738078c7c..8e143c8775f 100644
--- a/torch_npu/csrc/npu/NPUPluggableAllocator.h
+++ b/torch_npu/csrc/npu/NPUPluggableAllocator.h
@@ -45,6 +45,9 @@ struct NPUPluggableAllocator
         std::function record_stream_fn);
     void set_erase_stream_fn(
         std::function erase_stream_fn);
+    void set_get_device_stats_fn(std::function<c10_npu::NPUCachingAllocator::DeviceStats(int)> get_device_stats_fn);
+    void set_reset_peak_status_fn(std::function<void(int)> reset_peak_status_fn);
+    void set_snapshot_fn(std::function<std::vector<c10_npu::NPUCachingAllocator::SegmentInfo>()> snapshot_fn);
 
     void* malloc(size_t size, int device, aclrtStream stream);
     c10::DataPtr allocate(size_t size) const override;
@@ -79,6 +82,9 @@ protected:
     std::function base_alloc_fn_;
     std::function record_stream_fn_;
     std::function erase_stream_fn_;
+    std::function<c10_npu::NPUCachingAllocator::DeviceStats(int)> get_device_stats_fn_;
+    std::function<void(int)> reset_peak_status_fn_;
+    std::function<std::vector<c10_npu::NPUCachingAllocator::SegmentInfo>()> snapshot_fn_;
     std::mutex allocator_mutex_;
     // We do the bookeeping here in order to simplify custom allocators
     std::unordered_map allocation_metadata_;
--
Gitee
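
Usage sketch, mirroring the test added in this patch. It is illustrative only: the 'pluggable_allocator.so' path is a placeholder for a library built like test/cpp_extensions/pluggable_allocator_extensions.cpp, which exports my_malloc, my_free, my_get_device_stats, my_reset_peak_status and my_snapshot.

    import ctypes
    import torch
    import torch_npu

    # Load and activate the custom allocator before the first NPU allocation.
    so_path = 'pluggable_allocator.so'  # placeholder path
    alloc = torch_npu.npu.memory.NPUPluggableAllocator(so_path, 'my_malloc', 'my_free')
    torch_npu.npu.memory.change_current_allocator(alloc)

    # Register the optional statistics hooks introduced by this patch; without
    # them the corresponding torch.npu memory APIs raise a RuntimeError.
    lib = ctypes.CDLL(so_path)
    alloc.allocator().set_get_device_stats_fn(
        ctypes.cast(lib.my_get_device_stats, ctypes.c_void_p).value)
    alloc.allocator().set_reset_peak_status_fn(
        ctypes.cast(lib.my_reset_peak_status, ctypes.c_void_p).value)
    alloc.allocator().set_snapshot_fn(
        ctypes.cast(lib.my_snapshot, ctypes.c_void_p).value)

    # These now dispatch to the plugged-in callbacks.
    stats = torch.npu.memory_stats_as_nested_dict()
    torch.npu.reset_peak_memory_stats()
    snapshot = torch.npu.memory_snapshot()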