From 935e03829be7f5a51c8aad3adb908187cbe8e5df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=85=B3=E9=BE=99=E9=94=8B?= Date: Thu, 12 Jun 2025 15:50:37 +0800 Subject: [PATCH 1/8] add config PYTORCH_NPU_ALLOC_CONF = page_size:16g --- .../pluggable_allocator_extensions.cpp | 2 +- third_party/acl/inc/acl/acl_rt.h | 2 + .../csrc/core/npu/NPUCachingAllocator.cpp | 311 ++++++++++++++---- torch_npu/csrc/core/npu/NPUCachingAllocator.h | 2 + .../csrc/core/npu/NPUWorkspaceAllocator.cpp | 5 +- .../csrc/core/npu/interface/AclInterface.cpp | 62 +++- .../csrc/core/npu/interface/AclInterface.h | 7 +- torch_npu/csrc/npu/Stress_detect.cpp | 6 +- 8 files changed, 323 insertions(+), 74 deletions(-) diff --git a/test/cpp_extensions/pluggable_allocator_extensions.cpp b/test/cpp_extensions/pluggable_allocator_extensions.cpp index c17077116b..4bf1ff1099 100644 --- a/test/cpp_extensions/pluggable_allocator_extensions.cpp +++ b/test/cpp_extensions/pluggable_allocator_extensions.cpp @@ -14,7 +14,7 @@ static bool useflag = false; void* my_malloc(ssize_t size, int device, aclrtStream stream) { void *ptr; - aclrtMallocAlign32(&ptr, size, aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMallocAlign32(&ptr, size, aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST, {}); std::cout<<"alloc ptr = "<(StatType::NUM_TYPES)>; @@ -140,36 +146,89 @@ void update_stat_array(StatArray &stat_array, int64_t amount, const StatTypes &s [&stat_array, amount](size_t stat_type) { update_stat(stat_array[stat_type], amount); }); } -bool IsMallocPage1GMem(bool is_small_pool) + + +bool IsSupport1GVersion() { static bool is_support_page_size_1g = []() { - if (!c10_npu::NPUCachingAllocator::isConfig1GPageSizeEnable()) { + if (!IsGteCANNVersion(kMinCannVersion, kCannModule)) { + TORCH_NPU_WARN_ONCE("The application for 1G large-page physical memory failed. " + "Using the HUGE_MEM memory page allocation method may result in performance degradation. 
" + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g " + "or PYTORCH_NPU_ALLOC_CONF = page_size:16g configuration is enabled, " + "but the current CANN version does not support this feature. " + "Please upgrade the CANN package version."); return false; } - if (!IsGteCANNVersion(kMinCannVersion, kCannModule)) { + if (!IsGteDriverVersion(kMinDriverVersion)) { TORCH_NPU_WARN_ONCE("The application for 1G large-page physical memory failed. " "Using the HUGE_MEM memory page allocation method may result in performance degradation. " - "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g configuration is enabled, " + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g " + "or PYTORCH_NPU_ALLOC_CONF = page_size:16g configuration is enabled, " "but the current driver version does not support this feature. " + "Please upgrade the HDK(driver) package version."); + return false; + } + return true; + }(); + return is_support_page_size_1g; +} + +bool IsSupport16GVersion() +{ + static bool is_support_page_size_16g = []() { + if (!IsGteCANNVersion(kMinCannVersionSupport16G, kCannModule)) { + TORCH_NPU_WARN_ONCE("The application for 16G large-page physical memory failed. " + "Using the ACL_HBM_MEM_HUGE1G memory page allocation method may result in performance degradation. " + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:16g configuration is enabled, " + "but the current CANN version does not support this feature. " "Please upgrade the CANN package version."); return false; } - if (!IsGteDriverVersion(kMinDriverVersion)) { - TORCH_NPU_WARN_ONCE("The application for 1G large-page physical memory failed. " + if (!IsGteDriverVersion(kMinDriverVersionSupport16G)) { + TORCH_NPU_WARN_ONCE("The application for 16G large-page physical memory failed. " "Using the HUGE_MEM memory page allocation method may result in performance degradation. 
" - "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g configuration is enabled, " + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:16g configuration is enabled, " "but the current driver version does not support this feature. " "Please upgrade the HDK(driver) package version."); return false; } return true; }(); + return is_support_page_size_16g; +} + +int realUseAllocPageMem(){ + if(c10_npu::NPUCachingAllocator::isConfig16GPageSizeEnable()){ - return !is_small_pool && is_support_page_size_1g; + // 满足16g版本并且有足够的16g大页块申请 + if (IsSupport16GVersion() && segmentSizeUsed == used16GPage) { + return used16GPage; + }; + if(IsSupport1GVersion()){ + TORCH_NPU_WARN_ONCE("The application for 16G large-page physical memory failed. " + "Using the ACL_HBM_MEM_HUGE1G memory page allocation method may result in performance degradation. " + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:16g configuration is enabled, " + "but the current driver and CANN version does not support this feature. " + "Please upgrade the HDK(driver) and CANN package version." + "The minimum version of the HDK (driver) needs to be greater than 25.1.RC1 " + "and The minimum version of the CANN needs to be greater than 8.2.RC1." + "Or there may not be enough 16g virtual memory blocks allocated."); + segmentSizeUsed=used1GPage; + return used1GPage; + }; + } + if(c10_npu::NPUCachingAllocator::isConfig1GPageSizeEnable()){ + if(IsSupport1GVersion()){ + return used1GPage; + }; + } + return 0; } + struct Block; struct PrivatePool; using Comparison = bool (*)(const Block *, const Block *); @@ -278,6 +337,13 @@ struct SegmentRange { }; +struct SegmentHandles { + aclrtDrvMemHandle drvMem_Handle; + size_t segment_size; + size_t segment_block_count; + size_t segment_block_head; +}; + /* Note [Expandable Segments] Rationale @@ -355,12 +421,13 @@ bevhavior for allocator tensors that need to be used cross-process. 
*/ struct ExpandableSegment { - ExpandableSegment(int device, aclrtStream stream, size_t size) + ExpandableSegment(int device, aclrtStream stream, size_t size,size_t blockCout) : device_(device), stream_(stream), max_handles_(0), // 2MB for small pool, 20MB for large pool - segment_size_(size) + segment_size_(size), + segment_block_count_(blockCout) { size_t device_free; size_t device_total; @@ -382,9 +449,9 @@ struct ExpandableSegment { } NPU_CHECK_ERROR( - c10_npu::acl::AclrtReserveMemAddress(&ptr_, segment_size_ * max_handles_, 0, nullptr, 1, getHcclComm())); - ASCEND_LOGD("NPUCachingAllocator malloc by AclrtReserveMemAddress: size=%zu, segment_size=%zu", - segment_size_ * max_handles_, segment_size_); + c10_npu::acl::AclrtReserveMemAddress(&ptr_, segment_block_count_*segment_size_ * max_handles_, 0, nullptr, 1, getHcclComm())); + ASCEND_LOGD("NPUCachingAllocator malloc by AclrtReserveMemAddress: size=%zu, max_handles_=%zu,segment_size=%zu,segment_block_count_=%zu", + segment_block_count_*segment_size_ * max_handles_, max_handles_,segment_size_,segment_block_count_); } // begin must be aligned to segment_size_. 
// returns the actual range mapped, which may be @@ -392,6 +459,11 @@ struct ExpandableSegment { // return size of 0 indicates OOM SegmentRange map(SegmentRange range) { + if (realUseAllocPageMem() == used1GPage) { + segment_block_count_ = used1GPage; + } else if (realUseAllocPageMem() == used16GPage) { + segment_block_count_ = used16GPage; + } auto begin = segmentLeft(range.ptr); auto end = segmentRight(range.ptr + range.size); TORCH_INTERNAL_ASSERT(ptr() + begin * segment_size_ == range.ptr, PTA_ERROR(ErrCode::PTR)); @@ -401,35 +473,86 @@ struct ExpandableSegment { while (end > handles_.size()) { handles_.emplace_back(c10::nullopt); } + size_t segment_len = end - begin; + long remain_size = range.size; + ASCEND_LOGD("NPUCachingAllocator map:begin=%zu, end=%zu, remain_size=%zu",begin,end, remain_size); + auto current_segment_size = segment_size_*segment_block_count_; + size_t realEnd = end; for (auto i : c10::irange(begin, end)) { - TORCH_INTERNAL_ASSERT(!handles_.at(i), PTA_ERROR(ErrCode::INTERNAL)); + //伪装降级 +// if(segmentSizeUsed=16&&i=!0){ +// current_segment_size=kExtraLargeBuffer; +// segmentSizeUsed=1; +// } + // TORCH_INTERNAL_ASSERT(!handles_.at(i), PTA_ERROR(ErrCode::INTERNAL)); + SegmentHandles segmentHandles; aclrtDrvMemHandle handle = nullptr; aclrtPhysicalMemProp prop = {}; prop.handleType = ACL_MEM_HANDLE_TYPE_NONE; prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED; - prop.memAttr = (segment_size_ == kExtraLargeBuffer) ? 
ACL_HBM_MEM_HUGE1G : ACL_HBM_MEM_HUGE; + prop.memAttr = ACL_HBM_MEM_HUGE; + if (realUseAllocPageMem() == used1GPage) { + prop.memAttr = ACL_HBM_MEM_HUGE1G; + } else if (realUseAllocPageMem() == used16GPage) { + prop.memAttr = ACL_HBM_MEM_HUGE16G; + } prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE; prop.location.id = static_cast(device_); prop.reserve = 0; - auto status = c10_npu::acl::AclrtMallocPhysical(&handle, segment_size_, &prop, 0); - if (status == ACL_ERROR_RT_MEMORY_ALLOCATION) { - for (auto j : c10::irange(begin, i)) { - auto h = handles_.at(j).value(); - handles_.at(j) = c10::nullopt; - NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); + // 如果是16g的头部块,则进行物理内存的申请。或者如果失败了,则降级到1G + size_t headId = getSegmentBlockHead(begin,end,i); + ASCEND_LOGD("NPUCachingAllocator map:headId=%zu, i=%zu, current_segment_size=%zu, remain_size=%zu",headId,i, current_segment_size,remain_size); + if (headId == i && remain_size >= 0) { + std::vector segment_size_level; + segment_size_level.push_back(kExtraLargeBuffer); + auto status = c10_npu::acl::AclrtMallocPhysical(&handle, current_segment_size, &prop, 0, + segment_size_level, + &segmentSizeUsed, 0); + if (status == ACL_ERROR_RT_MEMORY_ALLOCATION) { + for (auto j: c10::irange(begin, i)) { + auto h = handles_.at(j).value().drvMem_Handle; + handles_.at(j) = c10::nullopt; + NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); + } + trimHandles(); + return rangeFromHandles(begin, begin); } - trimHandles(); - return rangeFromHandles(begin, begin); + NPU_CHECK_ERROR(status, "aclrtMallocPhysical"); + remain_size = remain_size - current_segment_size; + usedSegmentSizes_.push_back(current_segment_size); + segmentHandles.segment_block_head = headId; + segmentHandles.drvMem_Handle=handle; + segmentHandles.segment_size=segment_size_; + segmentHandles.segment_block_count=segmentSizeUsed; + handles_.at(i) = segmentHandles; + } + + if(remain_size<0){ + realEnd=i; + break; } - NPU_CHECK_ERROR(status, "aclrtMallocPhysical"); - 
handles_.at(i) = handle; } - for (auto i : c10::irange(begin, end)) { - NPU_CHECK_ERROR(c10_npu::acl::AclrtMapMem((char *)ptr_ + i * segment_size_, segment_size_, 0, - handles_.at(i).value(), 0, getHcclComm())); + auto start_ptr = range.ptr; + size_t total_size = 0; + for (auto i: c10::irange(begin, end)) { + if(i>realEnd){ + continue; + } + SegmentHandles segmentHandles=handles_.at(i).value(); + ASCEND_LOGD("NPUCachingAllocator map:i=%zu, segmentHandles.segment_size=%zu, segmentHandles.drvMem_Handle=%zu",i, + segmentHandles.segment_size*segmentHandles.segment_block_count,segmentHandles.drvMem_Handle==nullptr); + if(segmentHandles.drvMem_Handle!=nullptr){ + size_t usedSegmentSizes= segmentHandles.segment_size*segmentHandles.segment_block_count; + NPU_CHECK_ERROR(c10_npu::acl::AclrtMapMem(start_ptr, usedSegmentSizes, 0, + segmentHandles.drvMem_Handle, 0, getHcclComm())); + start_ptr=start_ptr+usedSegmentSizes; + total_size = total_size+usedSegmentSizes; + ASCEND_LOGD("NPUCachingAllocator map:i=%zu, segment_size=%zu,total_size=%zu",i, usedSegmentSizes_[i],total_size); + } } - ASCEND_LOGD("NPUCachingAllocator map: segment_size=%zu", segment_size_); - return rangeFromHandles(begin, end); + //return rangeFromHandles(begin, end); + ASCEND_LOGD("NPUCachingAllocator ================>total_size=%zu", total_size); + return SegmentRange(range.ptr, total_size); } // unmaps all the completely empty segment_size_ segments between @@ -437,13 +560,14 @@ struct ExpandableSegment { // and the actual size unmapped (multiple of segment_size_) SegmentRange unmap(SegmentRange range) { - auto begin = segmentRight(range.ptr); + auto begin = unmapSegmentRight(range.ptr); auto end = segmentLeft(range.ptr + range.size); + ASCEND_LOGD("NPUCachingAllocator =unmap===============>begin=%zu,end=%zu,size=%zu", begin,end,range.size); if (begin >= end) { return SegmentRange{ range.ptr, 0 }; } - unmapHandles(begin, end); - return rangeFromHandles(begin, end); + return unmapHandles(begin, end); +// 
return rangeFromHandles(begin, end); } char *ptr() const @@ -453,7 +577,7 @@ struct ExpandableSegment { size_t size() const { - return max_handles_ * segment_size_; + return max_handles_ * segment_size_ * segment_block_count_; } void setHcclComm(std::shared_ptr hcclComm) @@ -464,7 +588,7 @@ struct ExpandableSegment { segment_size_ * max_handles_, 0, 1)); for (auto i : c10::irange(handles_.size())) { HCCL_CHECK_ERROR(at_npu::hccl::HcclCommActivateCommMemoryFace(hcclComm_->getHcclComm(), - (char *)ptr_ + i * segment_size_, segment_size_, 0, handles_.at(i).value(), 0)); + (char *)ptr_ + i * segment_size_, segment_size_, 0, handles_.at(i).value().drvMem_Handle, 0)); } } @@ -476,7 +600,7 @@ struct ExpandableSegment { } private: - void unmapHandles(size_t begin, size_t end) + SegmentRange unmapHandles(size_t begin, size_t end) { // note: unlike aclrtFree, MemUnmap and MemRelease do // not appear to synchronize in all cases, so we have to wait for the @@ -492,14 +616,39 @@ private: trigger->traceNpuStreamSynchronization(reinterpret_cast(stream_)); } #endif + size_t beginUnmap=0; + size_t endUnmap=0; for (auto i : c10::irange(begin, end)) { - aclrtDrvMemHandle h = handles_.at(i).value(); - handles_.at(i) = c10::nullopt; - NPU_CHECK_ERROR(c10_npu::acl::AclrtUnmapMem((char *)ptr_ + segment_size_ * i, getHcclComm())); - NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); + ASCEND_LOGD("NPUCachingAllocator unmap: handles_.at(i)=%zu", handles_.at(i)==c10::nullopt); + if (handles_.at(i) == c10::nullopt) { + continue; + } + SegmentHandles segmentHandles=handles_.at(i).value(); + // i=headid,&&segment_block_count+i<=end===>释放这个handle + size_t segmentBlockCount=segmentHandles.segment_block_count; + size_t segmentBlockHead=segmentHandles.segment_block_head; + ASCEND_LOGD("NPUCachingAllocator unmap: i=%zu,segment_size=%zu,begin=%zu,end=%zu", i,begin,end); + if(i==segmentBlockHead&&segmentBlockCount+i<=end){ + if(beginUnmap==0){ + beginUnmap=segmentBlockHead; + } + 
endUnmap=segmentBlockHead+segmentBlockCount; + aclrtDrvMemHandle h = segmentHandles.drvMem_Handle; + NPU_CHECK_ERROR(c10_npu::acl::AclrtUnmapMem((char *)ptr_ + segment_size_ * i, getHcclComm())); + NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); + ASCEND_LOGD("NPUCachingAllocator unmap: i=%zu,segment_size=%zu,begin=%zu,end=%zu", i,segment_size_,begin,end); + handles_.at(i) = c10::nullopt; + } + } - ASCEND_LOGD("NPUCachingAllocator unmap: segment_size=%zu", segment_size_); trimHandles(); + if (beginUnmap == 0 && endUnmap == 0) { + beginUnmap = begin; + endUnmap = begin; + } + ASCEND_LOGD("NPUCachingAllocator unmapHandles: beginUnmap=%zu,endUnmap=%zu", beginUnmap,endUnmap); + return rangeFromHandles(beginUnmap, endUnmap); + //return SegmentRange(beginUnmap, endUnmap); } void trimHandles() @@ -524,7 +673,8 @@ private: size_t numSegments(size_t size) { - return (size + segment_size_ - 1) / segment_size_; + ASCEND_LOGD("numSegments unmap: size=%zu,segment_size_=%zu,segment_block_count_=%zu,end=%zu", size,segment_size_,segment_block_count_); + return (size + segment_size_*segment_block_count_ - 1) / (segment_size_*segment_block_count_); } size_t segmentLeft(char *p) @@ -536,11 +686,20 @@ private: size_t segmentRight(char *p) { auto size = p - ptr(); - return numSegments(size); + return numSegments(size)*segment_block_count_; } - SegmentRange rangeFromHandles(size_t begin, size_t end) + size_t unmapSegmentRight(char *p) { + auto size = p - ptr(); + return (size + segment_size_- 1) / (segment_size_); + } + + size_t getSegmentBlockHead(size_t begin, size_t end, size_t index) { + return ((index - begin) / segmentSizeUsed) * segmentSizeUsed + begin; + } + + SegmentRange rangeFromHandles(size_t begin, size_t end) { return SegmentRange(ptr() + segment_size_ * begin, segment_size_ * (end - begin)); } @@ -557,7 +716,9 @@ private: void *ptr_{}; size_t max_handles_; size_t segment_size_; - std::vector> handles_; + size_t segment_block_count_; + std::vector usedSegmentSizes_; + 
std::vector> handles_; std::shared_ptr hcclComm_; }; @@ -776,6 +937,10 @@ public: { return instance().m_page_size_1g; } + static bool page_size_16g_enable() + { + return instance().m_page_size_16g; + } static CachingAllocatorConfig &instance() { @@ -797,6 +962,7 @@ private: bool set_expandable_segments_flag = false; size_t m_base_addr_aligned_size = kAlignRoundLarge; bool m_page_size_1g = false; // 新增1G页配置标志 + bool m_page_size_16g = false; // 新增1G页配置标志 CachingAllocatorConfig() : m_max_split_size(std::numeric_limits::max()), @@ -923,7 +1089,10 @@ size_t CachingAllocatorConfig::parsePageSize(const std::vector &con if (config[i + 2] == "1g") { m_page_size_1g = true; - } else { + } else if(config[i + 2] == "16g"){ + m_page_size_16g = true; + segmentSizeUsed=used16GPage; + }else { TORCH_CHECK(false, "Unsupported page_size value: ", config[i + 2], OPS_ERROR(ErrCode::VALUE)); } return i + 2; // 返回最后处理的索引位置 @@ -987,6 +1156,10 @@ bool isConfig1GPageSizeEnable() return CachingAllocatorConfig::page_size_1g_enable(); } +bool isConfig16GPageSizeEnable() +{ + return CachingAllocatorConfig::page_size_16g_enable(); +} // To prevent the deadlock situation, temporarily release the lock. // // Deadlock Scenario Description: @@ -1230,12 +1403,19 @@ public: process_events(context); } auto size = round_size(orig_size); + + // 如果开启了1g或者16G大页,则默认使用大内存池,进行分配内存,不再根据内存小,而优先选择小内存池的场景了 auto &pool = get_pool(size, stream); - // 开环境变量 大池子放1G内存块 - const size_t alloc_size = IsMallocPage1GMem(pool.is_small) ? 
- kExtraLargeBuffer * ((size + kExtraLargeBuffer - 1) / kExtraLargeBuffer) : - get_allocation_size(size); + // 开环境变量 进行16G和1G的内存对齐,如果没有配置就使用普通内存对齐get_allocation_size + size_t alloc_size = 0; + if (realUseAllocPageMem() == used1GPage) { + alloc_size = kExtraLargeBuffer * ((size + kExtraLargeBuffer - 1) / kExtraLargeBuffer); + } else if (realUseAllocPageMem() == used16GPage) { + alloc_size = kExtraLarge16GBuffer * ((size + kExtraLarge16GBuffer - 1) / kExtraLarge16GBuffer); + } else { + alloc_size = get_allocation_size(size); + } AllocParams params(device, size, stream, &pool, alloc_size, stats); params.stat_types = get_stat_types_for_pool(pool); @@ -2132,14 +2312,19 @@ private: return c; } } + size_t segment_block_count = used1GPage; auto segment_size = pool->is_small ? kSmallBuffer : (c10_npu::option::OptionsManager::IsHcclZeroCopyEnable() ? kLargeBufferForHccl : kLargeBuffer); // 此处申请虚拟内存,segment_size是页大小,实际虚拟内存巨大 - if (IsMallocPage1GMem(pool->is_small)) { + if (realUseAllocPageMem() == used1GPage) { segment_size = kExtraLargeBuffer; + segment_block_count = used1GPage; + } else if (realUseAllocPageMem() == used16GPage) { + segment_size = kExtraLargeBuffer; + segment_block_count = used16GPage; } - auto segment = new ExpandableSegment(device, stream, segment_size); + auto segment = new ExpandableSegment(device, stream, segment_size, segment_block_count); if (hcclComm_) { segment->setHcclComm(hcclComm_); } @@ -2158,6 +2343,7 @@ private: TORCH_INTERNAL_ASSERT(!to_map->mapped && size <= to_map->size, PTA_ERROR(ErrCode::INTERNAL)); TORCH_INTERNAL_ASSERT(!to_map->context_when_allocated); // unmapped blocks should not keep // history + BlockPool &pool = *to_map->pool; auto mapped_range = to_map->expandable_segment_->map(SegmentRange{ to_map->ptr, size }); // failed to map the memory if (mapped_range.size == 0) { @@ -2166,7 +2352,6 @@ private: TORCH_INTERNAL_ASSERT(mapped_range.ptr == to_map->ptr && mapped_range.size >= size, PTA_ERROR(ErrCode::INTERNAL)); - BlockPool &pool 
= *to_map->pool; pool.unmapped.erase(to_map); to_map->mapped = true; @@ -2333,6 +2518,9 @@ private: if (entry.second(stream)) { auto it1 = graph_pools.find(entry.first); TORCH_INTERNAL_ASSERT(it1 != graph_pools.end()); + if (realUseAllocPageMem() > 0) { + return it1->second->large_blocks; + } if (size <= kSmallSize) { return it1->second->small_blocks; } else { @@ -2341,6 +2529,9 @@ private: } } } + if (realUseAllocPageMem() > 0) { + return large_blocks; + } if (size <= kSmallSize) { return small_blocks; } else { @@ -2543,11 +2734,16 @@ private: ptr = active_pool->allocator()->raw_alloc(size); p.err = ptr ? ACL_ERROR_NONE : ACL_ERROR_RT_MEMORY_ALLOCATION; } else { + std::vector alloc_size_level; + // 使用origin size进行1g对齐 + alloc_size_level.push_back(kExtraLargeBuffer * ((p.size() + kExtraLargeBuffer - 1) / kExtraLargeBuffer)); auto policy = aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST; - if (IsMallocPage1GMem(p.pool->is_small)) { + if (realUseAllocPageMem() == used1GPage) { policy = aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE1G_ONLY; + } else if (realUseAllocPageMem() == used16GPage) { + policy = aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE16G_ONLY; } - p.err = c10_npu::acl::AclrtMallocAlign32(&ptr, size, policy); + p.err = c10_npu::acl::AclrtMallocAlign32(&ptr, size, policy, alloc_size_level, &segmentSizeUsed); } if (p.err != ACL_ERROR_NONE) { return false; @@ -3293,7 +3489,7 @@ public: deleteFunc = &uncached_delete; size_t alloc_size = size + 32; NPU_CHECK_ERROR(c10_npu::acl::AclrtMallocAlign32(&devPtr, alloc_size, - aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST)); + aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST, {}, &segmentSizeUsed)); ASCEND_LOGD("Without NPUCachingAllocator, malloc by " "AclrtMallocAlign32: size=%zu", alloc_size); @@ -3322,7 +3518,8 @@ public: deleteFunc = &uncached_delete; size_t alloc_size = size + 32 + aligned; NPU_CHECK_ERROR(c10_npu::acl::AclrtMallocAlign32(&realPtr, alloc_size, - aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST)); + 
aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST, + {}, &segmentSizeUsed)); ASCEND_LOGD("Without NPUCachingAllocator, malloc by " "AclrtMallocAlign32: size=%zu", alloc_size); } else { diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.h b/torch_npu/csrc/core/npu/NPUCachingAllocator.h index a4e14d2232..daf1f07ece 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.h +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.h @@ -445,6 +445,8 @@ bool checkConfigExpandableSegments(); bool isConfig1GPageSizeEnable(); +bool isConfig16GPageSizeEnable(); + } // namespace NPUCachingAllocator } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp index fa4e79ff7b..9d3f8497f0 100644 --- a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp @@ -128,7 +128,7 @@ public: PTA_ERROR(ErrCode::MEMORY)); aclError err = c10_npu::acl::AclrtMallocAlign32( - &block->data_ptr, block->size, aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_ONLY); + &block->data_ptr, block->size, aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_ONLY, {}, nullptr); if (err != ACL_ERROR_NONE) { return nullptr; } @@ -510,7 +510,8 @@ public: if (size != 0) { size_t alloc_size = size + 32; NPU_CHECK_ERROR( - c10_npu::acl::AclrtMallocAlign32(&dev_ptr, alloc_size, aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_ONLY)); + c10_npu::acl::AclrtMallocAlign32(&dev_ptr, alloc_size, + aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_ONLY, {}, nullptr)); } } else { if (size != 0) { diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index b59e9c85c9..aa81bdebfb 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -433,9 +433,10 @@ aclError AclrtGetDeviceUtilizationRate(int32_t deviceId, aclrtUtilizationInfo *u return func(deviceId, utilizationInfo); } -aclError 
AclrtMallocAlign32(void **devPtr, size_t size, aclrtMemMallocPolicy policy) { - typedef aclError (*AclrtMallocAlign32)(void**, size_t, aclrtMemMallocPolicy); - static AclrtMallocAlign32 func = (AclrtMallocAlign32)GET_FUNC(aclrtMallocAlign32); +aclError AclrtMallocAlign32(void **devPtr, size_t &size, aclrtMemMallocPolicy policy, + std::vector allocSizeLevel, uint32_t *segmentSizeUsed) { + typedef aclError (*AclrtMallocAlign32)(void **, size_t, aclrtMemMallocPolicy); + static AclrtMallocAlign32 func = (AclrtMallocAlign32) GET_FUNC(aclrtMallocAlign32); aclError ret; if (func != nullptr) { ret = func(devPtr, size, policy); @@ -444,13 +445,35 @@ aclError AclrtMallocAlign32(void **devPtr, size_t size, aclrtMemMallocPolicy pol ret = aclrtMalloc(devPtr, size, policy); } - if (ret != ACL_RT_SUCCESS && (policy == aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE1G_ONLY)) { + if (ret != ACL_RT_SUCCESS && (policy == aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE16G_ONLY)) { + TORCH_NPU_WARN_ONCE("TThe malloc 16G large-page physical memory failed, " + "so try to malloc 1G large-page physical memory." + "Using the 1G large-page physical memory page may result in performance degradation. 
" + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:16g configuration " + "is enabled, but the pre-allocated number of 16G large pages is insufficient " + "or 16G large-page memory pre-allocation is not enabled."); + policy = aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE1G_ONLY; + // 进行size对齐的调整 + size = allocSizeLevel[0]; + // 16g失败,标志位设置成1 + *segmentSizeUsed = 1; + if (func != nullptr) { + ret = func(devPtr, size, policy); + } else { + TORCH_NPU_WARN_ONCE(func, "Failed to find function ", "aclrtMallocAlign32"); + ret = aclrtMalloc(devPtr, size, policy); + } + }; + if (ret != ACL_RT_SUCCESS && (policy == aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE16G_ONLY || + policy == aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE1G_ONLY)) { TORCH_NPU_WARN_ONCE("The malloc 1G large-page physical memory failed, so malloc 2M page memory." "Using the 2M memory page may result in performance degradation. " "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g configuration is " "enabled, but the pre-allocated number of 1G large pages is insufficient or 1G large-page " "memory pre-allocation is not enabled."); policy = aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST; + size = allocSizeLevel[0]; + *segmentSizeUsed = 1; if (func != nullptr) { ret = func(devPtr, size, policy); } else { @@ -521,16 +544,34 @@ aclError AclrtReleaseMemAddress(void *virPtr, HcclComm hcclComm) return ret; } -aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t size, const aclrtPhysicalMemProp *prop, - uint64_t flags) { - typedef aclError (*AclrtMallocPhysical)(aclrtDrvMemHandle*, size_t, const aclrtPhysicalMemProp*, uint64_t); +aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t &size, const aclrtPhysicalMemProp *prop, + uint64_t flags, std::vector segmentSizeLevel, uint32_t *segmentSizeUsed, + bool resetSegmentSizeFlags) { + typedef aclError (*AclrtMallocPhysical)(aclrtDrvMemHandle *, size_t, const aclrtPhysicalMemProp *, uint64_t); static AclrtMallocPhysical 
func = nullptr; if (func == nullptr) { - func = (AclrtMallocPhysical)GET_FUNC(aclrtMallocPhysical); + func = (AclrtMallocPhysical) GET_FUNC(aclrtMallocPhysical); } TORCH_CHECK(func, "Failed to find function ", "aclrtMallocPhysical", PTA_ERROR(ErrCode::NOT_FOUND)); aclError ret = func(handle, size, prop, flags); - if (ret != ACL_RT_SUCCESS && (prop->memAttr == ACL_HBM_MEM_HUGE1G)) { + if (ret != ACL_RT_SUCCESS && (prop->memAttr == ACL_HBM_MEM_HUGE16G)) { + TORCH_NPU_WARN_ONCE("The malloc 16G large-page physical memory failed, " + "so try to malloc 1G large-page physical memory." + "Using the 1G large-page physical memory page may result in performance degradation. " + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:16g configuration " + "is enabled, but the pre-allocated number of 16G large pages is insufficient " + "or 16G large-page memory pre-allocation is not enabled."); + aclrtPhysicalMemProp prop_update = {prop->handleType, + prop->allocationType, + ACL_HBM_MEM_HUGE1G, + {prop->location.id, + prop->location.type}, + prop->reserve}; + size = segmentSizeLevel[0]; + *segmentSizeUsed = 1; + ret = func(handle, size, &prop_update, flags); + }; + if (ret != ACL_RT_SUCCESS && (prop->memAttr == ACL_HBM_MEM_HUGE1G || prop->memAttr == ACL_HBM_MEM_HUGE16G)) { TORCH_NPU_WARN_ONCE("The malloc 1G large-page physical memory failed, so malloc 2M page memory." "Using the 2M memory page may result in performance degradation. 
" "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g configuration " @@ -542,6 +583,7 @@ aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t size, const aclrt {prop->location.id, prop->location.type}, prop->reserve}; + size = segmentSizeLevel[0]; ret = func(handle, size, &prop_update, flags); } return ret; @@ -819,7 +861,7 @@ aclError AclsysGetCANNVersion(aclCANNPackageName name, aclCANNPackageVersion *ve if (func == nullptr) { func = (aclsysGetCANNVersionFunc)GET_FUNC(aclsysGetCANNVersion); if (func == nullptr) { - return ACL_ERROR_RT_FEATURE_NOT_SUPPORT; + return ACL_ERROR_RT_FEATURE_NOT_SUPPORT; } } diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.h b/torch_npu/csrc/core/npu/interface/AclInterface.h index 9cdad2663b..7121fb1125 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.h +++ b/torch_npu/csrc/core/npu/interface/AclInterface.h @@ -137,7 +137,8 @@ aclError AclrtDestroyStreamForce(aclrtStream stream); aclError AclrtGetDeviceUtilizationRate(int32_t deviceId, aclrtUtilizationInfo *utilizationInfo); -aclError AclrtMallocAlign32(void **devPtr, size_t size, aclrtMemMallocPolicy policy); +aclError AclrtMallocAlign32(void **devPtr, size_t &size, aclrtMemMallocPolicy policy, + std::vector allocSizeLevel, uint32_t *segmentSizeUsed); aclError AclrtStreamQuery(aclrtStream stream, aclrtStreamStatus *status); @@ -148,7 +149,9 @@ aclError AclrtReserveMemAddress(void **virPtr, size_t size, size_t alignment, vo aclError AclrtReleaseMemAddress(void *virPtr, HcclComm hcclComm = nullptr); -aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t size, const aclrtPhysicalMemProp *prop, uint64_t flags); +aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t &size, const aclrtPhysicalMemProp *prop, uint64_t flags, + std::vector segmentSizeLevel, uint32_t *segmentSizeUsed, + bool resetSegmentSizeFlags); aclError AclrtFreePhysical(aclrtDrvMemHandle handle); diff --git a/torch_npu/csrc/npu/Stress_detect.cpp 
b/torch_npu/csrc/npu/Stress_detect.cpp index 3fcade819b..63301c5bc6 100644 --- a/torch_npu/csrc/npu/Stress_detect.cpp +++ b/torch_npu/csrc/npu/Stress_detect.cpp @@ -109,10 +109,12 @@ int StressDetector::perform_stress_detect(int deviceid) uint64_t size = 10; workspaceSize = size << 10 << 10 << 10; // Assume memory size if (workspaceSize > 0) { - auto ret = c10_npu::acl::AclrtMallocAlign32(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST); + auto ret = c10_npu::acl::AclrtMallocAlign32(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST, {}, + nullptr); if (ret != ACL_ERROR_NONE) { c10_npu::NPUCachingAllocator::emptyCache(); - ret = c10_npu::acl::AclrtMallocAlign32(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST); + ret = c10_npu::acl::AclrtMallocAlign32(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST, {}, + nullptr); if (ret != ACL_ERROR_NONE) { ASCEND_LOGW("call AclrtMallocAlign32 failed, ERROR : %d. Skip StressDetect.", ret); task_in_progress.store(false); // Task ends -- Gitee From 8e86e1f1dacbd9f0e952fd482da0901747cb0133 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=85=B3=E9=BE=99=E9=94=8B?= Date: Tue, 8 Jul 2025 19:17:02 +0800 Subject: [PATCH 2/8] c --- torch_npu/csrc/core/npu/NPUCachingAllocator.cpp | 15 ++++++++------- .../csrc/core/npu/interface/AclInterface.cpp | 1 + 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 70245b856c..cb600819ec 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -627,16 +627,17 @@ private: // i=headid,&&segment_block_count+i<=end===>释放这个handle size_t segmentBlockCount=segmentHandles.segment_block_count; size_t segmentBlockHead=segmentHandles.segment_block_head; - ASCEND_LOGD("NPUCachingAllocator unmap: i=%zu,segment_size=%zu,begin=%zu,end=%zu", i,begin,end); - if(i==segmentBlockHead&&segmentBlockCount+i<=end){ - 
if(beginUnmap==0){ - beginUnmap=segmentBlockHead; + ASCEND_LOGD("NPUCachingAllocator unmap: i=%zu,begin=%zu,end=%zu", i,begin,end); + if (i == segmentBlockHead && segmentBlockCount + i <= end) { + if (beginUnmap == 0) { + beginUnmap = segmentBlockHead; } - endUnmap=segmentBlockHead+segmentBlockCount; + endUnmap = segmentBlockHead + segmentBlockCount; aclrtDrvMemHandle h = segmentHandles.drvMem_Handle; - NPU_CHECK_ERROR(c10_npu::acl::AclrtUnmapMem((char *)ptr_ + segment_size_ * i, getHcclComm())); + NPU_CHECK_ERROR(c10_npu::acl::AclrtUnmapMem((char *) ptr_ + segment_size_ * i, getHcclComm())); NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); - ASCEND_LOGD("NPUCachingAllocator unmap: i=%zu,segment_size=%zu,begin=%zu,end=%zu", i,segment_size_,begin,end); + ASCEND_LOGD("NPUCachingAllocator unmap: i=%zu,segment_size=%zu,begin=%zu,end=%zu", i, segment_size_, + begin, end); handles_.at(i) = c10::nullopt; } diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index aa81bdebfb..48fbe75f95 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -584,6 +584,7 @@ aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t &size, const aclr prop->location.type}, prop->reserve}; size = segmentSizeLevel[0]; + *segmentSizeUsed = 1; ret = func(handle, size, &prop_update, flags); } return ret; -- Gitee From 923037123131c8eae640274f2eea32a25e26cab6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=85=B3=E9=BE=99=E9=94=8B?= Date: Tue, 8 Jul 2025 20:52:32 +0800 Subject: [PATCH 3/8] cc --- .../csrc/core/npu/NPUCachingAllocator.cpp | 76 ++++++++----------- .../csrc/core/npu/interface/AclInterface.cpp | 3 +- 2 files changed, 32 insertions(+), 47 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index cb600819ec..715109663d 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp 
+++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -475,16 +475,10 @@ struct ExpandableSegment { } size_t segment_len = end - begin; long remain_size = range.size; - ASCEND_LOGD("NPUCachingAllocator map:begin=%zu, end=%zu, remain_size=%zu",begin,end, remain_size); auto current_segment_size = segment_size_*segment_block_count_; size_t realEnd = end; for (auto i : c10::irange(begin, end)) { - //伪装降级 -// if(segmentSizeUsed=16&&i=!0){ -// current_segment_size=kExtraLargeBuffer; -// segmentSizeUsed=1; -// } - // TORCH_INTERNAL_ASSERT(!handles_.at(i), PTA_ERROR(ErrCode::INTERNAL)); +// TORCH_INTERNAL_ASSERT(!handles_.at(i), PTA_ERROR(ErrCode::INTERNAL)); SegmentHandles segmentHandles; aclrtDrvMemHandle handle = nullptr; aclrtPhysicalMemProp prop = {}; @@ -500,14 +494,12 @@ struct ExpandableSegment { prop.location.id = static_cast(device_); prop.reserve = 0; // 如果是16g的头部块,则进行物理内存的申请。或者如果失败了,则降级到1G - size_t headId = getSegmentBlockHead(begin,end,i); - ASCEND_LOGD("NPUCachingAllocator map:headId=%zu, i=%zu, current_segment_size=%zu, remain_size=%zu",headId,i, current_segment_size,remain_size); + size_t headId = getSegmentBlockHead(begin, end, i); + ASCEND_LOGD("NPUCachingAllocator map:headId=%zu, i=%zu, current_segment_size=%zu, remain_size=%zu", headId, + i, current_segment_size, remain_size); if (headId == i && remain_size >= 0) { - std::vector segment_size_level; - segment_size_level.push_back(kExtraLargeBuffer); auto status = c10_npu::acl::AclrtMallocPhysical(&handle, current_segment_size, &prop, 0, - segment_size_level, - &segmentSizeUsed, 0); + {kExtraLargeBuffer}, &segmentSizeUsed); if (status == ACL_ERROR_RT_MEMORY_ALLOCATION) { for (auto j: c10::irange(begin, i)) { auto h = handles_.at(j).value().drvMem_Handle; @@ -521,37 +513,35 @@ struct ExpandableSegment { remain_size = remain_size - current_segment_size; usedSegmentSizes_.push_back(current_segment_size); segmentHandles.segment_block_head = headId; - segmentHandles.drvMem_Handle=handle; - 
segmentHandles.segment_size=segment_size_; - segmentHandles.segment_block_count=segmentSizeUsed; + segmentHandles.drvMem_Handle = handle; + segmentHandles.segment_size = segment_size_; + segmentHandles.segment_block_count = segmentSizeUsed; + segment_block_count_ = segmentSizeUsed; handles_.at(i) = segmentHandles; } - if(remain_size<0){ - realEnd=i; + if (remain_size < 0) { + realEnd = i; break; } } auto start_ptr = range.ptr; size_t total_size = 0; for (auto i: c10::irange(begin, end)) { - if(i>realEnd){ + if (i > realEnd || handles_.at(i) == c10::nullopt) { continue; } - SegmentHandles segmentHandles=handles_.at(i).value(); - ASCEND_LOGD("NPUCachingAllocator map:i=%zu, segmentHandles.segment_size=%zu, segmentHandles.drvMem_Handle=%zu",i, - segmentHandles.segment_size*segmentHandles.segment_block_count,segmentHandles.drvMem_Handle==nullptr); - if(segmentHandles.drvMem_Handle!=nullptr){ - size_t usedSegmentSizes= segmentHandles.segment_size*segmentHandles.segment_block_count; + SegmentHandles segmentHandles = handles_.at(i).value(); + if (segmentHandles.drvMem_Handle != nullptr) { + size_t usedSegmentSizes = segmentHandles.segment_size * segmentHandles.segment_block_count; NPU_CHECK_ERROR(c10_npu::acl::AclrtMapMem(start_ptr, usedSegmentSizes, 0, segmentHandles.drvMem_Handle, 0, getHcclComm())); - start_ptr=start_ptr+usedSegmentSizes; - total_size = total_size+usedSegmentSizes; - ASCEND_LOGD("NPUCachingAllocator map:i=%zu, segment_size=%zu,total_size=%zu",i, usedSegmentSizes_[i],total_size); + start_ptr = start_ptr + usedSegmentSizes; + total_size = total_size + usedSegmentSizes; + ASCEND_LOGD("NPUCachingAllocator map:i=%zu", i); } } - //return rangeFromHandles(begin, end); - ASCEND_LOGD("NPUCachingAllocator ================>total_size=%zu", total_size); + ASCEND_LOGD("NPUCachingAllocator map total_size=%zu", total_size); return SegmentRange(range.ptr, total_size); } @@ -562,12 +552,11 @@ struct ExpandableSegment { { auto begin = unmapSegmentRight(range.ptr); auto 
end = segmentLeft(range.ptr + range.size); - ASCEND_LOGD("NPUCachingAllocator =unmap===============>begin=%zu,end=%zu,size=%zu", begin,end,range.size); + ASCEND_LOGD("NPUCachingAllocator start to unmap,begin=%zu,end=%zu,size=%zu", begin, end, range.size); if (begin >= end) { return SegmentRange{ range.ptr, 0 }; } return unmapHandles(begin, end); -// return rangeFromHandles(begin, end); } char *ptr() const @@ -616,18 +605,15 @@ private: trigger->traceNpuStreamSynchronization(reinterpret_cast(stream_)); } #endif - size_t beginUnmap=0; - size_t endUnmap=0; - for (auto i : c10::irange(begin, end)) { - ASCEND_LOGD("NPUCachingAllocator unmap: handles_.at(i)=%zu", handles_.at(i)==c10::nullopt); + size_t beginUnmap = 0; + size_t endUnmap = 0; + for (auto i: c10::irange(begin, end)) { if (handles_.at(i) == c10::nullopt) { continue; } - SegmentHandles segmentHandles=handles_.at(i).value(); - // i=headid,&&segment_block_count+i<=end===>释放这个handle - size_t segmentBlockCount=segmentHandles.segment_block_count; - size_t segmentBlockHead=segmentHandles.segment_block_head; - ASCEND_LOGD("NPUCachingAllocator unmap: i=%zu,begin=%zu,end=%zu", i,begin,end); + SegmentHandles segmentHandles = handles_.at(i).value(); + size_t segmentBlockCount = segmentHandles.segment_block_count; + size_t segmentBlockHead = segmentHandles.segment_block_head; if (i == segmentBlockHead && segmentBlockCount + i <= end) { if (beginUnmap == 0) { beginUnmap = segmentBlockHead; @@ -636,8 +622,8 @@ private: aclrtDrvMemHandle h = segmentHandles.drvMem_Handle; NPU_CHECK_ERROR(c10_npu::acl::AclrtUnmapMem((char *) ptr_ + segment_size_ * i, getHcclComm())); NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); - ASCEND_LOGD("NPUCachingAllocator unmap: i=%zu,segment_size=%zu,begin=%zu,end=%zu", i, segment_size_, - begin, end); + ASCEND_LOGD("NPUCachingAllocator unmap: i=%zu,begin=%zu,end=%zu", i, segmentBlockHead, + segmentBlockCount + segmentBlockHead); handles_.at(i) = c10::nullopt; } @@ -647,9 +633,9 @@ private: 
beginUnmap = begin; endUnmap = begin; } - ASCEND_LOGD("NPUCachingAllocator unmapHandles: beginUnmap=%zu,endUnmap=%zu", beginUnmap,endUnmap); + ASCEND_LOGD("NPUCachingAllocator total unmapHandles: beginUnmap=%zu,endUnmap=%zu,segment_size_=%zu", beginUnmap, + endUnmap, segment_size_); return rangeFromHandles(beginUnmap, endUnmap); - //return SegmentRange(beginUnmap, endUnmap); } void trimHandles() @@ -697,7 +683,7 @@ private: } size_t getSegmentBlockHead(size_t begin, size_t end, size_t index) { - return ((index - begin) / segmentSizeUsed) * segmentSizeUsed + begin; + return ((index - begin) / segment_block_count_) * segment_block_count_ + begin; } SegmentRange rangeFromHandles(size_t begin, size_t end) { diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index 48fbe75f95..b3904d43c6 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -545,8 +545,7 @@ aclError AclrtReleaseMemAddress(void *virPtr, HcclComm hcclComm) } aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t &size, const aclrtPhysicalMemProp *prop, - uint64_t flags, std::vector segmentSizeLevel, uint32_t *segmentSizeUsed, - bool resetSegmentSizeFlags) { + uint64_t flags, std::vector segmentSizeLevel, uint32_t *segmentSizeUsed) { typedef aclError (*AclrtMallocPhysical)(aclrtDrvMemHandle *, size_t, const aclrtPhysicalMemProp *, uint64_t); static AclrtMallocPhysical func = nullptr; if (func == nullptr) { -- Gitee From 52d3ee2d34f47f1ad71f5ae3c3931ffb7446c60b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=85=B3=E9=BE=99=E9=94=8B?= Date: Wed, 9 Jul 2025 09:10:22 +0800 Subject: [PATCH 4/8] CC --- torch_npu/csrc/core/npu/interface/AclInterface.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.h b/torch_npu/csrc/core/npu/interface/AclInterface.h index 7121fb1125..c99d69e467 100644 --- 
a/torch_npu/csrc/core/npu/interface/AclInterface.h +++ b/torch_npu/csrc/core/npu/interface/AclInterface.h @@ -150,8 +150,7 @@ aclError AclrtReserveMemAddress(void **virPtr, size_t size, size_t alignment, vo aclError AclrtReleaseMemAddress(void *virPtr, HcclComm hcclComm = nullptr); aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t &size, const aclrtPhysicalMemProp *prop, uint64_t flags, - std::vector segmentSizeLevel, uint32_t *segmentSizeUsed, - bool resetSegmentSizeFlags); + std::vector segmentSizeLevel, uint32_t *segmentSizeUsed); aclError AclrtFreePhysical(aclrtDrvMemHandle handle); -- Gitee From 2fb23b1c153dba2cdf7d5120f2535fba9a2e2108 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=85=B3=E9=BE=99=E9=94=8B?= Date: Wed, 9 Jul 2025 10:27:45 +0800 Subject: [PATCH 5/8] cc --- .../csrc/core/npu/NPUCachingAllocator.cpp | 26 +++++++++---------- .../csrc/core/npu/interface/AclInterface.cpp | 14 +++++----- .../csrc/core/npu/interface/AclInterface.h | 4 +-- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 715109663d..2d789e59a7 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -104,7 +104,7 @@ const std::string kMinDriverVersionSupport16G = "25.1.RC1"; // minimum drive const std::string kCannModule = "CANN"; // cann module name constexpr int used1GPage = 1; // used 1g to alloc constexpr int used16GPage = 16; // used 16g to alloc -static uint32_t segmentSizeUsed = 1; // segment size used +static uint32_t segmentUsedPageType = 1; // segment size used using StatTypes = std::array(StatType::NUM_TYPES)>; @@ -204,7 +204,7 @@ int realUseAllocPageMem(){ if(c10_npu::NPUCachingAllocator::isConfig16GPageSizeEnable()){ // 满足16g版本并且有足够的16g大页块申请 - if (IsSupport16GVersion() && segmentSizeUsed == used16GPage) { + if (IsSupport16GVersion() && segmentUsedPageType == used16GPage) { return 
used16GPage; }; if(IsSupport1GVersion()){ @@ -216,7 +216,7 @@ int realUseAllocPageMem(){ "The minimum version of the HDK (driver) needs to be greater than 25.1.RC1 " "and The minimum version of the CANN needs to be greater than 8.2.RC1." "Or there may not be enough 16g virtual memory blocks allocated."); - segmentSizeUsed=used1GPage; + segmentUsedPageType = used1GPage; return used1GPage; }; } @@ -499,7 +499,7 @@ struct ExpandableSegment { i, current_segment_size, remain_size); if (headId == i && remain_size >= 0) { auto status = c10_npu::acl::AclrtMallocPhysical(&handle, current_segment_size, &prop, 0, - {kExtraLargeBuffer}, &segmentSizeUsed); + {kExtraLargeBuffer}, &segmentUsedPageType); if (status == ACL_ERROR_RT_MEMORY_ALLOCATION) { for (auto j: c10::irange(begin, i)) { auto h = handles_.at(j).value().drvMem_Handle; @@ -515,8 +515,8 @@ struct ExpandableSegment { segmentHandles.segment_block_head = headId; segmentHandles.drvMem_Handle = handle; segmentHandles.segment_size = segment_size_; - segmentHandles.segment_block_count = segmentSizeUsed; - segment_block_count_ = segmentSizeUsed; + segmentHandles.segment_block_count = segmentUsedPageType; + segment_block_count_ = segmentUsedPageType; handles_.at(i) = segmentHandles; } @@ -538,7 +538,6 @@ struct ExpandableSegment { segmentHandles.drvMem_Handle, 0, getHcclComm())); start_ptr = start_ptr + usedSegmentSizes; total_size = total_size + usedSegmentSizes; - ASCEND_LOGD("NPUCachingAllocator map:i=%zu", i); } } ASCEND_LOGD("NPUCachingAllocator map total_size=%zu", total_size); @@ -1078,7 +1077,7 @@ size_t CachingAllocatorConfig::parsePageSize(const std::vector &con m_page_size_1g = true; } else if(config[i + 2] == "16g"){ m_page_size_16g = true; - segmentSizeUsed=used16GPage; + segmentUsedPageType=used16GPage; }else { TORCH_CHECK(false, "Unsupported page_size value: ", config[i + 2], OPS_ERROR(ErrCode::VALUE)); } @@ -2721,16 +2720,17 @@ private: ptr = active_pool->allocator()->raw_alloc(size); p.err = ptr ? 
ACL_ERROR_NONE : ACL_ERROR_RT_MEMORY_ALLOCATION; } else { - std::vector alloc_size_level; + std::vector alloc_size_level; // 使用origin size进行1g对齐 - alloc_size_level.push_back(kExtraLargeBuffer * ((p.size() + kExtraLargeBuffer - 1) / kExtraLargeBuffer)); + alloc_size_level.push_back( + kExtraLargeBuffer * ((p.size() + kExtraLargeBuffer - 1) / kExtraLargeBuffer)); auto policy = aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST; if (realUseAllocPageMem() == used1GPage) { policy = aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE1G_ONLY; } else if (realUseAllocPageMem() == used16GPage) { policy = aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE16G_ONLY; } - p.err = c10_npu::acl::AclrtMallocAlign32(&ptr, size, policy, alloc_size_level, &segmentSizeUsed); + p.err = c10_npu::acl::AclrtMallocAlign32(&ptr, size, policy, alloc_size_level, &segmentUsedPageType); } if (p.err != ACL_ERROR_NONE) { return false; @@ -3476,7 +3476,7 @@ public: deleteFunc = &uncached_delete; size_t alloc_size = size + 32; NPU_CHECK_ERROR(c10_npu::acl::AclrtMallocAlign32(&devPtr, alloc_size, - aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST, {}, &segmentSizeUsed)); + aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST, {}, &segmentUsedPageType)); ASCEND_LOGD("Without NPUCachingAllocator, malloc by " "AclrtMallocAlign32: size=%zu", alloc_size); @@ -3506,7 +3506,7 @@ public: size_t alloc_size = size + 32 + aligned; NPU_CHECK_ERROR(c10_npu::acl::AclrtMallocAlign32(&realPtr, alloc_size, aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST, - {}, &segmentSizeUsed)); + {}, &segmentUsedPageType)); ASCEND_LOGD("Without NPUCachingAllocator, malloc by " "AclrtMallocAlign32: size=%zu", alloc_size); } else { diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index b3904d43c6..33e6541580 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -434,7 +434,7 @@ aclError AclrtGetDeviceUtilizationRate(int32_t 
deviceId, aclrtUtilizationInfo *u } aclError AclrtMallocAlign32(void **devPtr, size_t &size, aclrtMemMallocPolicy policy, - std::vector allocSizeLevel, uint32_t *segmentSizeUsed) { + std::vector allocSizeLevel, uint32_t *segmentUsedPageType) { typedef aclError (*AclrtMallocAlign32)(void **, size_t, aclrtMemMallocPolicy); static AclrtMallocAlign32 func = (AclrtMallocAlign32) GET_FUNC(aclrtMallocAlign32); aclError ret; @@ -456,7 +456,7 @@ aclError AclrtMallocAlign32(void **devPtr, size_t &size, aclrtMemMallocPolicy po // 进行size对齐的调整 size = allocSizeLevel[0]; // 16g失败,标志位设置成1 - *segmentSizeUsed = 1; + *segmentUsedPageType = 1; if (func != nullptr) { ret = func(devPtr, size, policy); } else { @@ -473,7 +473,7 @@ aclError AclrtMallocAlign32(void **devPtr, size_t &size, aclrtMemMallocPolicy po "memory pre-allocation is not enabled."); policy = aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST; size = allocSizeLevel[0]; - *segmentSizeUsed = 1; + *segmentUsedPageType = 1; if (func != nullptr) { ret = func(devPtr, size, policy); } else { @@ -545,11 +545,11 @@ aclError AclrtReleaseMemAddress(void *virPtr, HcclComm hcclComm) } aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t &size, const aclrtPhysicalMemProp *prop, - uint64_t flags, std::vector segmentSizeLevel, uint32_t *segmentSizeUsed) { + uint64_t flags, std::vector segmentSizeLevel, uint32_t *segmentUsedPageType) { typedef aclError (*AclrtMallocPhysical)(aclrtDrvMemHandle *, size_t, const aclrtPhysicalMemProp *, uint64_t); static AclrtMallocPhysical func = nullptr; if (func == nullptr) { - func = (AclrtMallocPhysical) GET_FUNC(aclrtMallocPhysical); + func = (AclrtMallocPhysical)GET_FUNC(aclrtMallocPhysical); } TORCH_CHECK(func, "Failed to find function ", "aclrtMallocPhysical", PTA_ERROR(ErrCode::NOT_FOUND)); aclError ret = func(handle, size, prop, flags); @@ -567,7 +567,7 @@ aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t &size, const aclr prop->location.type}, prop->reserve}; size = 
segmentSizeLevel[0]; - *segmentSizeUsed = 1; + *segmentUsedPageType = 1; ret = func(handle, size, &prop_update, flags); }; if (ret != ACL_RT_SUCCESS && (prop->memAttr == ACL_HBM_MEM_HUGE1G || prop->memAttr == ACL_HBM_MEM_HUGE16G)) { @@ -583,7 +583,7 @@ aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t &size, const aclr prop->location.type}, prop->reserve}; size = segmentSizeLevel[0]; - *segmentSizeUsed = 1; + *segmentUsedPageType = 1; ret = func(handle, size, &prop_update, flags); } return ret; diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.h b/torch_npu/csrc/core/npu/interface/AclInterface.h index c99d69e467..3eae5bb3f0 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.h +++ b/torch_npu/csrc/core/npu/interface/AclInterface.h @@ -138,7 +138,7 @@ aclError AclrtDestroyStreamForce(aclrtStream stream); aclError AclrtGetDeviceUtilizationRate(int32_t deviceId, aclrtUtilizationInfo *utilizationInfo); aclError AclrtMallocAlign32(void **devPtr, size_t &size, aclrtMemMallocPolicy policy, - std::vector allocSizeLevel, uint32_t *segmentSizeUsed); + std::vector allocSizeLevel, uint32_t *segmentUsedPageType); aclError AclrtStreamQuery(aclrtStream stream, aclrtStreamStatus *status); @@ -150,7 +150,7 @@ aclError AclrtReserveMemAddress(void **virPtr, size_t size, size_t alignment, vo aclError AclrtReleaseMemAddress(void *virPtr, HcclComm hcclComm = nullptr); aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t &size, const aclrtPhysicalMemProp *prop, uint64_t flags, - std::vector segmentSizeLevel, uint32_t *segmentSizeUsed); + std::vector segmentSizeLevel, uint32_t *segmentUsedPageType); aclError AclrtFreePhysical(aclrtDrvMemHandle handle); -- Gitee From f793cd3c9fc5d1cc84781796b61aa76171ab092b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=85=B3=E9=BE=99=E9=94=8B?= Date: Wed, 9 Jul 2025 11:28:22 +0800 Subject: [PATCH 6/8] dd --- .../csrc/core/npu/NPUCachingAllocator.cpp | 57 +++++++++---------- 1 file changed, 27 insertions(+), 30 
deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 2d789e59a7..74a3a4ea60 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -228,7 +228,6 @@ int realUseAllocPageMem(){ return 0; } - struct Block; struct PrivatePool; using Comparison = bool (*)(const Block *, const Block *); @@ -421,13 +420,13 @@ bevhavior for allocator tensors that need to be used cross-process. */ struct ExpandableSegment { - ExpandableSegment(int device, aclrtStream stream, size_t size,size_t blockCout) + ExpandableSegment(int device, aclrtStream stream, size_t size,size_t blockCount) : device_(device), stream_(stream), max_handles_(0), // 2MB for small pool, 20MB for large pool segment_size_(size), - segment_block_count_(blockCout) + segment_block_count_(blockCount) { size_t device_free; size_t device_total; @@ -447,11 +446,11 @@ struct ExpandableSegment { max_handles_ = numSegments(kLargePoolVirAddrSize); } } - - NPU_CHECK_ERROR( - c10_npu::acl::AclrtReserveMemAddress(&ptr_, segment_block_count_*segment_size_ * max_handles_, 0, nullptr, 1, getHcclComm())); - ASCEND_LOGD("NPUCachingAllocator malloc by AclrtReserveMemAddress: size=%zu, max_handles_=%zu,segment_size=%zu,segment_block_count_=%zu", - segment_block_count_*segment_size_ * max_handles_, max_handles_,segment_size_,segment_block_count_); + size_t totalReserveMemSize = segment_block_count_ * segment_size_ * max_handles_; + NPU_CHECK_ERROR(c10_npu::acl::AclrtReserveMemAddress(&ptr_, totalReserveMemSize, 0, nullptr, 1, getHcclComm())); + ASCEND_LOGD( + "NPUCachingAllocator malloc by AclrtReserveMemAddress: size=%zu,segment_size=%zu,segment_block_count_=%zu", + totalReserveMemSize, segment_size_, segment_block_count_); } // begin must be aligned to segment_size_. 
// returns the actual range mapped, which may be @@ -459,10 +458,9 @@ struct ExpandableSegment { // return size of 0 indicates OOM SegmentRange map(SegmentRange range) { + // 新的内存申请时,使用降级后的标记 if (realUseAllocPageMem() == used1GPage) { segment_block_count_ = used1GPage; - } else if (realUseAllocPageMem() == used16GPage) { - segment_block_count_ = used16GPage; } auto begin = segmentLeft(range.ptr); auto end = segmentRight(range.ptr + range.size); @@ -473,9 +471,8 @@ struct ExpandableSegment { while (end > handles_.size()) { handles_.emplace_back(c10::nullopt); } - size_t segment_len = end - begin; - long remain_size = range.size; - auto current_segment_size = segment_size_*segment_block_count_; + long remainSize = range.size; + auto needMallocSegmentSize = segment_size_ * segment_block_count_; size_t realEnd = end; for (auto i : c10::irange(begin, end)) { // TORCH_INTERNAL_ASSERT(!handles_.at(i), PTA_ERROR(ErrCode::INTERNAL)); @@ -495,13 +492,16 @@ struct ExpandableSegment { prop.reserve = 0; // 如果是16g的头部块,则进行物理内存的申请。或者如果失败了,则降级到1G size_t headId = getSegmentBlockHead(begin, end, i); - ASCEND_LOGD("NPUCachingAllocator map:headId=%zu, i=%zu, current_segment_size=%zu, remain_size=%zu", headId, - i, current_segment_size, remain_size); - if (headId == i && remain_size >= 0) { - auto status = c10_npu::acl::AclrtMallocPhysical(&handle, current_segment_size, &prop, 0, + ASCEND_LOGD("NPUCachingAllocator map:headId=%zu, i=%zu, current_segment_size=%zu, remainSize=%zu", headId, + i, needMallocSegmentSize, remainSize); + if (headId == i && remainSize >= 0) { + auto status = c10_npu::acl::AclrtMallocPhysical(&handle, needMallocSegmentSize, &prop, 0, {kExtraLargeBuffer}, &segmentUsedPageType); if (status == ACL_ERROR_RT_MEMORY_ALLOCATION) { for (auto j: c10::irange(begin, i)) { + if (handles_.at(i) == c10::nullopt) { + continue; + } auto h = handles_.at(j).value().drvMem_Handle; handles_.at(j) = c10::nullopt; NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); @@ -510,8 +510,8 @@ 
struct ExpandableSegment { return rangeFromHandles(begin, begin); } NPU_CHECK_ERROR(status, "aclrtMallocPhysical"); - remain_size = remain_size - current_segment_size; - usedSegmentSizes_.push_back(current_segment_size); + remainSize = remainSize - needMallocSegmentSize; + usedSegmentSizes_.push_back(needMallocSegmentSize); segmentHandles.segment_block_head = headId; segmentHandles.drvMem_Handle = handle; segmentHandles.segment_size = segment_size_; @@ -520,7 +520,7 @@ struct ExpandableSegment { handles_.at(i) = segmentHandles; } - if (remain_size < 0) { + if (remainSize <= 0) { realEnd = i; break; } @@ -532,15 +532,13 @@ struct ExpandableSegment { continue; } SegmentHandles segmentHandles = handles_.at(i).value(); - if (segmentHandles.drvMem_Handle != nullptr) { - size_t usedSegmentSizes = segmentHandles.segment_size * segmentHandles.segment_block_count; - NPU_CHECK_ERROR(c10_npu::acl::AclrtMapMem(start_ptr, usedSegmentSizes, 0, - segmentHandles.drvMem_Handle, 0, getHcclComm())); - start_ptr = start_ptr + usedSegmentSizes; - total_size = total_size + usedSegmentSizes; - } + size_t usedSegmentSizes = segmentHandles.segment_size * segmentHandles.segment_block_count; + NPU_CHECK_ERROR(c10_npu::acl::AclrtMapMem(start_ptr, usedSegmentSizes, 0, segmentHandles.drvMem_Handle, 0, + getHcclComm())); + start_ptr = start_ptr + usedSegmentSizes; + total_size = total_size + usedSegmentSizes; } - ASCEND_LOGD("NPUCachingAllocator map total_size=%zu", total_size); + ASCEND_LOGD("NPUCachingAllocator map: total_size=%zu", total_size); return SegmentRange(range.ptr, total_size); } @@ -2298,11 +2296,10 @@ private: return c; } } - size_t segment_block_count = used1GPage; + size_t segment_block_count = 1; auto segment_size = pool->is_small ? kSmallBuffer : (c10_npu::option::OptionsManager::IsHcclZeroCopyEnable() ? 
kLargeBufferForHccl : kLargeBuffer); - // 此处申请虚拟内存,segment_size是页大小,实际虚拟内存巨大 if (realUseAllocPageMem() == used1GPage) { segment_size = kExtraLargeBuffer; segment_block_count = used1GPage; -- Gitee From 4e894de399a0159e32717067a2f32a780c2e1e73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=85=B3=E9=BE=99=E9=94=8B?= Date: Wed, 9 Jul 2025 14:32:15 +0800 Subject: [PATCH 7/8] c --- torch_npu/csrc/core/npu/NPUCachingAllocator.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 74a3a4ea60..64534743a5 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -619,8 +619,8 @@ private: aclrtDrvMemHandle h = segmentHandles.drvMem_Handle; NPU_CHECK_ERROR(c10_npu::acl::AclrtUnmapMem((char *) ptr_ + segment_size_ * i, getHcclComm())); NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); - ASCEND_LOGD("NPUCachingAllocator unmap: i=%zu,begin=%zu,end=%zu", i, segmentBlockHead, - segmentBlockCount + segmentBlockHead); + ASCEND_LOGD("NPUCachingAllocator unmapHandles: i=%zu,begin=%zu,end=%zu", i, segmentBlockHead, + segmentBlockHead + segmentBlockCount); handles_.at(i) = c10::nullopt; } @@ -657,7 +657,6 @@ private: size_t numSegments(size_t size) { - ASCEND_LOGD("numSegments unmap: size=%zu,segment_size_=%zu,segment_block_count_=%zu,end=%zu", size,segment_size_,segment_block_count_); return (size + segment_size_*segment_block_count_ - 1) / (segment_size_*segment_block_count_); } @@ -670,7 +669,7 @@ private: size_t segmentRight(char *p) { auto size = p - ptr(); - return numSegments(size)*segment_block_count_; + return numSegments(size) * segment_block_count_; } size_t unmapSegmentRight(char *p) @@ -1075,7 +1074,7 @@ size_t CachingAllocatorConfig::parsePageSize(const std::vector &con m_page_size_1g = true; } else if(config[i + 2] == "16g"){ m_page_size_16g = true; - segmentUsedPageType=used16GPage; 
+                segmentUsedPageType = used16GPage; }else { TORCH_CHECK(false, "Unsupported page_size value: ", config[i + 2], OPS_ERROR(ErrCode::VALUE)); } @@ -2326,7 +2325,6 @@ private: TORCH_INTERNAL_ASSERT(!to_map->mapped && size <= to_map->size, PTA_ERROR(ErrCode::INTERNAL)); TORCH_INTERNAL_ASSERT(!to_map->context_when_allocated); // unmapped blocks should not keep // history - BlockPool &pool = *to_map->pool; auto mapped_range = to_map->expandable_segment_->map(SegmentRange{ to_map->ptr, size }); // failed to map the memory if (mapped_range.size == 0) { @@ -2335,6 +2333,7 @@ private: TORCH_INTERNAL_ASSERT(mapped_range.ptr == to_map->ptr && mapped_range.size >= size, PTA_ERROR(ErrCode::INTERNAL)); + BlockPool &pool = *to_map->pool; pool.unmapped.erase(to_map); to_map->mapped = true; -- Gitee From 0efeb9eb8fd2d3adc6a96da8d873f23673ea73d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=85=B3=E9=BE=99=E9=94=8B?= Date: Wed, 9 Jul 2025 14:59:18 +0800 Subject: [PATCH 8/8] Make ExpandableSegment stream parameter std::optional --- torch_npu/csrc/core/npu/NPUCachingAllocator.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 64534743a5..11a8eab6c9 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -420,7 +420,11 @@ bevhavior for allocator tensors that need to be used cross-process. */ struct ExpandableSegment { - ExpandableSegment(int device, aclrtStream stream, size_t size,size_t blockCount) + ExpandableSegment( + int device, + std::optional<aclrtStream> stream, + size_t size, + size_t blockCount) : device_(device), stream_(stream), max_handles_(0), -- Gitee