From 935e03829be7f5a51c8aad3adb908187cbe8e5df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=85=B3=E9=BE=99=E9=94=8B?= Date: Thu, 12 Jun 2025 15:50:37 +0800 Subject: [PATCH 1/8] add config PYTORCH_NPU_ALLOC_CONF = page_size:16g --- .../pluggable_allocator_extensions.cpp | 2 +- third_party/acl/inc/acl/acl_rt.h | 2 + .../csrc/core/npu/NPUCachingAllocator.cpp | 311 ++++++++++++++---- torch_npu/csrc/core/npu/NPUCachingAllocator.h | 2 + .../csrc/core/npu/NPUWorkspaceAllocator.cpp | 5 +- .../csrc/core/npu/interface/AclInterface.cpp | 62 +++- .../csrc/core/npu/interface/AclInterface.h | 7 +- torch_npu/csrc/npu/Stress_detect.cpp | 6 +- 8 files changed, 323 insertions(+), 74 deletions(-) diff --git a/test/cpp_extensions/pluggable_allocator_extensions.cpp b/test/cpp_extensions/pluggable_allocator_extensions.cpp index c17077116b..4bf1ff1099 100644 --- a/test/cpp_extensions/pluggable_allocator_extensions.cpp +++ b/test/cpp_extensions/pluggable_allocator_extensions.cpp @@ -14,7 +14,7 @@ static bool useflag = false; void* my_malloc(ssize_t size, int device, aclrtStream stream) { void *ptr; - aclrtMallocAlign32(&ptr, size, aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMallocAlign32(&ptr, size, aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST, {}); std::cout<<"alloc ptr = "<(StatType::NUM_TYPES)>; @@ -140,36 +146,89 @@ void update_stat_array(StatArray &stat_array, int64_t amount, const StatTypes &s [&stat_array, amount](size_t stat_type) { update_stat(stat_array[stat_type], amount); }); } -bool IsMallocPage1GMem(bool is_small_pool) + + +bool IsSupport1GVersion() { static bool is_support_page_size_1g = []() { - if (!c10_npu::NPUCachingAllocator::isConfig1GPageSizeEnable()) { + if (!IsGteCANNVersion(kMinCannVersion, kCannModule)) { + TORCH_NPU_WARN_ONCE("The application for 1G large-page physical memory failed. " + "Using the HUGE_MEM memory page allocation method may result in performance degradation. 
" + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g " + "or PYTORCH_NPU_ALLOC_CONF = page_size:16g configuration is enabled, " + "but the current CANN version does not support this feature. " + "Please upgrade the CANN package version."); return false; } - if (!IsGteCANNVersion(kMinCannVersion, kCannModule)) { + if (!IsGteDriverVersion(kMinDriverVersion)) { TORCH_NPU_WARN_ONCE("The application for 1G large-page physical memory failed. " "Using the HUGE_MEM memory page allocation method may result in performance degradation. " - "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g configuration is enabled, " + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g " + "or PYTORCH_NPU_ALLOC_CONF = page_size:16g configuration is enabled, " "but the current driver version does not support this feature. " + "Please upgrade the HDK(driver) package version."); + return false; + } + return true; + }(); + return is_support_page_size_1g; +} + +bool IsSupport16GVersion() +{ + static bool is_support_page_size_16g = []() { + if (!IsGteCANNVersion(kMinCannVersionSupport16G, kCannModule)) { + TORCH_NPU_WARN_ONCE("The application for 16G large-page physical memory failed. " + "Using the ACL_HBM_MEM_HUGE1G memory page allocation method may result in performance degradation. " + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:16g configuration is enabled, " + "but the current CANN version does not support this feature. " "Please upgrade the CANN package version."); return false; } - if (!IsGteDriverVersion(kMinDriverVersion)) { - TORCH_NPU_WARN_ONCE("The application for 1G large-page physical memory failed. " + if (!IsGteDriverVersion(kMinDriverVersionSupport16G)) { + TORCH_NPU_WARN_ONCE("The application for 16G large-page physical memory failed. " "Using the HUGE_MEM memory page allocation method may result in performance degradation. 
" - "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g configuration is enabled, " + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:16g configuration is enabled, " "but the current driver version does not support this feature. " "Please upgrade the HDK(driver) package version."); return false; } return true; }(); + return is_support_page_size_16g; +} + +int realUseAllocPageMem(){ + if(c10_npu::NPUCachingAllocator::isConfig16GPageSizeEnable()){ - return !is_small_pool && is_support_page_size_1g; + // 满足16g版本并且有足够的16g大页块申请 + if (IsSupport16GVersion() && segmentSizeUsed == used16GPage) { + return used16GPage; + }; + if(IsSupport1GVersion()){ + TORCH_NPU_WARN_ONCE("The application for 16G large-page physical memory failed. " + "Using the ACL_HBM_MEM_HUGE1G memory page allocation method may result in performance degradation. " + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:16g configuration is enabled, " + "but the current driver and CANN version does not support this feature. " + "Please upgrade the HDK(driver) and CANN package version." + "The minimum version of the HDK (driver) needs to be greater than 25.1.RC1 " + "and The minimum version of the CANN needs to be greater than 8.2.RC1." + "Or there may not be enough 16g virtual memory blocks allocated."); + segmentSizeUsed=used1GPage; + return used1GPage; + }; + } + if(c10_npu::NPUCachingAllocator::isConfig1GPageSizeEnable()){ + if(IsSupport1GVersion()){ + return used1GPage; + }; + } + return 0; } + struct Block; struct PrivatePool; using Comparison = bool (*)(const Block *, const Block *); @@ -278,6 +337,13 @@ struct SegmentRange { }; +struct SegmentHandles { + aclrtDrvMemHandle drvMem_Handle; + size_t segment_size; + size_t segment_block_count; + size_t segment_block_head; +}; + /* Note [Expandable Segments] Rationale @@ -355,12 +421,13 @@ bevhavior for allocator tensors that need to be used cross-process. 
*/ struct ExpandableSegment { - ExpandableSegment(int device, aclrtStream stream, size_t size) + ExpandableSegment(int device, aclrtStream stream, size_t size,size_t blockCout) : device_(device), stream_(stream), max_handles_(0), // 2MB for small pool, 20MB for large pool - segment_size_(size) + segment_size_(size), + segment_block_count_(blockCout) { size_t device_free; size_t device_total; @@ -382,9 +449,9 @@ struct ExpandableSegment { } NPU_CHECK_ERROR( - c10_npu::acl::AclrtReserveMemAddress(&ptr_, segment_size_ * max_handles_, 0, nullptr, 1, getHcclComm())); - ASCEND_LOGD("NPUCachingAllocator malloc by AclrtReserveMemAddress: size=%zu, segment_size=%zu", - segment_size_ * max_handles_, segment_size_); + c10_npu::acl::AclrtReserveMemAddress(&ptr_, segment_block_count_*segment_size_ * max_handles_, 0, nullptr, 1, getHcclComm())); + ASCEND_LOGD("NPUCachingAllocator malloc by AclrtReserveMemAddress: size=%zu, max_handles_=%zu,segment_size=%zu,segment_block_count_=%zu", + segment_block_count_*segment_size_ * max_handles_, max_handles_,segment_size_,segment_block_count_); } // begin must be aligned to segment_size_. 
// returns the actual range mapped, which may be @@ -392,6 +459,11 @@ struct ExpandableSegment { // return size of 0 indicates OOM SegmentRange map(SegmentRange range) { + if (realUseAllocPageMem() == used1GPage) { + segment_block_count_ = used1GPage; + } else if (realUseAllocPageMem() == used16GPage) { + segment_block_count_ = used16GPage; + } auto begin = segmentLeft(range.ptr); auto end = segmentRight(range.ptr + range.size); TORCH_INTERNAL_ASSERT(ptr() + begin * segment_size_ == range.ptr, PTA_ERROR(ErrCode::PTR)); @@ -401,35 +473,86 @@ struct ExpandableSegment { while (end > handles_.size()) { handles_.emplace_back(c10::nullopt); } + size_t segment_len = end - begin; + long remain_size = range.size; + ASCEND_LOGD("NPUCachingAllocator map:begin=%zu, end=%zu, remain_size=%zu",begin,end, remain_size); + auto current_segment_size = segment_size_*segment_block_count_; + size_t realEnd = end; for (auto i : c10::irange(begin, end)) { - TORCH_INTERNAL_ASSERT(!handles_.at(i), PTA_ERROR(ErrCode::INTERNAL)); + //伪装降级 +// if(segmentSizeUsed=16&&i=!0){ +// current_segment_size=kExtraLargeBuffer; +// segmentSizeUsed=1; +// } + // TORCH_INTERNAL_ASSERT(!handles_.at(i), PTA_ERROR(ErrCode::INTERNAL)); + SegmentHandles segmentHandles; aclrtDrvMemHandle handle = nullptr; aclrtPhysicalMemProp prop = {}; prop.handleType = ACL_MEM_HANDLE_TYPE_NONE; prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED; - prop.memAttr = (segment_size_ == kExtraLargeBuffer) ? 
ACL_HBM_MEM_HUGE1G : ACL_HBM_MEM_HUGE; + prop.memAttr = ACL_HBM_MEM_HUGE; + if (realUseAllocPageMem() == used1GPage) { + prop.memAttr = ACL_HBM_MEM_HUGE1G; + } else if (realUseAllocPageMem() == used16GPage) { + prop.memAttr = ACL_HBM_MEM_HUGE16G; + } prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE; prop.location.id = static_cast(device_); prop.reserve = 0; - auto status = c10_npu::acl::AclrtMallocPhysical(&handle, segment_size_, &prop, 0); - if (status == ACL_ERROR_RT_MEMORY_ALLOCATION) { - for (auto j : c10::irange(begin, i)) { - auto h = handles_.at(j).value(); - handles_.at(j) = c10::nullopt; - NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); + // 如果是16g的头部块,则进行物理内存的申请。或者如果失败了,则降级到1G + size_t headId = getSegmentBlockHead(begin,end,i); + ASCEND_LOGD("NPUCachingAllocator map:headId=%zu, i=%zu, current_segment_size=%zu, remain_size=%zu",headId,i, current_segment_size,remain_size); + if (headId == i && remain_size >= 0) { + std::vector segment_size_level; + segment_size_level.push_back(kExtraLargeBuffer); + auto status = c10_npu::acl::AclrtMallocPhysical(&handle, current_segment_size, &prop, 0, + segment_size_level, + &segmentSizeUsed, 0); + if (status == ACL_ERROR_RT_MEMORY_ALLOCATION) { + for (auto j: c10::irange(begin, i)) { + auto h = handles_.at(j).value().drvMem_Handle; + handles_.at(j) = c10::nullopt; + NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); + } + trimHandles(); + return rangeFromHandles(begin, begin); } - trimHandles(); - return rangeFromHandles(begin, begin); + NPU_CHECK_ERROR(status, "aclrtMallocPhysical"); + remain_size = remain_size - current_segment_size; + usedSegmentSizes_.push_back(current_segment_size); + segmentHandles.segment_block_head = headId; + segmentHandles.drvMem_Handle=handle; + segmentHandles.segment_size=segment_size_; + segmentHandles.segment_block_count=segmentSizeUsed; + handles_.at(i) = segmentHandles; + } + + if(remain_size<0){ + realEnd=i; + break; } - NPU_CHECK_ERROR(status, "aclrtMallocPhysical"); - 
handles_.at(i) = handle; } - for (auto i : c10::irange(begin, end)) { - NPU_CHECK_ERROR(c10_npu::acl::AclrtMapMem((char *)ptr_ + i * segment_size_, segment_size_, 0, - handles_.at(i).value(), 0, getHcclComm())); + auto start_ptr = range.ptr; + size_t total_size = 0; + for (auto i: c10::irange(begin, end)) { + if(i>realEnd){ + continue; + } + SegmentHandles segmentHandles=handles_.at(i).value(); + ASCEND_LOGD("NPUCachingAllocator map:i=%zu, segmentHandles.segment_size=%zu, segmentHandles.drvMem_Handle=%zu",i, + segmentHandles.segment_size*segmentHandles.segment_block_count,segmentHandles.drvMem_Handle==nullptr); + if(segmentHandles.drvMem_Handle!=nullptr){ + size_t usedSegmentSizes= segmentHandles.segment_size*segmentHandles.segment_block_count; + NPU_CHECK_ERROR(c10_npu::acl::AclrtMapMem(start_ptr, usedSegmentSizes, 0, + segmentHandles.drvMem_Handle, 0, getHcclComm())); + start_ptr=start_ptr+usedSegmentSizes; + total_size = total_size+usedSegmentSizes; + ASCEND_LOGD("NPUCachingAllocator map:i=%zu, segment_size=%zu,total_size=%zu",i, usedSegmentSizes_[i],total_size); + } } - ASCEND_LOGD("NPUCachingAllocator map: segment_size=%zu", segment_size_); - return rangeFromHandles(begin, end); + //return rangeFromHandles(begin, end); + ASCEND_LOGD("NPUCachingAllocator ================>total_size=%zu", total_size); + return SegmentRange(range.ptr, total_size); } // unmaps all the completely empty segment_size_ segments between @@ -437,13 +560,14 @@ struct ExpandableSegment { // and the actual size unmapped (multiple of segment_size_) SegmentRange unmap(SegmentRange range) { - auto begin = segmentRight(range.ptr); + auto begin = unmapSegmentRight(range.ptr); auto end = segmentLeft(range.ptr + range.size); + ASCEND_LOGD("NPUCachingAllocator =unmap===============>begin=%zu,end=%zu,size=%zu", begin,end,range.size); if (begin >= end) { return SegmentRange{ range.ptr, 0 }; } - unmapHandles(begin, end); - return rangeFromHandles(begin, end); + return unmapHandles(begin, end); +// 
return rangeFromHandles(begin, end); } char *ptr() const @@ -453,7 +577,7 @@ struct ExpandableSegment { size_t size() const { - return max_handles_ * segment_size_; + return max_handles_ * segment_size_ * segment_block_count_; } void setHcclComm(std::shared_ptr hcclComm) @@ -464,7 +588,7 @@ struct ExpandableSegment { segment_size_ * max_handles_, 0, 1)); for (auto i : c10::irange(handles_.size())) { HCCL_CHECK_ERROR(at_npu::hccl::HcclCommActivateCommMemoryFace(hcclComm_->getHcclComm(), - (char *)ptr_ + i * segment_size_, segment_size_, 0, handles_.at(i).value(), 0)); + (char *)ptr_ + i * segment_size_, segment_size_, 0, handles_.at(i).value().drvMem_Handle, 0)); } } @@ -476,7 +600,7 @@ struct ExpandableSegment { } private: - void unmapHandles(size_t begin, size_t end) + SegmentRange unmapHandles(size_t begin, size_t end) { // note: unlike aclrtFree, MemUnmap and MemRelease do // not appear to synchronize in all cases, so we have to wait for the @@ -492,14 +616,39 @@ private: trigger->traceNpuStreamSynchronization(reinterpret_cast(stream_)); } #endif + size_t beginUnmap=0; + size_t endUnmap=0; for (auto i : c10::irange(begin, end)) { - aclrtDrvMemHandle h = handles_.at(i).value(); - handles_.at(i) = c10::nullopt; - NPU_CHECK_ERROR(c10_npu::acl::AclrtUnmapMem((char *)ptr_ + segment_size_ * i, getHcclComm())); - NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); + ASCEND_LOGD("NPUCachingAllocator unmap: handles_.at(i)=%zu", handles_.at(i)==c10::nullopt); + if (handles_.at(i) == c10::nullopt) { + continue; + } + SegmentHandles segmentHandles=handles_.at(i).value(); + // i=headid,&&segment_block_count+i<=end===>释放这个handle + size_t segmentBlockCount=segmentHandles.segment_block_count; + size_t segmentBlockHead=segmentHandles.segment_block_head; + ASCEND_LOGD("NPUCachingAllocator unmap: i=%zu,segment_size=%zu,begin=%zu,end=%zu", i,begin,end); + if(i==segmentBlockHead&&segmentBlockCount+i<=end){ + if(beginUnmap==0){ + beginUnmap=segmentBlockHead; + } + 
endUnmap=segmentBlockHead+segmentBlockCount; + aclrtDrvMemHandle h = segmentHandles.drvMem_Handle; + NPU_CHECK_ERROR(c10_npu::acl::AclrtUnmapMem((char *)ptr_ + segment_size_ * i, getHcclComm())); + NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); + ASCEND_LOGD("NPUCachingAllocator unmap: i=%zu,segment_size=%zu,begin=%zu,end=%zu", i,segment_size_,begin,end); + handles_.at(i) = c10::nullopt; + } + } - ASCEND_LOGD("NPUCachingAllocator unmap: segment_size=%zu", segment_size_); trimHandles(); + if (beginUnmap == 0 && endUnmap == 0) { + beginUnmap = begin; + endUnmap = begin; + } + ASCEND_LOGD("NPUCachingAllocator unmapHandles: beginUnmap=%zu,endUnmap=%zu", beginUnmap,endUnmap); + return rangeFromHandles(beginUnmap, endUnmap); + //return SegmentRange(beginUnmap, endUnmap); } void trimHandles() @@ -524,7 +673,8 @@ private: size_t numSegments(size_t size) { - return (size + segment_size_ - 1) / segment_size_; + ASCEND_LOGD("numSegments unmap: size=%zu,segment_size_=%zu,segment_block_count_=%zu,end=%zu", size,segment_size_,segment_block_count_); + return (size + segment_size_*segment_block_count_ - 1) / (segment_size_*segment_block_count_); } size_t segmentLeft(char *p) @@ -536,11 +686,20 @@ private: size_t segmentRight(char *p) { auto size = p - ptr(); - return numSegments(size); + return numSegments(size)*segment_block_count_; } - SegmentRange rangeFromHandles(size_t begin, size_t end) + size_t unmapSegmentRight(char *p) { + auto size = p - ptr(); + return (size + segment_size_- 1) / (segment_size_); + } + + size_t getSegmentBlockHead(size_t begin, size_t end, size_t index) { + return ((index - begin) / segmentSizeUsed) * segmentSizeUsed + begin; + } + + SegmentRange rangeFromHandles(size_t begin, size_t end) { return SegmentRange(ptr() + segment_size_ * begin, segment_size_ * (end - begin)); } @@ -557,7 +716,9 @@ private: void *ptr_{}; size_t max_handles_; size_t segment_size_; - std::vector> handles_; + size_t segment_block_count_; + std::vector usedSegmentSizes_; + 
std::vector> handles_; std::shared_ptr hcclComm_; }; @@ -776,6 +937,10 @@ public: { return instance().m_page_size_1g; } + static bool page_size_16g_enable() + { + return instance().m_page_size_16g; + } static CachingAllocatorConfig &instance() { @@ -797,6 +962,7 @@ private: bool set_expandable_segments_flag = false; size_t m_base_addr_aligned_size = kAlignRoundLarge; bool m_page_size_1g = false; // 新增1G页配置标志 + bool m_page_size_16g = false; // 新增1G页配置标志 CachingAllocatorConfig() : m_max_split_size(std::numeric_limits::max()), @@ -923,7 +1089,10 @@ size_t CachingAllocatorConfig::parsePageSize(const std::vector &con if (config[i + 2] == "1g") { m_page_size_1g = true; - } else { + } else if(config[i + 2] == "16g"){ + m_page_size_16g = true; + segmentSizeUsed=used16GPage; + }else { TORCH_CHECK(false, "Unsupported page_size value: ", config[i + 2], OPS_ERROR(ErrCode::VALUE)); } return i + 2; // 返回最后处理的索引位置 @@ -987,6 +1156,10 @@ bool isConfig1GPageSizeEnable() return CachingAllocatorConfig::page_size_1g_enable(); } +bool isConfig16GPageSizeEnable() +{ + return CachingAllocatorConfig::page_size_16g_enable(); +} // To prevent the deadlock situation, temporarily release the lock. // // Deadlock Scenario Description: @@ -1230,12 +1403,19 @@ public: process_events(context); } auto size = round_size(orig_size); + + // 如果开启了1g或者16G大页,则默认使用大内存池,进行分配内存,不再根据内存小,而优先选择小内存池的场景了 auto &pool = get_pool(size, stream); - // 开环境变量 大池子放1G内存块 - const size_t alloc_size = IsMallocPage1GMem(pool.is_small) ? 
- kExtraLargeBuffer * ((size + kExtraLargeBuffer - 1) / kExtraLargeBuffer) : - get_allocation_size(size); + // 开环境变量 进行16G和1G的内存对齐,如果没有配置就使用普通内存对齐get_allocation_size + size_t alloc_size = 0; + if (realUseAllocPageMem() == used1GPage) { + alloc_size = kExtraLargeBuffer * ((size + kExtraLargeBuffer - 1) / kExtraLargeBuffer); + } else if (realUseAllocPageMem() == used16GPage) { + alloc_size = kExtraLarge16GBuffer * ((size + kExtraLarge16GBuffer - 1) / kExtraLarge16GBuffer); + } else { + alloc_size = get_allocation_size(size); + } AllocParams params(device, size, stream, &pool, alloc_size, stats); params.stat_types = get_stat_types_for_pool(pool); @@ -2132,14 +2312,19 @@ private: return c; } } + size_t segment_block_count = used1GPage; auto segment_size = pool->is_small ? kSmallBuffer : (c10_npu::option::OptionsManager::IsHcclZeroCopyEnable() ? kLargeBufferForHccl : kLargeBuffer); // 此处申请虚拟内存,segment_size是页大小,实际虚拟内存巨大 - if (IsMallocPage1GMem(pool->is_small)) { + if (realUseAllocPageMem() == used1GPage) { segment_size = kExtraLargeBuffer; + segment_block_count = used1GPage; + } else if (realUseAllocPageMem() == used16GPage) { + segment_size = kExtraLargeBuffer; + segment_block_count = used16GPage; } - auto segment = new ExpandableSegment(device, stream, segment_size); + auto segment = new ExpandableSegment(device, stream, segment_size, segment_block_count); if (hcclComm_) { segment->setHcclComm(hcclComm_); } @@ -2158,6 +2343,7 @@ private: TORCH_INTERNAL_ASSERT(!to_map->mapped && size <= to_map->size, PTA_ERROR(ErrCode::INTERNAL)); TORCH_INTERNAL_ASSERT(!to_map->context_when_allocated); // unmapped blocks should not keep // history + BlockPool &pool = *to_map->pool; auto mapped_range = to_map->expandable_segment_->map(SegmentRange{ to_map->ptr, size }); // failed to map the memory if (mapped_range.size == 0) { @@ -2166,7 +2352,6 @@ private: TORCH_INTERNAL_ASSERT(mapped_range.ptr == to_map->ptr && mapped_range.size >= size, PTA_ERROR(ErrCode::INTERNAL)); - BlockPool &pool 
= *to_map->pool; pool.unmapped.erase(to_map); to_map->mapped = true; @@ -2333,6 +2518,9 @@ private: if (entry.second(stream)) { auto it1 = graph_pools.find(entry.first); TORCH_INTERNAL_ASSERT(it1 != graph_pools.end()); + if (realUseAllocPageMem() > 0) { + return it1->second->large_blocks; + } if (size <= kSmallSize) { return it1->second->small_blocks; } else { @@ -2341,6 +2529,9 @@ private: } } } + if (realUseAllocPageMem() > 0) { + return large_blocks; + } if (size <= kSmallSize) { return small_blocks; } else { @@ -2543,11 +2734,16 @@ private: ptr = active_pool->allocator()->raw_alloc(size); p.err = ptr ? ACL_ERROR_NONE : ACL_ERROR_RT_MEMORY_ALLOCATION; } else { + std::vector alloc_size_level; + // 使用origin size进行1g对齐 + alloc_size_level.push_back(kExtraLargeBuffer * ((p.size() + kExtraLargeBuffer - 1) / kExtraLargeBuffer)); auto policy = aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST; - if (IsMallocPage1GMem(p.pool->is_small)) { + if (realUseAllocPageMem() == used1GPage) { policy = aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE1G_ONLY; + } else if (realUseAllocPageMem() == used16GPage) { + policy = aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE16G_ONLY; } - p.err = c10_npu::acl::AclrtMallocAlign32(&ptr, size, policy); + p.err = c10_npu::acl::AclrtMallocAlign32(&ptr, size, policy, alloc_size_level, &segmentSizeUsed); } if (p.err != ACL_ERROR_NONE) { return false; @@ -3293,7 +3489,7 @@ public: deleteFunc = &uncached_delete; size_t alloc_size = size + 32; NPU_CHECK_ERROR(c10_npu::acl::AclrtMallocAlign32(&devPtr, alloc_size, - aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST)); + aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST, {}, &segmentSizeUsed)); ASCEND_LOGD("Without NPUCachingAllocator, malloc by " "AclrtMallocAlign32: size=%zu", alloc_size); @@ -3322,7 +3518,8 @@ public: deleteFunc = &uncached_delete; size_t alloc_size = size + 32 + aligned; NPU_CHECK_ERROR(c10_npu::acl::AclrtMallocAlign32(&realPtr, alloc_size, - aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST)); + 
aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST, + {}, &segmentSizeUsed)); ASCEND_LOGD("Without NPUCachingAllocator, malloc by " "AclrtMallocAlign32: size=%zu", alloc_size); } else { diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.h b/torch_npu/csrc/core/npu/NPUCachingAllocator.h index a4e14d2232..daf1f07ece 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.h +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.h @@ -445,6 +445,8 @@ bool checkConfigExpandableSegments(); bool isConfig1GPageSizeEnable(); +bool isConfig16GPageSizeEnable(); + } // namespace NPUCachingAllocator } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp index fa4e79ff7b..9d3f8497f0 100644 --- a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp @@ -128,7 +128,7 @@ public: PTA_ERROR(ErrCode::MEMORY)); aclError err = c10_npu::acl::AclrtMallocAlign32( - &block->data_ptr, block->size, aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_ONLY); + &block->data_ptr, block->size, aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_ONLY, {}, nullptr); if (err != ACL_ERROR_NONE) { return nullptr; } @@ -510,7 +510,8 @@ public: if (size != 0) { size_t alloc_size = size + 32; NPU_CHECK_ERROR( - c10_npu::acl::AclrtMallocAlign32(&dev_ptr, alloc_size, aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_ONLY)); + c10_npu::acl::AclrtMallocAlign32(&dev_ptr, alloc_size, + aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_ONLY, {}, nullptr)); } } else { if (size != 0) { diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index b59e9c85c9..aa81bdebfb 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -433,9 +433,10 @@ aclError AclrtGetDeviceUtilizationRate(int32_t deviceId, aclrtUtilizationInfo *u return func(deviceId, utilizationInfo); } -aclError 
AclrtMallocAlign32(void **devPtr, size_t size, aclrtMemMallocPolicy policy) { - typedef aclError (*AclrtMallocAlign32)(void**, size_t, aclrtMemMallocPolicy); - static AclrtMallocAlign32 func = (AclrtMallocAlign32)GET_FUNC(aclrtMallocAlign32); +aclError AclrtMallocAlign32(void **devPtr, size_t &size, aclrtMemMallocPolicy policy, + std::vector allocSizeLevel, uint32_t *segmentSizeUsed) { + typedef aclError (*AclrtMallocAlign32)(void **, size_t, aclrtMemMallocPolicy); + static AclrtMallocAlign32 func = (AclrtMallocAlign32) GET_FUNC(aclrtMallocAlign32); aclError ret; if (func != nullptr) { ret = func(devPtr, size, policy); @@ -444,13 +445,35 @@ aclError AclrtMallocAlign32(void **devPtr, size_t size, aclrtMemMallocPolicy pol ret = aclrtMalloc(devPtr, size, policy); } - if (ret != ACL_RT_SUCCESS && (policy == aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE1G_ONLY)) { + if (ret != ACL_RT_SUCCESS && (policy == aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE16G_ONLY)) { + TORCH_NPU_WARN_ONCE("TThe malloc 16G large-page physical memory failed, " + "so try to malloc 1G large-page physical memory." + "Using the 1G large-page physical memory page may result in performance degradation. 
" + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:16g configuration " + "is enabled, but the pre-allocated number of 16G large pages is insufficient " + "or 16G large-page memory pre-allocation is not enabled."); + policy = aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE1G_ONLY; + // 进行size对齐的调整 + size = allocSizeLevel[0]; + // 16g失败,标志位设置成1 + *segmentSizeUsed = 1; + if (func != nullptr) { + ret = func(devPtr, size, policy); + } else { + TORCH_NPU_WARN_ONCE(func, "Failed to find function ", "aclrtMallocAlign32"); + ret = aclrtMalloc(devPtr, size, policy); + } + }; + if (ret != ACL_RT_SUCCESS && (policy == aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE16G_ONLY || + policy == aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE1G_ONLY)) { TORCH_NPU_WARN_ONCE("The malloc 1G large-page physical memory failed, so malloc 2M page memory." "Using the 2M memory page may result in performance degradation. " "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g configuration is " "enabled, but the pre-allocated number of 1G large pages is insufficient or 1G large-page " "memory pre-allocation is not enabled."); policy = aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST; + size = allocSizeLevel[0]; + *segmentSizeUsed = 1; if (func != nullptr) { ret = func(devPtr, size, policy); } else { @@ -521,16 +544,34 @@ aclError AclrtReleaseMemAddress(void *virPtr, HcclComm hcclComm) return ret; } -aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t size, const aclrtPhysicalMemProp *prop, - uint64_t flags) { - typedef aclError (*AclrtMallocPhysical)(aclrtDrvMemHandle*, size_t, const aclrtPhysicalMemProp*, uint64_t); +aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t &size, const aclrtPhysicalMemProp *prop, + uint64_t flags, std::vector segmentSizeLevel, uint32_t *segmentSizeUsed, + bool resetSegmentSizeFlags) { + typedef aclError (*AclrtMallocPhysical)(aclrtDrvMemHandle *, size_t, const aclrtPhysicalMemProp *, uint64_t); static AclrtMallocPhysical 
func = nullptr; if (func == nullptr) { - func = (AclrtMallocPhysical)GET_FUNC(aclrtMallocPhysical); + func = (AclrtMallocPhysical) GET_FUNC(aclrtMallocPhysical); } TORCH_CHECK(func, "Failed to find function ", "aclrtMallocPhysical", PTA_ERROR(ErrCode::NOT_FOUND)); aclError ret = func(handle, size, prop, flags); - if (ret != ACL_RT_SUCCESS && (prop->memAttr == ACL_HBM_MEM_HUGE1G)) { + if (ret != ACL_RT_SUCCESS && (prop->memAttr == ACL_HBM_MEM_HUGE16G)) { + TORCH_NPU_WARN_ONCE("The malloc 16G large-page physical memory failed, " + "so try to malloc 1G large-page physical memory." + "Using the 1G large-page physical memory page may result in performance degradation. " + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:16g configuration " + "is enabled, but the pre-allocated number of 16G large pages is insufficient " + "or 16G large-page memory pre-allocation is not enabled."); + aclrtPhysicalMemProp prop_update = {prop->handleType, + prop->allocationType, + ACL_HBM_MEM_HUGE1G, + {prop->location.id, + prop->location.type}, + prop->reserve}; + size = segmentSizeLevel[0]; + *segmentSizeUsed = 1; + ret = func(handle, size, &prop_update, flags); + }; + if (ret != ACL_RT_SUCCESS && (prop->memAttr == ACL_HBM_MEM_HUGE1G || prop->memAttr == ACL_HBM_MEM_HUGE16G)) { TORCH_NPU_WARN_ONCE("The malloc 1G large-page physical memory failed, so malloc 2M page memory." "Using the 2M memory page may result in performance degradation. 
" "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g configuration " @@ -542,6 +583,7 @@ aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t size, const aclrt {prop->location.id, prop->location.type}, prop->reserve}; + size = segmentSizeLevel[0]; ret = func(handle, size, &prop_update, flags); } return ret; @@ -819,7 +861,7 @@ aclError AclsysGetCANNVersion(aclCANNPackageName name, aclCANNPackageVersion *ve if (func == nullptr) { func = (aclsysGetCANNVersionFunc)GET_FUNC(aclsysGetCANNVersion); if (func == nullptr) { - return ACL_ERROR_RT_FEATURE_NOT_SUPPORT; + return ACL_ERROR_RT_FEATURE_NOT_SUPPORT; } } diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.h b/torch_npu/csrc/core/npu/interface/AclInterface.h index 9cdad2663b..7121fb1125 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.h +++ b/torch_npu/csrc/core/npu/interface/AclInterface.h @@ -137,7 +137,8 @@ aclError AclrtDestroyStreamForce(aclrtStream stream); aclError AclrtGetDeviceUtilizationRate(int32_t deviceId, aclrtUtilizationInfo *utilizationInfo); -aclError AclrtMallocAlign32(void **devPtr, size_t size, aclrtMemMallocPolicy policy); +aclError AclrtMallocAlign32(void **devPtr, size_t &size, aclrtMemMallocPolicy policy, + std::vector allocSizeLevel, uint32_t *segmentSizeUsed); aclError AclrtStreamQuery(aclrtStream stream, aclrtStreamStatus *status); @@ -148,7 +149,9 @@ aclError AclrtReserveMemAddress(void **virPtr, size_t size, size_t alignment, vo aclError AclrtReleaseMemAddress(void *virPtr, HcclComm hcclComm = nullptr); -aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t size, const aclrtPhysicalMemProp *prop, uint64_t flags); +aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t &size, const aclrtPhysicalMemProp *prop, uint64_t flags, + std::vector segmentSizeLevel, uint32_t *segmentSizeUsed, + bool resetSegmentSizeFlags); aclError AclrtFreePhysical(aclrtDrvMemHandle handle); diff --git a/torch_npu/csrc/npu/Stress_detect.cpp 
b/torch_npu/csrc/npu/Stress_detect.cpp index 3fcade819b..63301c5bc6 100644 --- a/torch_npu/csrc/npu/Stress_detect.cpp +++ b/torch_npu/csrc/npu/Stress_detect.cpp @@ -109,10 +109,12 @@ int StressDetector::perform_stress_detect(int deviceid) uint64_t size = 10; workspaceSize = size << 10 << 10 << 10; // Assume memory size if (workspaceSize > 0) { - auto ret = c10_npu::acl::AclrtMallocAlign32(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST); + auto ret = c10_npu::acl::AclrtMallocAlign32(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST, {}, + nullptr); if (ret != ACL_ERROR_NONE) { c10_npu::NPUCachingAllocator::emptyCache(); - ret = c10_npu::acl::AclrtMallocAlign32(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST); + ret = c10_npu::acl::AclrtMallocAlign32(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST, {}, + nullptr); if (ret != ACL_ERROR_NONE) { ASCEND_LOGW("call AclrtMallocAlign32 failed, ERROR : %d. Skip StressDetect.", ret); task_in_progress.store(false); // Task ends -- Gitee From 8e86e1f1dacbd9f0e952fd482da0901747cb0133 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=85=B3=E9=BE=99=E9=94=8B?= Date: Tue, 8 Jul 2025 19:17:02 +0800 Subject: [PATCH 2/8] c --- torch_npu/csrc/core/npu/NPUCachingAllocator.cpp | 15 ++++++++------- .../csrc/core/npu/interface/AclInterface.cpp | 1 + 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 70245b856c..cb600819ec 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -627,16 +627,17 @@ private: // i=headid,&&segment_block_count+i<=end===>释放这个handle size_t segmentBlockCount=segmentHandles.segment_block_count; size_t segmentBlockHead=segmentHandles.segment_block_head; - ASCEND_LOGD("NPUCachingAllocator unmap: i=%zu,segment_size=%zu,begin=%zu,end=%zu", i,begin,end); - if(i==segmentBlockHead&&segmentBlockCount+i<=end){ - 
if(beginUnmap==0){ - beginUnmap=segmentBlockHead; + ASCEND_LOGD("NPUCachingAllocator unmap: i=%zu,begin=%zu,end=%zu", i,begin,end); + if (i == segmentBlockHead && segmentBlockCount + i <= end) { + if (beginUnmap == 0) { + beginUnmap = segmentBlockHead; } - endUnmap=segmentBlockHead+segmentBlockCount; + endUnmap = segmentBlockHead + segmentBlockCount; aclrtDrvMemHandle h = segmentHandles.drvMem_Handle; - NPU_CHECK_ERROR(c10_npu::acl::AclrtUnmapMem((char *)ptr_ + segment_size_ * i, getHcclComm())); + NPU_CHECK_ERROR(c10_npu::acl::AclrtUnmapMem((char *) ptr_ + segment_size_ * i, getHcclComm())); NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); - ASCEND_LOGD("NPUCachingAllocator unmap: i=%zu,segment_size=%zu,begin=%zu,end=%zu", i,segment_size_,begin,end); + ASCEND_LOGD("NPUCachingAllocator unmap: i=%zu,segment_size=%zu,begin=%zu,end=%zu", i, segment_size_, + begin, end); handles_.at(i) = c10::nullopt; } diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index aa81bdebfb..48fbe75f95 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -584,6 +584,7 @@ aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t &size, const aclr prop->location.type}, prop->reserve}; size = segmentSizeLevel[0]; + *segmentSizeUsed = 1; ret = func(handle, size, &prop_update, flags); } return ret; -- Gitee From 923037123131c8eae640274f2eea32a25e26cab6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=85=B3=E9=BE=99=E9=94=8B?= Date: Tue, 8 Jul 2025 20:52:32 +0800 Subject: [PATCH 3/8] cc --- .../csrc/core/npu/NPUCachingAllocator.cpp | 76 ++++++++----------- .../csrc/core/npu/interface/AclInterface.cpp | 3 +- 2 files changed, 32 insertions(+), 47 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index cb600819ec..715109663d 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp 
+++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -475,16 +475,10 @@ struct ExpandableSegment { } size_t segment_len = end - begin; long remain_size = range.size; - ASCEND_LOGD("NPUCachingAllocator map:begin=%zu, end=%zu, remain_size=%zu",begin,end, remain_size); auto current_segment_size = segment_size_*segment_block_count_; size_t realEnd = end; for (auto i : c10::irange(begin, end)) { - //伪装降级 -// if(segmentSizeUsed=16&&i=!0){ -// current_segment_size=kExtraLargeBuffer; -// segmentSizeUsed=1; -// } - // TORCH_INTERNAL_ASSERT(!handles_.at(i), PTA_ERROR(ErrCode::INTERNAL)); +// TORCH_INTERNAL_ASSERT(!handles_.at(i), PTA_ERROR(ErrCode::INTERNAL)); SegmentHandles segmentHandles; aclrtDrvMemHandle handle = nullptr; aclrtPhysicalMemProp prop = {}; @@ -500,14 +494,12 @@ struct ExpandableSegment { prop.location.id = static_cast(device_); prop.reserve = 0; // 如果是16g的头部块,则进行物理内存的申请。或者如果失败了,则降级到1G - size_t headId = getSegmentBlockHead(begin,end,i); - ASCEND_LOGD("NPUCachingAllocator map:headId=%zu, i=%zu, current_segment_size=%zu, remain_size=%zu",headId,i, current_segment_size,remain_size); + size_t headId = getSegmentBlockHead(begin, end, i); + ASCEND_LOGD("NPUCachingAllocator map:headId=%zu, i=%zu, current_segment_size=%zu, remain_size=%zu", headId, + i, current_segment_size, remain_size); if (headId == i && remain_size >= 0) { - std::vector segment_size_level; - segment_size_level.push_back(kExtraLargeBuffer); auto status = c10_npu::acl::AclrtMallocPhysical(&handle, current_segment_size, &prop, 0, - segment_size_level, - &segmentSizeUsed, 0); + {kExtraLargeBuffer}, &segmentSizeUsed); if (status == ACL_ERROR_RT_MEMORY_ALLOCATION) { for (auto j: c10::irange(begin, i)) { auto h = handles_.at(j).value().drvMem_Handle; @@ -521,37 +513,35 @@ struct ExpandableSegment { remain_size = remain_size - current_segment_size; usedSegmentSizes_.push_back(current_segment_size); segmentHandles.segment_block_head = headId; - segmentHandles.drvMem_Handle=handle; - 
segmentHandles.segment_size=segment_size_; - segmentHandles.segment_block_count=segmentSizeUsed; + segmentHandles.drvMem_Handle = handle; + segmentHandles.segment_size = segment_size_; + segmentHandles.segment_block_count = segmentSizeUsed; + segment_block_count_ = segmentSizeUsed; handles_.at(i) = segmentHandles; } - if(remain_size<0){ - realEnd=i; + if (remain_size < 0) { + realEnd = i; break; } } auto start_ptr = range.ptr; size_t total_size = 0; for (auto i: c10::irange(begin, end)) { - if(i>realEnd){ + if (i > realEnd || handles_.at(i) == c10::nullopt) { continue; } - SegmentHandles segmentHandles=handles_.at(i).value(); - ASCEND_LOGD("NPUCachingAllocator map:i=%zu, segmentHandles.segment_size=%zu, segmentHandles.drvMem_Handle=%zu",i, - segmentHandles.segment_size*segmentHandles.segment_block_count,segmentHandles.drvMem_Handle==nullptr); - if(segmentHandles.drvMem_Handle!=nullptr){ - size_t usedSegmentSizes= segmentHandles.segment_size*segmentHandles.segment_block_count; + SegmentHandles segmentHandles = handles_.at(i).value(); + if (segmentHandles.drvMem_Handle != nullptr) { + size_t usedSegmentSizes = segmentHandles.segment_size * segmentHandles.segment_block_count; NPU_CHECK_ERROR(c10_npu::acl::AclrtMapMem(start_ptr, usedSegmentSizes, 0, segmentHandles.drvMem_Handle, 0, getHcclComm())); - start_ptr=start_ptr+usedSegmentSizes; - total_size = total_size+usedSegmentSizes; - ASCEND_LOGD("NPUCachingAllocator map:i=%zu, segment_size=%zu,total_size=%zu",i, usedSegmentSizes_[i],total_size); + start_ptr = start_ptr + usedSegmentSizes; + total_size = total_size + usedSegmentSizes; + ASCEND_LOGD("NPUCachingAllocator map:i=%zu", i); } } - //return rangeFromHandles(begin, end); - ASCEND_LOGD("NPUCachingAllocator ================>total_size=%zu", total_size); + ASCEND_LOGD("NPUCachingAllocator map total_size=%zu", total_size); return SegmentRange(range.ptr, total_size); } @@ -562,12 +552,11 @@ struct ExpandableSegment { { auto begin = unmapSegmentRight(range.ptr); auto 
end = segmentLeft(range.ptr + range.size); - ASCEND_LOGD("NPUCachingAllocator =unmap===============>begin=%zu,end=%zu,size=%zu", begin,end,range.size); + ASCEND_LOGD("NPUCachingAllocator start to unmap,begin=%zu,end=%zu,size=%zu", begin, end, range.size); if (begin >= end) { return SegmentRange{ range.ptr, 0 }; } return unmapHandles(begin, end); -// return rangeFromHandles(begin, end); } char *ptr() const @@ -616,18 +605,15 @@ private: trigger->traceNpuStreamSynchronization(reinterpret_cast(stream_)); } #endif - size_t beginUnmap=0; - size_t endUnmap=0; - for (auto i : c10::irange(begin, end)) { - ASCEND_LOGD("NPUCachingAllocator unmap: handles_.at(i)=%zu", handles_.at(i)==c10::nullopt); + size_t beginUnmap = 0; + size_t endUnmap = 0; + for (auto i: c10::irange(begin, end)) { if (handles_.at(i) == c10::nullopt) { continue; } - SegmentHandles segmentHandles=handles_.at(i).value(); - // i=headid,&&segment_block_count+i<=end===>释放这个handle - size_t segmentBlockCount=segmentHandles.segment_block_count; - size_t segmentBlockHead=segmentHandles.segment_block_head; - ASCEND_LOGD("NPUCachingAllocator unmap: i=%zu,begin=%zu,end=%zu", i,begin,end); + SegmentHandles segmentHandles = handles_.at(i).value(); + size_t segmentBlockCount = segmentHandles.segment_block_count; + size_t segmentBlockHead = segmentHandles.segment_block_head; if (i == segmentBlockHead && segmentBlockCount + i <= end) { if (beginUnmap == 0) { beginUnmap = segmentBlockHead; @@ -636,8 +622,8 @@ private: aclrtDrvMemHandle h = segmentHandles.drvMem_Handle; NPU_CHECK_ERROR(c10_npu::acl::AclrtUnmapMem((char *) ptr_ + segment_size_ * i, getHcclComm())); NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); - ASCEND_LOGD("NPUCachingAllocator unmap: i=%zu,segment_size=%zu,begin=%zu,end=%zu", i, segment_size_, - begin, end); + ASCEND_LOGD("NPUCachingAllocator unmap: i=%zu,begin=%zu,end=%zu", i, segmentBlockHead, + segmentBlockCount + segmentBlockHead); handles_.at(i) = c10::nullopt; } @@ -647,9 +633,9 @@ private: 
beginUnmap = begin; endUnmap = begin; } - ASCEND_LOGD("NPUCachingAllocator unmapHandles: beginUnmap=%zu,endUnmap=%zu", beginUnmap,endUnmap); + ASCEND_LOGD("NPUCachingAllocator total unmapHandles: beginUnmap=%zu,endUnmap=%zu,segment_size_=%zu", beginUnmap, + endUnmap, segment_size_); return rangeFromHandles(beginUnmap, endUnmap); - //return SegmentRange(beginUnmap, endUnmap); } void trimHandles() @@ -697,7 +683,7 @@ private: } size_t getSegmentBlockHead(size_t begin, size_t end, size_t index) { - return ((index - begin) / segmentSizeUsed) * segmentSizeUsed + begin; + return ((index - begin) / segment_block_count_) * segment_block_count_ + begin; } SegmentRange rangeFromHandles(size_t begin, size_t end) { diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index 48fbe75f95..b3904d43c6 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -545,8 +545,7 @@ aclError AclrtReleaseMemAddress(void *virPtr, HcclComm hcclComm) } aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t &size, const aclrtPhysicalMemProp *prop, - uint64_t flags, std::vector segmentSizeLevel, uint32_t *segmentSizeUsed, - bool resetSegmentSizeFlags) { + uint64_t flags, std::vector segmentSizeLevel, uint32_t *segmentSizeUsed) { typedef aclError (*AclrtMallocPhysical)(aclrtDrvMemHandle *, size_t, const aclrtPhysicalMemProp *, uint64_t); static AclrtMallocPhysical func = nullptr; if (func == nullptr) { -- Gitee From 52d3ee2d34f47f1ad71f5ae3c3931ffb7446c60b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=85=B3=E9=BE=99=E9=94=8B?= Date: Wed, 9 Jul 2025 09:10:22 +0800 Subject: [PATCH 4/8] CC --- torch_npu/csrc/core/npu/interface/AclInterface.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.h b/torch_npu/csrc/core/npu/interface/AclInterface.h index 7121fb1125..c99d69e467 100644 --- 
a/torch_npu/csrc/core/npu/interface/AclInterface.h +++ b/torch_npu/csrc/core/npu/interface/AclInterface.h @@ -150,8 +150,7 @@ aclError AclrtReserveMemAddress(void **virPtr, size_t size, size_t alignment, vo aclError AclrtReleaseMemAddress(void *virPtr, HcclComm hcclComm = nullptr); aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t &size, const aclrtPhysicalMemProp *prop, uint64_t flags, - std::vector segmentSizeLevel, uint32_t *segmentSizeUsed, - bool resetSegmentSizeFlags); + std::vector segmentSizeLevel, uint32_t *segmentSizeUsed); aclError AclrtFreePhysical(aclrtDrvMemHandle handle); -- Gitee From 2fb23b1c153dba2cdf7d5120f2535fba9a2e2108 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=85=B3=E9=BE=99=E9=94=8B?= Date: Wed, 9 Jul 2025 10:27:45 +0800 Subject: [PATCH 5/8] cc --- .../csrc/core/npu/NPUCachingAllocator.cpp | 26 +++++++++---------- .../csrc/core/npu/interface/AclInterface.cpp | 14 +++++----- .../csrc/core/npu/interface/AclInterface.h | 4 +-- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 715109663d..2d789e59a7 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -104,7 +104,7 @@ const std::string kMinDriverVersionSupport16G = "25.1.RC1"; // minimum drive const std::string kCannModule = "CANN"; // cann module name constexpr int used1GPage = 1; // used 1g to alloc constexpr int used16GPage = 16; // used 16g to alloc -static uint32_t segmentSizeUsed = 1; // segment size used +static uint32_t segmentUsedPageType = 1; // segment size used using StatTypes = std::array(StatType::NUM_TYPES)>; @@ -204,7 +204,7 @@ int realUseAllocPageMem(){ if(c10_npu::NPUCachingAllocator::isConfig16GPageSizeEnable()){ // 满足16g版本并且有足够的16g大页块申请 - if (IsSupport16GVersion() && segmentSizeUsed == used16GPage) { + if (IsSupport16GVersion() && segmentUsedPageType == used16GPage) { return 
used16GPage; }; if(IsSupport1GVersion()){ @@ -216,7 +216,7 @@ int realUseAllocPageMem(){ "The minimum version of the HDK (driver) needs to be greater than 25.1.RC1 " "and The minimum version of the CANN needs to be greater than 8.2.RC1." "Or there may not be enough 16g virtual memory blocks allocated."); - segmentSizeUsed=used1GPage; + segmentUsedPageType = used1GPage; return used1GPage; }; } @@ -499,7 +499,7 @@ struct ExpandableSegment { i, current_segment_size, remain_size); if (headId == i && remain_size >= 0) { auto status = c10_npu::acl::AclrtMallocPhysical(&handle, current_segment_size, &prop, 0, - {kExtraLargeBuffer}, &segmentSizeUsed); + {kExtraLargeBuffer}, &segmentUsedPageType); if (status == ACL_ERROR_RT_MEMORY_ALLOCATION) { for (auto j: c10::irange(begin, i)) { auto h = handles_.at(j).value().drvMem_Handle; @@ -515,8 +515,8 @@ struct ExpandableSegment { segmentHandles.segment_block_head = headId; segmentHandles.drvMem_Handle = handle; segmentHandles.segment_size = segment_size_; - segmentHandles.segment_block_count = segmentSizeUsed; - segment_block_count_ = segmentSizeUsed; + segmentHandles.segment_block_count = segmentUsedPageType; + segment_block_count_ = segmentUsedPageType; handles_.at(i) = segmentHandles; } @@ -538,7 +538,6 @@ struct ExpandableSegment { segmentHandles.drvMem_Handle, 0, getHcclComm())); start_ptr = start_ptr + usedSegmentSizes; total_size = total_size + usedSegmentSizes; - ASCEND_LOGD("NPUCachingAllocator map:i=%zu", i); } } ASCEND_LOGD("NPUCachingAllocator map total_size=%zu", total_size); @@ -1078,7 +1077,7 @@ size_t CachingAllocatorConfig::parsePageSize(const std::vector &con m_page_size_1g = true; } else if(config[i + 2] == "16g"){ m_page_size_16g = true; - segmentSizeUsed=used16GPage; + segmentUsedPageType=used16GPage; }else { TORCH_CHECK(false, "Unsupported page_size value: ", config[i + 2], OPS_ERROR(ErrCode::VALUE)); } @@ -2721,16 +2720,17 @@ private: ptr = active_pool->allocator()->raw_alloc(size); p.err = ptr ? 
ACL_ERROR_NONE : ACL_ERROR_RT_MEMORY_ALLOCATION; } else { - std::vector alloc_size_level; + std::vector alloc_size_level; // 使用origin size进行1g对齐 - alloc_size_level.push_back(kExtraLargeBuffer * ((p.size() + kExtraLargeBuffer - 1) / kExtraLargeBuffer)); + alloc_size_level.push_back( + kExtraLargeBuffer * ((p.size() + kExtraLargeBuffer - 1) / kExtraLargeBuffer)); auto policy = aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST; if (realUseAllocPageMem() == used1GPage) { policy = aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE1G_ONLY; } else if (realUseAllocPageMem() == used16GPage) { policy = aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE16G_ONLY; } - p.err = c10_npu::acl::AclrtMallocAlign32(&ptr, size, policy, alloc_size_level, &segmentSizeUsed); + p.err = c10_npu::acl::AclrtMallocAlign32(&ptr, size, policy, alloc_size_level, &segmentUsedPageType); } if (p.err != ACL_ERROR_NONE) { return false; @@ -3476,7 +3476,7 @@ public: deleteFunc = &uncached_delete; size_t alloc_size = size + 32; NPU_CHECK_ERROR(c10_npu::acl::AclrtMallocAlign32(&devPtr, alloc_size, - aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST, {}, &segmentSizeUsed)); + aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST, {}, &segmentUsedPageType)); ASCEND_LOGD("Without NPUCachingAllocator, malloc by " "AclrtMallocAlign32: size=%zu", alloc_size); @@ -3506,7 +3506,7 @@ public: size_t alloc_size = size + 32 + aligned; NPU_CHECK_ERROR(c10_npu::acl::AclrtMallocAlign32(&realPtr, alloc_size, aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST, - {}, &segmentSizeUsed)); + {}, &segmentUsedPageType)); ASCEND_LOGD("Without NPUCachingAllocator, malloc by " "AclrtMallocAlign32: size=%zu", alloc_size); } else { diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index b3904d43c6..33e6541580 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -434,7 +434,7 @@ aclError AclrtGetDeviceUtilizationRate(int32_t 
deviceId, aclrtUtilizationInfo *u } aclError AclrtMallocAlign32(void **devPtr, size_t &size, aclrtMemMallocPolicy policy, - std::vector allocSizeLevel, uint32_t *segmentSizeUsed) { + std::vector allocSizeLevel, uint32_t *segmentUsedPageType) { typedef aclError (*AclrtMallocAlign32)(void **, size_t, aclrtMemMallocPolicy); static AclrtMallocAlign32 func = (AclrtMallocAlign32) GET_FUNC(aclrtMallocAlign32); aclError ret; @@ -456,7 +456,7 @@ aclError AclrtMallocAlign32(void **devPtr, size_t &size, aclrtMemMallocPolicy po // 进行size对齐的调整 size = allocSizeLevel[0]; // 16g失败,标志位设置成1 - *segmentSizeUsed = 1; + *segmentUsedPageType = 1; if (func != nullptr) { ret = func(devPtr, size, policy); } else { @@ -473,7 +473,7 @@ aclError AclrtMallocAlign32(void **devPtr, size_t &size, aclrtMemMallocPolicy po "memory pre-allocation is not enabled."); policy = aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST; size = allocSizeLevel[0]; - *segmentSizeUsed = 1; + *segmentUsedPageType = 1; if (func != nullptr) { ret = func(devPtr, size, policy); } else { @@ -545,11 +545,11 @@ aclError AclrtReleaseMemAddress(void *virPtr, HcclComm hcclComm) } aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t &size, const aclrtPhysicalMemProp *prop, - uint64_t flags, std::vector segmentSizeLevel, uint32_t *segmentSizeUsed) { + uint64_t flags, std::vector segmentSizeLevel, uint32_t *segmentUsedPageType) { typedef aclError (*AclrtMallocPhysical)(aclrtDrvMemHandle *, size_t, const aclrtPhysicalMemProp *, uint64_t); static AclrtMallocPhysical func = nullptr; if (func == nullptr) { - func = (AclrtMallocPhysical) GET_FUNC(aclrtMallocPhysical); + func = (AclrtMallocPhysical)GET_FUNC(aclrtMallocPhysical); } TORCH_CHECK(func, "Failed to find function ", "aclrtMallocPhysical", PTA_ERROR(ErrCode::NOT_FOUND)); aclError ret = func(handle, size, prop, flags); @@ -567,7 +567,7 @@ aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t &size, const aclr prop->location.type}, prop->reserve}; size = 
segmentSizeLevel[0]; - *segmentSizeUsed = 1; + *segmentUsedPageType = 1; ret = func(handle, size, &prop_update, flags); }; if (ret != ACL_RT_SUCCESS && (prop->memAttr == ACL_HBM_MEM_HUGE1G || prop->memAttr == ACL_HBM_MEM_HUGE16G)) { @@ -583,7 +583,7 @@ aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t &size, const aclr prop->location.type}, prop->reserve}; size = segmentSizeLevel[0]; - *segmentSizeUsed = 1; + *segmentUsedPageType = 1; ret = func(handle, size, &prop_update, flags); } return ret; diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.h b/torch_npu/csrc/core/npu/interface/AclInterface.h index c99d69e467..3eae5bb3f0 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.h +++ b/torch_npu/csrc/core/npu/interface/AclInterface.h @@ -138,7 +138,7 @@ aclError AclrtDestroyStreamForce(aclrtStream stream); aclError AclrtGetDeviceUtilizationRate(int32_t deviceId, aclrtUtilizationInfo *utilizationInfo); aclError AclrtMallocAlign32(void **devPtr, size_t &size, aclrtMemMallocPolicy policy, - std::vector allocSizeLevel, uint32_t *segmentSizeUsed); + std::vector allocSizeLevel, uint32_t *segmentUsedPageType); aclError AclrtStreamQuery(aclrtStream stream, aclrtStreamStatus *status); @@ -150,7 +150,7 @@ aclError AclrtReserveMemAddress(void **virPtr, size_t size, size_t alignment, vo aclError AclrtReleaseMemAddress(void *virPtr, HcclComm hcclComm = nullptr); aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t &size, const aclrtPhysicalMemProp *prop, uint64_t flags, - std::vector segmentSizeLevel, uint32_t *segmentSizeUsed); + std::vector segmentSizeLevel, uint32_t *segmentUsedPageType); aclError AclrtFreePhysical(aclrtDrvMemHandle handle); -- Gitee From f793cd3c9fc5d1cc84781796b61aa76171ab092b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=85=B3=E9=BE=99=E9=94=8B?= Date: Wed, 9 Jul 2025 11:28:22 +0800 Subject: [PATCH 6/8] dd --- .../csrc/core/npu/NPUCachingAllocator.cpp | 57 +++++++++---------- 1 file changed, 27 insertions(+), 30 
deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 2d789e59a7..74a3a4ea60 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -228,7 +228,6 @@ int realUseAllocPageMem(){ return 0; } - struct Block; struct PrivatePool; using Comparison = bool (*)(const Block *, const Block *); @@ -421,13 +420,13 @@ bevhavior for allocator tensors that need to be used cross-process. */ struct ExpandableSegment { - ExpandableSegment(int device, aclrtStream stream, size_t size,size_t blockCout) + ExpandableSegment(int device, aclrtStream stream, size_t size,size_t blockCount) : device_(device), stream_(stream), max_handles_(0), // 2MB for small pool, 20MB for large pool segment_size_(size), - segment_block_count_(blockCout) + segment_block_count_(blockCount) { size_t device_free; size_t device_total; @@ -447,11 +446,11 @@ struct ExpandableSegment { max_handles_ = numSegments(kLargePoolVirAddrSize); } } - - NPU_CHECK_ERROR( - c10_npu::acl::AclrtReserveMemAddress(&ptr_, segment_block_count_*segment_size_ * max_handles_, 0, nullptr, 1, getHcclComm())); - ASCEND_LOGD("NPUCachingAllocator malloc by AclrtReserveMemAddress: size=%zu, max_handles_=%zu,segment_size=%zu,segment_block_count_=%zu", - segment_block_count_*segment_size_ * max_handles_, max_handles_,segment_size_,segment_block_count_); + size_t totalReserveMemSize = segment_block_count_ * segment_size_ * max_handles_; + NPU_CHECK_ERROR(c10_npu::acl::AclrtReserveMemAddress(&ptr_, totalReserveMemSize, 0, nullptr, 1, getHcclComm())); + ASCEND_LOGD( + "NPUCachingAllocator malloc by AclrtReserveMemAddress: size=%zu,segment_size=%zu,segment_block_count_=%zu", + totalReserveMemSize, segment_size_, segment_block_count_); } // begin must be aligned to segment_size_. 
// returns the actual range mapped, which may be @@ -459,10 +458,9 @@ struct ExpandableSegment { // return size of 0 indicates OOM SegmentRange map(SegmentRange range) { + // 新的内存申请时,使用降级后的标记 if (realUseAllocPageMem() == used1GPage) { segment_block_count_ = used1GPage; - } else if (realUseAllocPageMem() == used16GPage) { - segment_block_count_ = used16GPage; } auto begin = segmentLeft(range.ptr); auto end = segmentRight(range.ptr + range.size); @@ -473,9 +471,8 @@ struct ExpandableSegment { while (end > handles_.size()) { handles_.emplace_back(c10::nullopt); } - size_t segment_len = end - begin; - long remain_size = range.size; - auto current_segment_size = segment_size_*segment_block_count_; + long remainSize = range.size; + auto needMallocSegmentSize = segment_size_ * segment_block_count_; size_t realEnd = end; for (auto i : c10::irange(begin, end)) { // TORCH_INTERNAL_ASSERT(!handles_.at(i), PTA_ERROR(ErrCode::INTERNAL)); @@ -495,13 +492,16 @@ struct ExpandableSegment { prop.reserve = 0; // 如果是16g的头部块,则进行物理内存的申请。或者如果失败了,则降级到1G size_t headId = getSegmentBlockHead(begin, end, i); - ASCEND_LOGD("NPUCachingAllocator map:headId=%zu, i=%zu, current_segment_size=%zu, remain_size=%zu", headId, - i, current_segment_size, remain_size); - if (headId == i && remain_size >= 0) { - auto status = c10_npu::acl::AclrtMallocPhysical(&handle, current_segment_size, &prop, 0, + ASCEND_LOGD("NPUCachingAllocator map:headId=%zu, i=%zu, current_segment_size=%zu, remainSize=%zu", headId, + i, needMallocSegmentSize, remainSize); + if (headId == i && remainSize >= 0) { + auto status = c10_npu::acl::AclrtMallocPhysical(&handle, needMallocSegmentSize, &prop, 0, {kExtraLargeBuffer}, &segmentUsedPageType); if (status == ACL_ERROR_RT_MEMORY_ALLOCATION) { for (auto j: c10::irange(begin, i)) { + if (handles_.at(i) == c10::nullopt) { + continue; + } auto h = handles_.at(j).value().drvMem_Handle; handles_.at(j) = c10::nullopt; NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); @@ -510,8 +510,8 @@ 
struct ExpandableSegment { return rangeFromHandles(begin, begin); } NPU_CHECK_ERROR(status, "aclrtMallocPhysical"); - remain_size = remain_size - current_segment_size; - usedSegmentSizes_.push_back(current_segment_size); + remainSize = remainSize - needMallocSegmentSize; + usedSegmentSizes_.push_back(needMallocSegmentSize); segmentHandles.segment_block_head = headId; segmentHandles.drvMem_Handle = handle; segmentHandles.segment_size = segment_size_; @@ -520,7 +520,7 @@ struct ExpandableSegment { handles_.at(i) = segmentHandles; } - if (remain_size < 0) { + if (remainSize <= 0) { realEnd = i; break; } @@ -532,15 +532,13 @@ struct ExpandableSegment { continue; } SegmentHandles segmentHandles = handles_.at(i).value(); - if (segmentHandles.drvMem_Handle != nullptr) { - size_t usedSegmentSizes = segmentHandles.segment_size * segmentHandles.segment_block_count; - NPU_CHECK_ERROR(c10_npu::acl::AclrtMapMem(start_ptr, usedSegmentSizes, 0, - segmentHandles.drvMem_Handle, 0, getHcclComm())); - start_ptr = start_ptr + usedSegmentSizes; - total_size = total_size + usedSegmentSizes; - } + size_t usedSegmentSizes = segmentHandles.segment_size * segmentHandles.segment_block_count; + NPU_CHECK_ERROR(c10_npu::acl::AclrtMapMem(start_ptr, usedSegmentSizes, 0, segmentHandles.drvMem_Handle, 0, + getHcclComm())); + start_ptr = start_ptr + usedSegmentSizes; + total_size = total_size + usedSegmentSizes; } - ASCEND_LOGD("NPUCachingAllocator map total_size=%zu", total_size); + ASCEND_LOGD("NPUCachingAllocator map: total_size=%zu", total_size); return SegmentRange(range.ptr, total_size); } @@ -2298,11 +2296,10 @@ private: return c; } } - size_t segment_block_count = used1GPage; + size_t segment_block_count = 1; auto segment_size = pool->is_small ? kSmallBuffer : (c10_npu::option::OptionsManager::IsHcclZeroCopyEnable() ? 
kLargeBufferForHccl : kLargeBuffer); - // 此处申请虚拟内存,segment_size是页大小,实际虚拟内存巨大 if (realUseAllocPageMem() == used1GPage) { segment_size = kExtraLargeBuffer; segment_block_count = used1GPage; -- Gitee From 4e894de399a0159e32717067a2f32a780c2e1e73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=85=B3=E9=BE=99=E9=94=8B?= Date: Wed, 9 Jul 2025 14:32:15 +0800 Subject: [PATCH 7/8] c --- torch_npu/csrc/core/npu/NPUCachingAllocator.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 74a3a4ea60..64534743a5 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -619,8 +619,8 @@ private: aclrtDrvMemHandle h = segmentHandles.drvMem_Handle; NPU_CHECK_ERROR(c10_npu::acl::AclrtUnmapMem((char *) ptr_ + segment_size_ * i, getHcclComm())); NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); - ASCEND_LOGD("NPUCachingAllocator unmap: i=%zu,begin=%zu,end=%zu", i, segmentBlockHead, - segmentBlockCount + segmentBlockHead); + ASCEND_LOGD("NPUCachingAllocator unmapHandles: i=%zu,begin=%zu,end=%zu", i, segmentBlockHead, + segmentBlockHead + segmentBlockCount); handles_.at(i) = c10::nullopt; } @@ -657,7 +657,6 @@ private: size_t numSegments(size_t size) { - ASCEND_LOGD("numSegments unmap: size=%zu,segment_size_=%zu,segment_block_count_=%zu,end=%zu", size,segment_size_,segment_block_count_); return (size + segment_size_*segment_block_count_ - 1) / (segment_size_*segment_block_count_); } @@ -670,7 +669,7 @@ private: size_t segmentRight(char *p) { auto size = p - ptr(); - return numSegments(size)*segment_block_count_; + return numSegments(size) * segment_block_count_; } size_t unmapSegmentRight(char *p) @@ -1075,7 +1074,7 @@ size_t CachingAllocatorConfig::parsePageSize(const std::vector &con m_page_size_1g = true; } else if(config[i + 2] == "16g"){ m_page_size_16g = true; - segmentUsedPageType=used16GPage; 
+                segmentUsedPageType = used16GPage; }else { TORCH_CHECK(false, "Unsupported page_size value: ", config[i + 2], OPS_ERROR(ErrCode::VALUE)); } @@ -2326,7 +2325,6 @@ private: TORCH_INTERNAL_ASSERT(!to_map->mapped && size <= to_map->size, PTA_ERROR(ErrCode::INTERNAL)); TORCH_INTERNAL_ASSERT(!to_map->context_when_allocated); // unmapped blocks should not keep // history - BlockPool &pool = *to_map->pool; auto mapped_range = to_map->expandable_segment_->map(SegmentRange{ to_map->ptr, size }); // failed to map the memory if (mapped_range.size == 0) { @@ -2335,6 +2333,7 @@ private: TORCH_INTERNAL_ASSERT(mapped_range.ptr == to_map->ptr && mapped_range.size >= size, PTA_ERROR(ErrCode::INTERNAL)); + BlockPool &pool = *to_map->pool; pool.unmapped.erase(to_map); to_map->mapped = true; -- Gitee From 0efeb9eb8fd2d3adc6a96da8d873f23673ea73d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=85=B3=E9=BE=99=E9=94=8B?= Date: Wed, 9 Jul 2025 14:59:18 +0800 Subject: [PATCH 8/8] Make ExpandableSegment stream parameter std::optional --- torch_npu/csrc/core/npu/NPUCachingAllocator.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 64534743a5..11a8eab6c9 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -420,7 +420,11 @@ bevhavior for allocator tensors that need to be used cross-process. */ struct ExpandableSegment { - ExpandableSegment(int device, aclrtStream stream, size_t size,size_t blockCount) + ExpandableSegment( + int device, + std::optional<aclrtStream> stream, + size_t size, + size_t blockCount) : device_(device), stream_(stream), max_handles_(0), -- Gitee