diff --git a/test/cpp_extensions/pluggable_allocator_extensions.cpp b/test/cpp_extensions/pluggable_allocator_extensions.cpp index c17077116bfc4bda401e303ec7e93799f8422c64..4bf1ff1099571d573da048340ca2ec7e3c6500b1 100644 --- a/test/cpp_extensions/pluggable_allocator_extensions.cpp +++ b/test/cpp_extensions/pluggable_allocator_extensions.cpp @@ -14,7 +14,7 @@ static bool useflag = false; void* my_malloc(ssize_t size, int device, aclrtStream stream) { void *ptr; - aclrtMallocAlign32(&ptr, size, aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMallocAlign32(&ptr, size, aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST, {}); std::cout<<"alloc ptr = "<(StatType::NUM_TYPES)>; @@ -140,34 +146,86 @@ void update_stat_array(StatArray &stat_array, int64_t amount, const StatTypes &s [&stat_array, amount](size_t stat_type) { update_stat(stat_array[stat_type], amount); }); } -bool IsMallocPage1GMem(bool is_small_pool) + + +bool IsSupport1GVersion() { static bool is_support_page_size_1g = []() { - if (!c10_npu::NPUCachingAllocator::isConfig1GPageSizeEnable()) { + if (!IsGteCANNVersion(kMinCannVersion, kCannModule)) { + TORCH_NPU_WARN_ONCE("The application for 1G large-page physical memory failed. " + "Using the HUGE_MEM memory page allocation method may result in performance degradation. " + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g " + "or PYTORCH_NPU_ALLOC_CONF = page_size:16g configuration is enabled, " + "but the current CANN version does not support this feature. " + "Please upgrade the CANN package version."); return false; } - if (!IsGteCANNVersion(kMinCannVersion, kCannModule)) { + if (!IsGteDriverVersion(kMinDriverVersion)) { TORCH_NPU_WARN_ONCE("The application for 1G large-page physical memory failed. " "Using the HUGE_MEM memory page allocation method may result in performance degradation. 
" - "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g configuration is enabled, " + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g " + "or PYTORCH_NPU_ALLOC_CONF = page_size:16g configuration is enabled, " "but the current driver version does not support this feature. " + "Please upgrade the HDK(driver) package version."); + return false; + } + return true; + }(); + return is_support_page_size_1g; +} + +bool IsSupport16GVersion() +{ + static bool is_support_page_size_16g = []() { + if (!IsGteCANNVersion(kMinCannVersionSupport16G, kCannModule)) { + TORCH_NPU_WARN_ONCE("The application for 16G large-page physical memory failed. " + "Using the ACL_HBM_MEM_HUGE1G memory page allocation method may result in performance degradation. " + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:16g configuration is enabled, " + "but the current CANN version does not support this feature. " "Please upgrade the CANN package version."); return false; } - if (!IsGteDriverVersion(kMinDriverVersion)) { - TORCH_NPU_WARN_ONCE("The application for 1G large-page physical memory failed. " + if (!IsGteDriverVersion(kMinDriverVersionSupport16G)) { + TORCH_NPU_WARN_ONCE("The application for 16G large-page physical memory failed. " "Using the HUGE_MEM memory page allocation method may result in performance degradation. " - "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g configuration is enabled, " + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:16g configuration is enabled, " "but the current driver version does not support this feature. 
" "Please upgrade the HDK(driver) package version."); return false; } return true; }(); + return is_support_page_size_16g; +} - return !is_small_pool && is_support_page_size_1g; +int realUseAllocPageMem(){ + if(c10_npu::NPUCachingAllocator::isConfig16GPageSizeEnable()){ + + // 满足16g版本并且有足够的16g大页块申请 + if (IsSupport16GVersion() && segmentUsedPageType == used16GPage) { + return used16GPage; + }; + if(IsSupport1GVersion()){ + TORCH_NPU_WARN_ONCE("The application for 16G large-page physical memory failed. " + "Using the ACL_HBM_MEM_HUGE1G memory page allocation method may result in performance degradation. " + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:16g configuration is enabled, " + "but the current driver and CANN version does not support this feature. " + "Please upgrade the HDK(driver) and CANN package version." + "The minimum version of the HDK (driver) needs to be greater than 25.1.RC1 " + "and The minimum version of the CANN needs to be greater than 8.2.RC1." + "Or there may not be enough 16g virtual memory blocks allocated."); + segmentUsedPageType = used1GPage; + return used1GPage; + }; + } + if(c10_npu::NPUCachingAllocator::isConfig1GPageSizeEnable()){ + if(IsSupport1GVersion()){ + return used1GPage; + }; + } + return 0; } struct Block; @@ -278,6 +336,13 @@ struct SegmentRange { }; +struct SegmentHandles { + aclrtDrvMemHandle drvMem_Handle; + size_t segment_size; + size_t segment_block_count; + size_t segment_block_head; +}; + /* Note [Expandable Segments] Rationale @@ -355,12 +420,17 @@ bevhavior for allocator tensors that need to be used cross-process. 
*/ struct ExpandableSegment { - ExpandableSegment(int device, aclrtStream stream, size_t size) + ExpandableSegment( + int device, + std::optional stream, + size_t size, + size_t blockCount) : device_(device), stream_(stream), max_handles_(0), // 2MB for small pool, 20MB for large pool - segment_size_(size) + segment_size_(size), + segment_block_count_(blockCount) { size_t device_free; size_t device_total; @@ -380,11 +450,11 @@ struct ExpandableSegment { max_handles_ = numSegments(kLargePoolVirAddrSize); } } - - NPU_CHECK_ERROR( - c10_npu::acl::AclrtReserveMemAddress(&ptr_, segment_size_ * max_handles_, 0, nullptr, 1, getHcclComm())); - ASCEND_LOGD("NPUCachingAllocator malloc by AclrtReserveMemAddress: size=%zu, segment_size=%zu", - segment_size_ * max_handles_, segment_size_); + size_t totalReserveMemSize = segment_block_count_ * segment_size_ * max_handles_; + NPU_CHECK_ERROR(c10_npu::acl::AclrtReserveMemAddress(&ptr_, totalReserveMemSize, 0, nullptr, 1, getHcclComm())); + ASCEND_LOGD( + "NPUCachingAllocator malloc by AclrtReserveMemAddress: size=%zu,segment_size=%zu,segment_block_count_=%zu", + totalReserveMemSize, segment_size_, segment_block_count_); } // begin must be aligned to segment_size_. 
// returns the actual range mapped, which may be @@ -392,6 +462,10 @@ struct ExpandableSegment { // return size of 0 indicates OOM SegmentRange map(SegmentRange range) { + // 新的内存申请时,使用降级后的标记 + if (realUseAllocPageMem() == used1GPage) { + segment_block_count_ = used1GPage; + } auto begin = segmentLeft(range.ptr); auto end = segmentRight(range.ptr + range.size); TORCH_INTERNAL_ASSERT(ptr() + begin * segment_size_ == range.ptr, PTA_ERROR(ErrCode::PTR)); @@ -401,35 +475,75 @@ struct ExpandableSegment { while (end > handles_.size()) { handles_.emplace_back(c10::nullopt); } + long remainSize = range.size; + auto needMallocSegmentSize = segment_size_ * segment_block_count_; + size_t realEnd = end; for (auto i : c10::irange(begin, end)) { - TORCH_INTERNAL_ASSERT(!handles_.at(i), PTA_ERROR(ErrCode::INTERNAL)); +// TORCH_INTERNAL_ASSERT(!handles_.at(i), PTA_ERROR(ErrCode::INTERNAL)); + SegmentHandles segmentHandles; aclrtDrvMemHandle handle = nullptr; aclrtPhysicalMemProp prop = {}; prop.handleType = ACL_MEM_HANDLE_TYPE_NONE; prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED; - prop.memAttr = (segment_size_ == kExtraLargeBuffer) ? 
ACL_HBM_MEM_HUGE1G : ACL_HBM_MEM_HUGE; + prop.memAttr = ACL_HBM_MEM_HUGE; + if (realUseAllocPageMem() == used1GPage) { + prop.memAttr = ACL_HBM_MEM_HUGE1G; + } else if (realUseAllocPageMem() == used16GPage) { + prop.memAttr = ACL_HBM_MEM_HUGE16G; + } prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE; prop.location.id = static_cast(device_); prop.reserve = 0; - auto status = c10_npu::acl::AclrtMallocPhysical(&handle, segment_size_, &prop, 0); - if (status == ACL_ERROR_RT_MEMORY_ALLOCATION) { - for (auto j : c10::irange(begin, i)) { - auto h = handles_.at(j).value(); - handles_.at(j) = c10::nullopt; - NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); + // 如果是16g的头部块,则进行物理内存的申请。或者如果失败了,则降级到1G + size_t headId = getSegmentBlockHead(begin, end, i); + ASCEND_LOGD("NPUCachingAllocator map:headId=%zu, i=%zu, current_segment_size=%zu, remainSize=%zu", headId, + i, needMallocSegmentSize, remainSize); + if (headId == i && remainSize >= 0) { + auto status = c10_npu::acl::AclrtMallocPhysical(&handle, needMallocSegmentSize, &prop, 0, + {kExtraLargeBuffer}, &segmentUsedPageType); + if (status == ACL_ERROR_RT_MEMORY_ALLOCATION) { + for (auto j: c10::irange(begin, i)) { + if (handles_.at(i) == c10::nullopt) { + continue; + } + auto h = handles_.at(j).value().drvMem_Handle; + handles_.at(j) = c10::nullopt; + NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); + } + trimHandles(); + return rangeFromHandles(begin, begin); } - trimHandles(); - return rangeFromHandles(begin, begin); + NPU_CHECK_ERROR(status, "aclrtMallocPhysical"); + remainSize = remainSize - needMallocSegmentSize; + usedSegmentSizes_.push_back(needMallocSegmentSize); + segmentHandles.segment_block_head = headId; + segmentHandles.drvMem_Handle = handle; + segmentHandles.segment_size = segment_size_; + segmentHandles.segment_block_count = segmentUsedPageType; + segment_block_count_ = segmentUsedPageType; + handles_.at(i) = segmentHandles; + } + + if (remainSize <= 0) { + realEnd = i; + break; } - 
NPU_CHECK_ERROR(status, "aclrtMallocPhysical"); - handles_.at(i) = handle; } - for (auto i : c10::irange(begin, end)) { - NPU_CHECK_ERROR(c10_npu::acl::AclrtMapMem((char *)ptr_ + i * segment_size_, segment_size_, 0, - handles_.at(i).value(), 0, getHcclComm())); + auto start_ptr = range.ptr; + size_t total_size = 0; + for (auto i: c10::irange(begin, end)) { + if (i > realEnd || handles_.at(i) == c10::nullopt) { + continue; + } + SegmentHandles segmentHandles = handles_.at(i).value(); + size_t usedSegmentSizes = segmentHandles.segment_size * segmentHandles.segment_block_count; + NPU_CHECK_ERROR(c10_npu::acl::AclrtMapMem(start_ptr, usedSegmentSizes, 0, segmentHandles.drvMem_Handle, 0, + getHcclComm())); + start_ptr = start_ptr + usedSegmentSizes; + total_size = total_size + usedSegmentSizes; } - ASCEND_LOGD("NPUCachingAllocator map: segment_size=%zu", segment_size_); - return rangeFromHandles(begin, end); + ASCEND_LOGD("NPUCachingAllocator map: total_size=%zu", total_size); + return SegmentRange(range.ptr, total_size); } // unmaps all the completely empty segment_size_ segments between @@ -437,13 +551,13 @@ struct ExpandableSegment { // and the actual size unmapped (multiple of segment_size_) SegmentRange unmap(SegmentRange range) { - auto begin = segmentRight(range.ptr); + auto begin = unmapSegmentRight(range.ptr); auto end = segmentLeft(range.ptr + range.size); + ASCEND_LOGD("NPUCachingAllocator start to unmap,begin=%zu,end=%zu,size=%zu", begin, end, range.size); if (begin >= end) { return SegmentRange{ range.ptr, 0 }; } - unmapHandles(begin, end); - return rangeFromHandles(begin, end); + return unmapHandles(begin, end); } char *ptr() const @@ -453,7 +567,7 @@ struct ExpandableSegment { size_t size() const { - return max_handles_ * segment_size_; + return max_handles_ * segment_size_ * segment_block_count_; } void setHcclComm(std::shared_ptr hcclComm) @@ -464,7 +578,7 @@ struct ExpandableSegment { segment_size_ * max_handles_, 0, 1)); for (auto i : 
c10::irange(handles_.size())) { HCCL_CHECK_ERROR(at_npu::hccl::HcclCommActivateCommMemoryFace(hcclComm_->getHcclComm(), - (char *)ptr_ + i * segment_size_, segment_size_, 0, handles_.at(i).value(), 0)); + (char *)ptr_ + i * segment_size_, segment_size_, 0, handles_.at(i).value().drvMem_Handle, 0)); } } @@ -476,7 +590,7 @@ struct ExpandableSegment { } private: - void unmapHandles(size_t begin, size_t end) + SegmentRange unmapHandles(size_t begin, size_t end) { // note: unlike aclrtFree, MemUnmap and MemRelease do // not appear to synchronize in all cases, so we have to wait for the @@ -492,14 +606,37 @@ private: trigger->traceNpuStreamSynchronization(reinterpret_cast(stream_)); } #endif - for (auto i : c10::irange(begin, end)) { - aclrtDrvMemHandle h = handles_.at(i).value(); - handles_.at(i) = c10::nullopt; - NPU_CHECK_ERROR(c10_npu::acl::AclrtUnmapMem((char *)ptr_ + segment_size_ * i, getHcclComm())); - NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); + size_t beginUnmap = 0; + size_t endUnmap = 0; + for (auto i: c10::irange(begin, end)) { + if (handles_.at(i) == c10::nullopt) { + continue; + } + SegmentHandles segmentHandles = handles_.at(i).value(); + size_t segmentBlockCount = segmentHandles.segment_block_count; + size_t segmentBlockHead = segmentHandles.segment_block_head; + if (i == segmentBlockHead && segmentBlockCount + i <= end) { + if (beginUnmap == 0) { + beginUnmap = segmentBlockHead; + } + endUnmap = segmentBlockHead + segmentBlockCount; + aclrtDrvMemHandle h = segmentHandles.drvMem_Handle; + NPU_CHECK_ERROR(c10_npu::acl::AclrtUnmapMem((char *) ptr_ + segment_size_ * i, getHcclComm())); + NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); + ASCEND_LOGD("NPUCachingAllocator unmapHandles: i=%zu,begin=%zu,end=%zu", i, segmentBlockHead, + segmentBlockHead + segmentBlockCount); + handles_.at(i) = c10::nullopt; + } + } - ASCEND_LOGD("NPUCachingAllocator unmap: segment_size=%zu", segment_size_); trimHandles(); + if (beginUnmap == 0 && endUnmap == 0) { + 
beginUnmap = begin; + endUnmap = begin; + } + ASCEND_LOGD("NPUCachingAllocator total unmapHandles: beginUnmap=%zu,endUnmap=%zu,segment_size_=%zu", beginUnmap, + endUnmap, segment_size_); + return rangeFromHandles(beginUnmap, endUnmap); } void trimHandles() @@ -524,7 +661,7 @@ private: size_t numSegments(size_t size) { - return (size + segment_size_ - 1) / segment_size_; + return (size + segment_size_*segment_block_count_ - 1) / (segment_size_*segment_block_count_); } size_t segmentLeft(char *p) @@ -536,11 +673,20 @@ private: size_t segmentRight(char *p) { auto size = p - ptr(); - return numSegments(size); + return numSegments(size) * segment_block_count_; } - SegmentRange rangeFromHandles(size_t begin, size_t end) + size_t unmapSegmentRight(char *p) { + auto size = p - ptr(); + return (size + segment_size_- 1) / (segment_size_); + } + + size_t getSegmentBlockHead(size_t begin, size_t end, size_t index) { + return ((index - begin) / segment_block_count_) * segment_block_count_ + begin; + } + + SegmentRange rangeFromHandles(size_t begin, size_t end) { return SegmentRange(ptr() + segment_size_ * begin, segment_size_ * (end - begin)); } @@ -557,7 +703,9 @@ private: void *ptr_{}; size_t max_handles_; size_t segment_size_; - std::vector> handles_; + size_t segment_block_count_; + std::vector usedSegmentSizes_; + std::vector> handles_; std::shared_ptr hcclComm_; }; @@ -776,6 +924,10 @@ public: { return instance().m_page_size_1g; } + static bool page_size_16g_enable() + { + return instance().m_page_size_16g; + } static CachingAllocatorConfig &instance() { @@ -797,6 +949,7 @@ private: bool set_expandable_segments_flag = false; size_t m_base_addr_aligned_size = kAlignRoundLarge; bool m_page_size_1g = false; // 新增1G页配置标志 + bool m_page_size_16g = false; // 新增1G页配置标志 CachingAllocatorConfig() : m_max_split_size(std::numeric_limits::max()), @@ -923,7 +1076,10 @@ size_t CachingAllocatorConfig::parsePageSize(const std::vector &con if (config[i + 2] == "1g") { m_page_size_1g = true; 
- } else { + } else if(config[i + 2] == "16g"){ + m_page_size_16g = true; + segmentUsedPageType = used16GPage; + }else { TORCH_CHECK(false, "Unsupported page_size value: ", config[i + 2], OPS_ERROR(ErrCode::VALUE)); } return i + 2; // 返回最后处理的索引位置 @@ -987,6 +1143,10 @@ bool isConfig1GPageSizeEnable() return CachingAllocatorConfig::page_size_1g_enable(); } +bool isConfig16GPageSizeEnable() +{ + return CachingAllocatorConfig::page_size_16g_enable(); +} // To prevent the deadlock situation, temporarily release the lock. // // Deadlock Scenario Description: @@ -1230,12 +1390,19 @@ public: process_events(context); } auto size = round_size(orig_size); + + // 如果开启了1g或者16G大页,则默认使用大内存池,进行分配内存,不再根据内存小,而优先选择小内存池的场景了 auto &pool = get_pool(size, stream); - // 开环境变量 大池子放1G内存块 - const size_t alloc_size = IsMallocPage1GMem(pool.is_small) ? - kExtraLargeBuffer * ((size + kExtraLargeBuffer - 1) / kExtraLargeBuffer) : - get_allocation_size(size); + // 开环境变量 进行16G和1G的内存对齐,如果没有配置就使用普通内存对齐get_allocation_size + size_t alloc_size = 0; + if (realUseAllocPageMem() == used1GPage) { + alloc_size = kExtraLargeBuffer * ((size + kExtraLargeBuffer - 1) / kExtraLargeBuffer); + } else if (realUseAllocPageMem() == used16GPage) { + alloc_size = kExtraLarge16GBuffer * ((size + kExtraLarge16GBuffer - 1) / kExtraLarge16GBuffer); + } else { + alloc_size = get_allocation_size(size); + } AllocParams params(device, size, stream, &pool, alloc_size, stats); params.stat_types = get_stat_types_for_pool(pool); @@ -2132,14 +2299,18 @@ private: return c; } } + size_t segment_block_count = 1; auto segment_size = pool->is_small ? kSmallBuffer : (c10_npu::option::OptionsManager::IsHcclZeroCopyEnable() ? 
kLargeBufferForHccl : kLargeBuffer); - // 此处申请虚拟内存,segment_size是页大小,实际虚拟内存巨大 - if (IsMallocPage1GMem(pool->is_small)) { + if (realUseAllocPageMem() == used1GPage) { segment_size = kExtraLargeBuffer; + segment_block_count = used1GPage; + } else if (realUseAllocPageMem() == used16GPage) { + segment_size = kExtraLargeBuffer; + segment_block_count = used16GPage; } - auto segment = new ExpandableSegment(device, stream, segment_size); + auto segment = new ExpandableSegment(device, stream, segment_size, segment_block_count); if (hcclComm_) { segment->setHcclComm(hcclComm_); } @@ -2333,6 +2504,9 @@ private: if (entry.second(stream)) { auto it1 = graph_pools.find(entry.first); TORCH_INTERNAL_ASSERT(it1 != graph_pools.end()); + if (realUseAllocPageMem() > 0) { + return it1->second->large_blocks; + } if (size <= kSmallSize) { return it1->second->small_blocks; } else { @@ -2341,6 +2515,9 @@ private: } } } + if (realUseAllocPageMem() > 0) { + return large_blocks; + } if (size <= kSmallSize) { return small_blocks; } else { @@ -2543,11 +2720,17 @@ private: ptr = active_pool->allocator()->raw_alloc(size); p.err = ptr ? 
ACL_ERROR_NONE : ACL_ERROR_RT_MEMORY_ALLOCATION; } else { + std::vector alloc_size_level; + // 使用origin size进行1g对齐 + alloc_size_level.push_back( + kExtraLargeBuffer * ((p.size() + kExtraLargeBuffer - 1) / kExtraLargeBuffer)); auto policy = aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST; - if (IsMallocPage1GMem(p.pool->is_small)) { + if (realUseAllocPageMem() == used1GPage) { policy = aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE1G_ONLY; + } else if (realUseAllocPageMem() == used16GPage) { + policy = aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE16G_ONLY; } - p.err = c10_npu::acl::AclrtMallocAlign32(&ptr, size, policy); + p.err = c10_npu::acl::AclrtMallocAlign32(&ptr, size, policy, alloc_size_level, &segmentUsedPageType); } if (p.err != ACL_ERROR_NONE) { return false; @@ -3293,7 +3476,7 @@ public: deleteFunc = &uncached_delete; size_t alloc_size = size + 32; NPU_CHECK_ERROR(c10_npu::acl::AclrtMallocAlign32(&devPtr, alloc_size, - aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST)); + aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST, {}, &segmentUsedPageType)); ASCEND_LOGD("Without NPUCachingAllocator, malloc by " "AclrtMallocAlign32: size=%zu", alloc_size); @@ -3322,7 +3505,8 @@ public: deleteFunc = &uncached_delete; size_t alloc_size = size + 32 + aligned; NPU_CHECK_ERROR(c10_npu::acl::AclrtMallocAlign32(&realPtr, alloc_size, - aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST)); + aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST, + {}, &segmentUsedPageType)); ASCEND_LOGD("Without NPUCachingAllocator, malloc by " "AclrtMallocAlign32: size=%zu", alloc_size); } else { diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.h b/torch_npu/csrc/core/npu/NPUCachingAllocator.h index a4e14d2232ab30f7a3cd4e991c904f404b18f6a5..daf1f07ece4230dbf7833a60ce27dcf0783b1ccd 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.h +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.h @@ -445,6 +445,8 @@ bool checkConfigExpandableSegments(); bool isConfig1GPageSizeEnable(); +bool 
isConfig16GPageSizeEnable(); + } // namespace NPUCachingAllocator } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp index fa4e79ff7ba41f798fbd124c5ed727612b5718d0..9d3f8497f023258c2a090602aed87710c734118c 100644 --- a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp @@ -128,7 +128,7 @@ public: PTA_ERROR(ErrCode::MEMORY)); aclError err = c10_npu::acl::AclrtMallocAlign32( - &block->data_ptr, block->size, aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_ONLY); + &block->data_ptr, block->size, aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_ONLY, {}, nullptr); if (err != ACL_ERROR_NONE) { return nullptr; } @@ -510,7 +510,8 @@ public: if (size != 0) { size_t alloc_size = size + 32; NPU_CHECK_ERROR( - c10_npu::acl::AclrtMallocAlign32(&dev_ptr, alloc_size, aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_ONLY)); + c10_npu::acl::AclrtMallocAlign32(&dev_ptr, alloc_size, + aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_ONLY, {}, nullptr)); } } else { if (size != 0) { diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index b59e9c85c96e2998273953d3d068a3465bd0efde..33e654158004bf6d264be59876a579e28f0b1834 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -433,9 +433,10 @@ aclError AclrtGetDeviceUtilizationRate(int32_t deviceId, aclrtUtilizationInfo *u return func(deviceId, utilizationInfo); } -aclError AclrtMallocAlign32(void **devPtr, size_t size, aclrtMemMallocPolicy policy) { - typedef aclError (*AclrtMallocAlign32)(void**, size_t, aclrtMemMallocPolicy); - static AclrtMallocAlign32 func = (AclrtMallocAlign32)GET_FUNC(aclrtMallocAlign32); +aclError AclrtMallocAlign32(void **devPtr, size_t &size, aclrtMemMallocPolicy policy, + std::vector allocSizeLevel, uint32_t *segmentUsedPageType) { + typedef aclError 
(*AclrtMallocAlign32)(void **, size_t, aclrtMemMallocPolicy); + static AclrtMallocAlign32 func = (AclrtMallocAlign32) GET_FUNC(aclrtMallocAlign32); aclError ret; if (func != nullptr) { ret = func(devPtr, size, policy); @@ -444,13 +445,35 @@ aclError AclrtMallocAlign32(void **devPtr, size_t size, aclrtMemMallocPol ret = aclrtMalloc(devPtr, size, policy); } - if (ret != ACL_RT_SUCCESS && (policy == aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE1G_ONLY)) { + if (ret != ACL_RT_SUCCESS && (policy == aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE16G_ONLY)) { + TORCH_NPU_WARN_ONCE("The malloc 16G large-page physical memory failed, " + "so try to malloc 1G large-page physical memory." + "Using the 1G large-page physical memory page may result in performance degradation. " + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:16g configuration " + "is enabled, but the pre-allocated number of 16G large pages is insufficient " + "or 16G large-page memory pre-allocation is not enabled."); + policy = aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE1G_ONLY; + // 进行size对齐的调整 + size = allocSizeLevel[0]; + // 16g失败,标志位设置成1 + *segmentUsedPageType = 1; + if (func != nullptr) { + ret = func(devPtr, size, policy); + } else { + TORCH_NPU_WARN_ONCE(func, "Failed to find function ", "aclrtMallocAlign32"); + ret = aclrtMalloc(devPtr, size, policy); + } + }; + if (ret != ACL_RT_SUCCESS && (policy == aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE16G_ONLY || + policy == aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE1G_ONLY)) { TORCH_NPU_WARN_ONCE("The malloc 1G large-page physical memory failed, so malloc 2M page memory." "Using the 2M memory page may result in performance degradation. 
" "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g configuration is " "enabled, but the pre-allocated number of 1G large pages is insufficient or 1G large-page " "memory pre-allocation is not enabled."); policy = aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST; + size = allocSizeLevel[0]; + *segmentUsedPageType = 1; if (func != nullptr) { ret = func(devPtr, size, policy); } else { @@ -521,16 +544,33 @@ aclError AclrtReleaseMemAddress(void *virPtr, HcclComm hcclComm) return ret; } -aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t size, const aclrtPhysicalMemProp *prop, - uint64_t flags) { - typedef aclError (*AclrtMallocPhysical)(aclrtDrvMemHandle*, size_t, const aclrtPhysicalMemProp*, uint64_t); +aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t &size, const aclrtPhysicalMemProp *prop, + uint64_t flags, std::vector segmentSizeLevel, uint32_t *segmentUsedPageType) { + typedef aclError (*AclrtMallocPhysical)(aclrtDrvMemHandle *, size_t, const aclrtPhysicalMemProp *, uint64_t); static AclrtMallocPhysical func = nullptr; if (func == nullptr) { func = (AclrtMallocPhysical)GET_FUNC(aclrtMallocPhysical); } TORCH_CHECK(func, "Failed to find function ", "aclrtMallocPhysical", PTA_ERROR(ErrCode::NOT_FOUND)); aclError ret = func(handle, size, prop, flags); - if (ret != ACL_RT_SUCCESS && (prop->memAttr == ACL_HBM_MEM_HUGE1G)) { + if (ret != ACL_RT_SUCCESS && (prop->memAttr == ACL_HBM_MEM_HUGE16G)) { + TORCH_NPU_WARN_ONCE("The malloc 16G large-page physical memory failed, " + "so try to malloc 1G large-page physical memory." + "Using the 1G large-page physical memory page may result in performance degradation. 
" + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:16g configuration " + "is enabled, but the pre-allocated number of 16G large pages is insufficient " + "or 16G large-page memory pre-allocation is not enabled."); + aclrtPhysicalMemProp prop_update = {prop->handleType, + prop->allocationType, + ACL_HBM_MEM_HUGE1G, + {prop->location.id, + prop->location.type}, + prop->reserve}; + size = segmentSizeLevel[0]; + *segmentUsedPageType = 1; + ret = func(handle, size, &prop_update, flags); + }; + if (ret != ACL_RT_SUCCESS && (prop->memAttr == ACL_HBM_MEM_HUGE1G || prop->memAttr == ACL_HBM_MEM_HUGE16G)) { TORCH_NPU_WARN_ONCE("The malloc 1G large-page physical memory failed, so malloc 2M page memory." "Using the 2M memory page may result in performance degradation. " "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g configuration " @@ -542,6 +582,8 @@ aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t size, const aclrt {prop->location.id, prop->location.type}, prop->reserve}; + size = segmentSizeLevel[0]; + *segmentUsedPageType = 1; ret = func(handle, size, &prop_update, flags); } return ret; @@ -819,7 +861,7 @@ aclError AclsysGetCANNVersion(aclCANNPackageName name, aclCANNPackageVersion *ve if (func == nullptr) { func = (aclsysGetCANNVersionFunc)GET_FUNC(aclsysGetCANNVersion); if (func == nullptr) { - return ACL_ERROR_RT_FEATURE_NOT_SUPPORT; + return ACL_ERROR_RT_FEATURE_NOT_SUPPORT; } } diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.h b/torch_npu/csrc/core/npu/interface/AclInterface.h index 9cdad2663bd438107c409c0d0afe542193db6a75..3eae5bb3f020acf39e76a66a76385bb8cf7671c9 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.h +++ b/torch_npu/csrc/core/npu/interface/AclInterface.h @@ -137,7 +137,8 @@ aclError AclrtDestroyStreamForce(aclrtStream stream); aclError AclrtGetDeviceUtilizationRate(int32_t deviceId, aclrtUtilizationInfo *utilizationInfo); -aclError AclrtMallocAlign32(void **devPtr, size_t 
size, aclrtMemMallocPolicy policy); +aclError AclrtMallocAlign32(void **devPtr, size_t &size, aclrtMemMallocPolicy policy, + std::vector allocSizeLevel, uint32_t *segmentUsedPageType); aclError AclrtStreamQuery(aclrtStream stream, aclrtStreamStatus *status); @@ -148,7 +149,8 @@ aclError AclrtReserveMemAddress(void **virPtr, size_t size, size_t alignment, vo aclError AclrtReleaseMemAddress(void *virPtr, HcclComm hcclComm = nullptr); -aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t size, const aclrtPhysicalMemProp *prop, uint64_t flags); +aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t &size, const aclrtPhysicalMemProp *prop, uint64_t flags, + std::vector segmentSizeLevel, uint32_t *segmentUsedPageType); aclError AclrtFreePhysical(aclrtDrvMemHandle handle); diff --git a/torch_npu/csrc/npu/Stress_detect.cpp b/torch_npu/csrc/npu/Stress_detect.cpp index 3fcade819bea6a6f011b04b5a5257c87b002021d..63301c5bc683ae82e3ffa5ad53157a668359354f 100644 --- a/torch_npu/csrc/npu/Stress_detect.cpp +++ b/torch_npu/csrc/npu/Stress_detect.cpp @@ -109,10 +109,12 @@ int StressDetector::perform_stress_detect(int deviceid) uint64_t size = 10; workspaceSize = size << 10 << 10 << 10; // Assume memory size if (workspaceSize > 0) { - auto ret = c10_npu::acl::AclrtMallocAlign32(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST); + auto ret = c10_npu::acl::AclrtMallocAlign32(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST, {}, + nullptr); if (ret != ACL_ERROR_NONE) { c10_npu::NPUCachingAllocator::emptyCache(); - ret = c10_npu::acl::AclrtMallocAlign32(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST); + ret = c10_npu::acl::AclrtMallocAlign32(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST, {}, + nullptr); if (ret != ACL_ERROR_NONE) { ASCEND_LOGW("call AclrtMallocAlign32 failed, ERROR : %d. Skip StressDetect.", ret); task_in_progress.store(false); // Task ends