diff --git a/test/cpp_extensions/pluggable_allocator_extensions.cpp b/test/cpp_extensions/pluggable_allocator_extensions.cpp index c17077116bfc4bda401e303ec7e93799f8422c64..4bf1ff1099571d573da048340ca2ec7e3c6500b1 100644 --- a/test/cpp_extensions/pluggable_allocator_extensions.cpp +++ b/test/cpp_extensions/pluggable_allocator_extensions.cpp @@ -14,7 +14,7 @@ static bool useflag = false; void* my_malloc(ssize_t size, int device, aclrtStream stream) { void *ptr; - aclrtMallocAlign32(&ptr, size, aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMallocAlign32(&ptr, size, aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST, {}); std::cout<<"alloc ptr = "<(StatType::NUM_TYPES)>; @@ -140,34 +146,86 @@ void update_stat_array(StatArray &stat_array, int64_t amount, const StatTypes &s [&stat_array, amount](size_t stat_type) { update_stat(stat_array[stat_type], amount); }); } -bool IsMallocPage1GMem(bool is_small_pool) + + +bool IsSupport1GVersion() { static bool is_support_page_size_1g = []() { - if (!c10_npu::NPUCachingAllocator::isConfig1GPageSizeEnable()) { + if (!IsGteCANNVersion(kMinCannVersion, kCannModule)) { + TORCH_NPU_WARN_ONCE("The application for 1G large-page physical memory failed. " + "Using the HUGE_MEM memory page allocation method may result in performance degradation. " + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g " + "or PYTORCH_NPU_ALLOC_CONF = page_size:16g configuration is enabled, " + "but the current CANN version does not support this feature. " + "Please upgrade the CANN package version."); return false; } - if (!IsGteCANNVersion(kMinCannVersion, kCannModule)) { + if (!IsGteDriverVersion(kMinDriverVersion)) { TORCH_NPU_WARN_ONCE("The application for 1G large-page physical memory failed. " "Using the HUGE_MEM memory page allocation method may result in performance degradation. 
" - "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g configuration is enabled, " + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g " + "or PYTORCH_NPU_ALLOC_CONF = page_size:16g configuration is enabled, " "but the current driver version does not support this feature. " + "Please upgrade the HDK(driver) package version."); + return false; + } + return true; + }(); + return is_support_page_size_1g; +} + +bool IsSupport16GVersion() +{ + static bool is_support_page_size_16g = []() { + if (!IsGteCANNVersion(kMinCannVersionSupport16G, kCannModule)) { + TORCH_NPU_WARN_ONCE("The application for 16G large-page physical memory failed. " + "Using the ACL_HBM_MEM_HUGE1G memory page allocation method may result in performance degradation. " + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:16g configuration is enabled, " + "but the current CANN version does not support this feature. " "Please upgrade the CANN package version."); return false; } - if (!IsGteDriverVersion(kMinDriverVersion)) { - TORCH_NPU_WARN_ONCE("The application for 1G large-page physical memory failed. " + if (!IsGteDriverVersion(kMinDriverVersionSupport16G)) { + TORCH_NPU_WARN_ONCE("The application for 16G large-page physical memory failed. " "Using the HUGE_MEM memory page allocation method may result in performance degradation. " - "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g configuration is enabled, " + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:16g configuration is enabled, " "but the current driver version does not support this feature. 
" "Please upgrade the HDK(driver) package version."); return false; } return true; }(); + return is_support_page_size_16g; +} - return !is_small_pool && is_support_page_size_1g; +int realUseAllocPageMem(){ + if(c10_npu::NPUCachingAllocator::isConfig16GPageSizeEnable()){ + + // 满足16g版本并且有足够的16g大页块申请 + if (IsSupport16GVersion() && segmentUsedPageType == used16GPage) { + return used16GPage; + }; + if(IsSupport1GVersion()){ + TORCH_NPU_WARN_ONCE("The application for 16G large-page physical memory failed. " + "Using the ACL_HBM_MEM_HUGE1G memory page allocation method may result in performance degradation. " + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:16g configuration is enabled, " + "but the current driver and CANN version does not support this feature. " + "Please upgrade the HDK(driver) and CANN package version." + "The minimum version of the HDK (driver) needs to be greater than 25.1.RC1 " + "and The minimum version of the CANN needs to be greater than 8.2.RC1." + "Or there may not be enough 16g virtual memory blocks allocated."); + segmentUsedPageType = used1GPage; + return used1GPage; + }; + } + if(c10_npu::NPUCachingAllocator::isConfig1GPageSizeEnable()){ + if(IsSupport1GVersion()){ + return used1GPage; + }; + } + return 0; } struct Block; @@ -278,6 +336,13 @@ struct SegmentRange { }; +struct SegmentHandles { + aclrtDrvMemHandle drvMem_Handle; + size_t segment_size; + size_t segment_block_count; + size_t segment_block_head; +}; + /* Note [Expandable Segments] Rationale @@ -355,12 +420,17 @@ bevhavior for allocator tensors that need to be used cross-process. 
*/ struct ExpandableSegment { - ExpandableSegment(int device, aclrtStream stream, size_t size) + ExpandableSegment( + int device, + std::optional stream, + size_t size, + size_t blockCount) : device_(device), stream_(stream), max_handles_(0), // 2MB for small pool, 20MB for large pool - segment_size_(size) + segment_size_(size), + segment_block_count_(blockCount) { size_t device_free; size_t device_total; @@ -380,11 +450,11 @@ struct ExpandableSegment { max_handles_ = numSegments(kLargePoolVirAddrSize); } } - - NPU_CHECK_ERROR( - c10_npu::acl::AclrtReserveMemAddress(&ptr_, segment_size_ * max_handles_, 0, nullptr, 1, getHcclComm())); - ASCEND_LOGD("NPUCachingAllocator malloc by AclrtReserveMemAddress: size=%zu, segment_size=%zu", - segment_size_ * max_handles_, segment_size_); + size_t totalReserveMemSize = segment_block_count_ * segment_size_ * max_handles_; + NPU_CHECK_ERROR(c10_npu::acl::AclrtReserveMemAddress(&ptr_, totalReserveMemSize, 0, nullptr, 1, getHcclComm())); + ASCEND_LOGD( + "NPUCachingAllocator malloc by AclrtReserveMemAddress: size=%zu,segment_size=%zu,segment_block_count_=%zu", + totalReserveMemSize, segment_size_, segment_block_count_); } // begin must be aligned to segment_size_. 
// returns the actual range mapped, which may be @@ -392,6 +462,10 @@ struct ExpandableSegment { // return size of 0 indicates OOM SegmentRange map(SegmentRange range) { + // 新的内存申请时,使用降级后的标记 + if (realUseAllocPageMem() == used1GPage) { + segment_block_count_ = used1GPage; + } auto begin = segmentLeft(range.ptr); auto end = segmentRight(range.ptr + range.size); TORCH_INTERNAL_ASSERT(ptr() + begin * segment_size_ == range.ptr, PTA_ERROR(ErrCode::PTR)); @@ -401,35 +475,75 @@ struct ExpandableSegment { while (end > handles_.size()) { handles_.emplace_back(c10::nullopt); } + long remainSize = range.size; + auto needMallocSegmentSize = segment_size_ * segment_block_count_; + size_t realEnd = end; for (auto i : c10::irange(begin, end)) { - TORCH_INTERNAL_ASSERT(!handles_.at(i), PTA_ERROR(ErrCode::INTERNAL)); +// TORCH_INTERNAL_ASSERT(!handles_.at(i), PTA_ERROR(ErrCode::INTERNAL)); + SegmentHandles segmentHandles; aclrtDrvMemHandle handle = nullptr; aclrtPhysicalMemProp prop = {}; prop.handleType = ACL_MEM_HANDLE_TYPE_NONE; prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED; - prop.memAttr = (segment_size_ == kExtraLargeBuffer) ? 
ACL_HBM_MEM_HUGE1G : ACL_HBM_MEM_HUGE; + prop.memAttr = ACL_HBM_MEM_HUGE; + if (realUseAllocPageMem() == used1GPage) { + prop.memAttr = ACL_HBM_MEM_HUGE1G; + } else if (realUseAllocPageMem() == used16GPage) { + prop.memAttr = ACL_HBM_MEM_HUGE16G; + } prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE; prop.location.id = static_cast(device_); prop.reserve = 0; - auto status = c10_npu::acl::AclrtMallocPhysical(&handle, segment_size_, &prop, 0); - if (status == ACL_ERROR_RT_MEMORY_ALLOCATION) { - for (auto j : c10::irange(begin, i)) { - auto h = handles_.at(j).value(); - handles_.at(j) = c10::nullopt; - NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); + // 如果是16g的头部块,则进行物理内存的申请。或者如果失败了,则降级到1G + size_t headId = getSegmentBlockHead(begin, end, i); + ASCEND_LOGD("NPUCachingAllocator map:headId=%zu, i=%zu, current_segment_size=%zu, remainSize=%zu", headId, + i, needMallocSegmentSize, remainSize); + if (headId == i && remainSize >= 0) { + auto status = c10_npu::acl::AclrtMallocPhysical(&handle, needMallocSegmentSize, &prop, 0, + {kExtraLargeBuffer}, &segmentUsedPageType); + if (status == ACL_ERROR_RT_MEMORY_ALLOCATION) { + for (auto j: c10::irange(begin, i)) { + if (handles_.at(i) == c10::nullopt) { + continue; + } + auto h = handles_.at(j).value().drvMem_Handle; + handles_.at(j) = c10::nullopt; + NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); + } + trimHandles(); + return rangeFromHandles(begin, begin); } - trimHandles(); - return rangeFromHandles(begin, begin); + NPU_CHECK_ERROR(status, "aclrtMallocPhysical"); + remainSize = remainSize - needMallocSegmentSize; + usedSegmentSizes_.push_back(needMallocSegmentSize); + segmentHandles.segment_block_head = headId; + segmentHandles.drvMem_Handle = handle; + segmentHandles.segment_size = segment_size_; + segmentHandles.segment_block_count = segmentUsedPageType; + segment_block_count_ = segmentUsedPageType; + handles_.at(i) = segmentHandles; + } + + if (remainSize <= 0) { + realEnd = i; + break; } - 
NPU_CHECK_ERROR(status, "aclrtMallocPhysical"); - handles_.at(i) = handle; } - for (auto i : c10::irange(begin, end)) { - NPU_CHECK_ERROR(c10_npu::acl::AclrtMapMem((char *)ptr_ + i * segment_size_, segment_size_, 0, - handles_.at(i).value(), 0, getHcclComm())); + auto start_ptr = range.ptr; + size_t total_size = 0; + for (auto i: c10::irange(begin, end)) { + if (i > realEnd || handles_.at(i) == c10::nullopt) { + continue; + } + SegmentHandles segmentHandles = handles_.at(i).value(); + size_t usedSegmentSizes = segmentHandles.segment_size * segmentHandles.segment_block_count; + NPU_CHECK_ERROR(c10_npu::acl::AclrtMapMem(start_ptr, usedSegmentSizes, 0, segmentHandles.drvMem_Handle, 0, + getHcclComm())); + start_ptr = start_ptr + usedSegmentSizes; + total_size = total_size + usedSegmentSizes; } - ASCEND_LOGD("NPUCachingAllocator map: segment_size=%zu", segment_size_); - return rangeFromHandles(begin, end); + ASCEND_LOGD("NPUCachingAllocator map: total_size=%zu", total_size); + return SegmentRange(range.ptr, total_size); } // unmaps all the completely empty segment_size_ segments between @@ -437,13 +551,13 @@ struct ExpandableSegment { // and the actual size unmapped (multiple of segment_size_) SegmentRange unmap(SegmentRange range) { - auto begin = segmentRight(range.ptr); + auto begin = unmapSegmentRight(range.ptr); auto end = segmentLeft(range.ptr + range.size); + ASCEND_LOGD("NPUCachingAllocator start to unmap,begin=%zu,end=%zu,size=%zu", begin, end, range.size); if (begin >= end) { return SegmentRange{ range.ptr, 0 }; } - unmapHandles(begin, end); - return rangeFromHandles(begin, end); + return unmapHandles(begin, end); } char *ptr() const @@ -453,7 +567,7 @@ struct ExpandableSegment { size_t size() const { - return max_handles_ * segment_size_; + return max_handles_ * segment_size_ * segment_block_count_; } void setHcclComm(std::shared_ptr hcclComm) @@ -464,7 +578,7 @@ struct ExpandableSegment { segment_size_ * max_handles_, 0, 1)); for (auto i : 
c10::irange(handles_.size())) { HCCL_CHECK_ERROR(at_npu::hccl::HcclCommActivateCommMemoryFace(hcclComm_->getHcclComm(), - (char *)ptr_ + i * segment_size_, segment_size_, 0, handles_.at(i).value(), 0)); + (char *)ptr_ + i * segment_size_, segment_size_, 0, handles_.at(i).value().drvMem_Handle, 0)); } } @@ -476,7 +590,7 @@ struct ExpandableSegment { } private: - void unmapHandles(size_t begin, size_t end) + SegmentRange unmapHandles(size_t begin, size_t end) { // note: unlike aclrtFree, MemUnmap and MemRelease do // not appear to synchronize in all cases, so we have to wait for the @@ -492,14 +606,37 @@ private: trigger->traceNpuStreamSynchronization(reinterpret_cast(stream_)); } #endif - for (auto i : c10::irange(begin, end)) { - aclrtDrvMemHandle h = handles_.at(i).value(); - handles_.at(i) = c10::nullopt; - NPU_CHECK_ERROR(c10_npu::acl::AclrtUnmapMem((char *)ptr_ + segment_size_ * i, getHcclComm())); - NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); + size_t beginUnmap = 0; + size_t endUnmap = 0; + for (auto i: c10::irange(begin, end)) { + if (handles_.at(i) == c10::nullopt) { + continue; + } + SegmentHandles segmentHandles = handles_.at(i).value(); + size_t segmentBlockCount = segmentHandles.segment_block_count; + size_t segmentBlockHead = segmentHandles.segment_block_head; + if (i == segmentBlockHead && segmentBlockCount + i <= end) { + if (beginUnmap == 0) { + beginUnmap = segmentBlockHead; + } + endUnmap = segmentBlockHead + segmentBlockCount; + aclrtDrvMemHandle h = segmentHandles.drvMem_Handle; + NPU_CHECK_ERROR(c10_npu::acl::AclrtUnmapMem((char *) ptr_ + segment_size_ * i, getHcclComm())); + NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); + ASCEND_LOGD("NPUCachingAllocator unmapHandles: i=%zu,begin=%zu,end=%zu", i, segmentBlockHead, + segmentBlockHead + segmentBlockCount); + handles_.at(i) = c10::nullopt; + } + } - ASCEND_LOGD("NPUCachingAllocator unmap: segment_size=%zu", segment_size_); trimHandles(); + if (beginUnmap == 0 && endUnmap == 0) { + 
beginUnmap = begin; + endUnmap = begin; + } + ASCEND_LOGD("NPUCachingAllocator total unmapHandles: beginUnmap=%zu,endUnmap=%zu,segment_size_=%zu", beginUnmap, + endUnmap, segment_size_); + return rangeFromHandles(beginUnmap, endUnmap); } void trimHandles() @@ -524,7 +661,7 @@ private: size_t numSegments(size_t size) { - return (size + segment_size_ - 1) / segment_size_; + return (size + segment_size_*segment_block_count_ - 1) / (segment_size_*segment_block_count_); } size_t segmentLeft(char *p) @@ -536,11 +673,20 @@ private: size_t segmentRight(char *p) { auto size = p - ptr(); - return numSegments(size); + return numSegments(size) * segment_block_count_; } - SegmentRange rangeFromHandles(size_t begin, size_t end) + size_t unmapSegmentRight(char *p) { + auto size = p - ptr(); + return (size + segment_size_- 1) / (segment_size_); + } + + size_t getSegmentBlockHead(size_t begin, size_t end, size_t index) { + return ((index - begin) / segment_block_count_) * segment_block_count_ + begin; + } + + SegmentRange rangeFromHandles(size_t begin, size_t end) { return SegmentRange(ptr() + segment_size_ * begin, segment_size_ * (end - begin)); } @@ -557,7 +703,9 @@ private: void *ptr_{}; size_t max_handles_; size_t segment_size_; - std::vector> handles_; + size_t segment_block_count_; + std::vector usedSegmentSizes_; + std::vector> handles_; std::shared_ptr hcclComm_; }; @@ -776,6 +924,10 @@ public: { return instance().m_page_size_1g; } + static bool page_size_16g_enable() + { + return instance().m_page_size_16g; + } static CachingAllocatorConfig &instance() { @@ -797,6 +949,7 @@ private: bool set_expandable_segments_flag = false; size_t m_base_addr_aligned_size = kAlignRoundLarge; bool m_page_size_1g = false; // 新增1G页配置标志 + bool m_page_size_16g = false; // 新增1G页配置标志 CachingAllocatorConfig() : m_max_split_size(std::numeric_limits::max()), @@ -923,7 +1076,10 @@ size_t CachingAllocatorConfig::parsePageSize(const std::vector &con if (config[i + 2] == "1g") { m_page_size_1g = true; 
- } else { + } else if(config[i + 2] == "16g"){ + m_page_size_16g = true; + segmentUsedPageType = used16GPage; + }else { TORCH_CHECK(false, "Unsupported page_size value: ", config[i + 2], OPS_ERROR(ErrCode::VALUE)); } return i + 2; // 返回最后处理的索引位置 @@ -987,6 +1143,10 @@ bool isConfig1GPageSizeEnable() return CachingAllocatorConfig::page_size_1g_enable(); } +bool isConfig16GPageSizeEnable() +{ + return CachingAllocatorConfig::page_size_16g_enable(); +} // To prevent the deadlock situation, temporarily release the lock. // // Deadlock Scenario Description: @@ -1230,12 +1390,19 @@ public: process_events(context); } auto size = round_size(orig_size); + + // 如果开启了1g或者16G大页,则默认使用大内存池,进行分配内存,不再根据内存小,而优先选择小内存池的场景了 auto &pool = get_pool(size, stream); - // 开环境变量 大池子放1G内存块 - const size_t alloc_size = IsMallocPage1GMem(pool.is_small) ? - kExtraLargeBuffer * ((size + kExtraLargeBuffer - 1) / kExtraLargeBuffer) : - get_allocation_size(size); + // 开环境变量 进行16G和1G的内存对齐,如果没有配置就使用普通内存对齐get_allocation_size + size_t alloc_size = 0; + if (realUseAllocPageMem() == used1GPage) { + alloc_size = kExtraLargeBuffer * ((size + kExtraLargeBuffer - 1) / kExtraLargeBuffer); + } else if (realUseAllocPageMem() == used16GPage) { + alloc_size = kExtraLarge16GBuffer * ((size + kExtraLarge16GBuffer - 1) / kExtraLarge16GBuffer); + } else { + alloc_size = get_allocation_size(size); + } AllocParams params(device, size, stream, &pool, alloc_size, stats); params.stat_types = get_stat_types_for_pool(pool); @@ -2132,14 +2299,18 @@ private: return c; } } + size_t segment_block_count = 1; auto segment_size = pool->is_small ? kSmallBuffer : (c10_npu::option::OptionsManager::IsHcclZeroCopyEnable() ? 
kLargeBufferForHccl : kLargeBuffer); - // 此处申请虚拟内存,segment_size是页大小,实际虚拟内存巨大 - if (IsMallocPage1GMem(pool->is_small)) { + if (realUseAllocPageMem() == used1GPage) { segment_size = kExtraLargeBuffer; + segment_block_count = used1GPage; + } else if (realUseAllocPageMem() == used16GPage) { + segment_size = kExtraLargeBuffer; + segment_block_count = used16GPage; } - auto segment = new ExpandableSegment(device, stream, segment_size); + auto segment = new ExpandableSegment(device, stream, segment_size, segment_block_count); if (hcclComm_) { segment->setHcclComm(hcclComm_); } @@ -2333,6 +2504,9 @@ private: if (entry.second(stream)) { auto it1 = graph_pools.find(entry.first); TORCH_INTERNAL_ASSERT(it1 != graph_pools.end()); + if (realUseAllocPageMem() > 0) { + return it1->second->large_blocks; + } if (size <= kSmallSize) { return it1->second->small_blocks; } else { @@ -2341,6 +2515,9 @@ private: } } } + if (realUseAllocPageMem() > 0) { + return large_blocks; + } if (size <= kSmallSize) { return small_blocks; } else { @@ -2543,11 +2720,17 @@ private: ptr = active_pool->allocator()->raw_alloc(size); p.err = ptr ? 
ACL_ERROR_NONE : ACL_ERROR_RT_MEMORY_ALLOCATION; } else { + std::vector alloc_size_level; + // 使用origin size进行1g对齐 + alloc_size_level.push_back( + kExtraLargeBuffer * ((p.size() + kExtraLargeBuffer - 1) / kExtraLargeBuffer)); auto policy = aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST; - if (IsMallocPage1GMem(p.pool->is_small)) { + if (realUseAllocPageMem() == used1GPage) { policy = aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE1G_ONLY; + } else if (realUseAllocPageMem() == used16GPage) { + policy = aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE16G_ONLY; } - p.err = c10_npu::acl::AclrtMallocAlign32(&ptr, size, policy); + p.err = c10_npu::acl::AclrtMallocAlign32(&ptr, size, policy, alloc_size_level, &segmentUsedPageType); } if (p.err != ACL_ERROR_NONE) { return false; @@ -3293,7 +3476,7 @@ public: deleteFunc = &uncached_delete; size_t alloc_size = size + 32; NPU_CHECK_ERROR(c10_npu::acl::AclrtMallocAlign32(&devPtr, alloc_size, - aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST)); + aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST, {}, &segmentUsedPageType)); ASCEND_LOGD("Without NPUCachingAllocator, malloc by " "AclrtMallocAlign32: size=%zu", alloc_size); @@ -3322,7 +3505,8 @@ public: deleteFunc = &uncached_delete; size_t alloc_size = size + 32 + aligned; NPU_CHECK_ERROR(c10_npu::acl::AclrtMallocAlign32(&realPtr, alloc_size, - aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST)); + aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST, + {}, &segmentUsedPageType)); ASCEND_LOGD("Without NPUCachingAllocator, malloc by " "AclrtMallocAlign32: size=%zu", alloc_size); } else { diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.h b/torch_npu/csrc/core/npu/NPUCachingAllocator.h index a4e14d2232ab30f7a3cd4e991c904f404b18f6a5..daf1f07ece4230dbf7833a60ce27dcf0783b1ccd 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.h +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.h @@ -445,6 +445,8 @@ bool checkConfigExpandableSegments(); bool isConfig1GPageSizeEnable(); +bool 
isConfig16GPageSizeEnable(); + } // namespace NPUCachingAllocator } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp index fa4e79ff7ba41f798fbd124c5ed727612b5718d0..9d3f8497f023258c2a090602aed87710c734118c 100644 --- a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp @@ -128,7 +128,7 @@ public: PTA_ERROR(ErrCode::MEMORY)); aclError err = c10_npu::acl::AclrtMallocAlign32( - &block->data_ptr, block->size, aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_ONLY); + &block->data_ptr, block->size, aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_ONLY, {}, nullptr); if (err != ACL_ERROR_NONE) { return nullptr; } @@ -510,7 +510,8 @@ public: if (size != 0) { size_t alloc_size = size + 32; NPU_CHECK_ERROR( - c10_npu::acl::AclrtMallocAlign32(&dev_ptr, alloc_size, aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_ONLY)); + c10_npu::acl::AclrtMallocAlign32(&dev_ptr, alloc_size, + aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_ONLY, {}, nullptr)); } } else { if (size != 0) { diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index b59e9c85c96e2998273953d3d068a3465bd0efde..33e654158004bf6d264be59876a579e28f0b1834 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -433,9 +433,10 @@ aclError AclrtGetDeviceUtilizationRate(int32_t deviceId, aclrtUtilizationInfo *u return func(deviceId, utilizationInfo); } -aclError AclrtMallocAlign32(void **devPtr, size_t size, aclrtMemMallocPolicy policy) { - typedef aclError (*AclrtMallocAlign32)(void**, size_t, aclrtMemMallocPolicy); - static AclrtMallocAlign32 func = (AclrtMallocAlign32)GET_FUNC(aclrtMallocAlign32); +aclError AclrtMallocAlign32(void **devPtr, size_t &size, aclrtMemMallocPolicy policy, + std::vector allocSizeLevel, uint32_t *segmentUsedPageType) { + typedef aclError 
(*AclrtMallocAlign32)(void **, size_t, aclrtMemMallocPolicy); + static AclrtMallocAlign32 func = (AclrtMallocAlign32) GET_FUNC(aclrtMallocAlign32); aclError ret; if (func != nullptr) { ret = func(devPtr, size, policy); @@ -444,13 +445,35 @@ aclError AclrtMallocAlign32(void **devPtr, size_t size, aclrtMemMallocPol ret = aclrtMalloc(devPtr, size, policy); } - if (ret != ACL_RT_SUCCESS && (policy == aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE1G_ONLY)) { + if (ret != ACL_RT_SUCCESS && (policy == aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE16G_ONLY)) { + TORCH_NPU_WARN_ONCE("The malloc 16G large-page physical memory failed, " + "so try to malloc 1G large-page physical memory." + "Using the 1G large-page physical memory page may result in performance degradation. " + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:16g configuration " + "is enabled, but the pre-allocated number of 16G large pages is insufficient " + "or 16G large-page memory pre-allocation is not enabled."); + policy = aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE1G_ONLY; + // 进行size对齐的调整 + size = allocSizeLevel[0]; + // 16g失败,标志位设置成1 + *segmentUsedPageType = 1; + if (func != nullptr) { + ret = func(devPtr, size, policy); + } else { + TORCH_NPU_WARN_ONCE(func, "Failed to find function ", "aclrtMallocAlign32"); + ret = aclrtMalloc(devPtr, size, policy); + } + }; + if (ret != ACL_RT_SUCCESS && (policy == aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE16G_ONLY || + policy == aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE1G_ONLY)) { TORCH_NPU_WARN_ONCE("The malloc 1G large-page physical memory failed, so malloc 2M page memory." "Using the 2M memory page may result in performance degradation. 
" "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g configuration is " "enabled, but the pre-allocated number of 1G large pages is insufficient or 1G large-page " "memory pre-allocation is not enabled."); policy = aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST; + size = allocSizeLevel[0]; + *segmentUsedPageType = 1; if (func != nullptr) { ret = func(devPtr, size, policy); } else { @@ -521,16 +544,33 @@ aclError AclrtReleaseMemAddress(void *virPtr, HcclComm hcclComm) return ret; } -aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t size, const aclrtPhysicalMemProp *prop, - uint64_t flags) { - typedef aclError (*AclrtMallocPhysical)(aclrtDrvMemHandle*, size_t, const aclrtPhysicalMemProp*, uint64_t); +aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t &size, const aclrtPhysicalMemProp *prop, + uint64_t flags, std::vector segmentSizeLevel, uint32_t *segmentUsedPageType) { + typedef aclError (*AclrtMallocPhysical)(aclrtDrvMemHandle *, size_t, const aclrtPhysicalMemProp *, uint64_t); static AclrtMallocPhysical func = nullptr; if (func == nullptr) { func = (AclrtMallocPhysical)GET_FUNC(aclrtMallocPhysical); } TORCH_CHECK(func, "Failed to find function ", "aclrtMallocPhysical", PTA_ERROR(ErrCode::NOT_FOUND)); aclError ret = func(handle, size, prop, flags); - if (ret != ACL_RT_SUCCESS && (prop->memAttr == ACL_HBM_MEM_HUGE1G)) { + if (ret != ACL_RT_SUCCESS && (prop->memAttr == ACL_HBM_MEM_HUGE16G)) { + TORCH_NPU_WARN_ONCE("The malloc 16G large-page physical memory failed, " + "so try to malloc 1G large-page physical memory." + "Using the 1G large-page physical memory page may result in performance degradation. 
" + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:16g configuration " + "is enabled, but the pre-allocated number of 16G large pages is insufficient " + "or 16G large-page memory pre-allocation is not enabled."); + aclrtPhysicalMemProp prop_update = {prop->handleType, + prop->allocationType, + ACL_HBM_MEM_HUGE1G, + {prop->location.id, + prop->location.type}, + prop->reserve}; + size = segmentSizeLevel[0]; + *segmentUsedPageType = 1; + ret = func(handle, size, &prop_update, flags); + }; + if (ret != ACL_RT_SUCCESS && (prop->memAttr == ACL_HBM_MEM_HUGE1G || prop->memAttr == ACL_HBM_MEM_HUGE16G)) { TORCH_NPU_WARN_ONCE("The malloc 1G large-page physical memory failed, so malloc 2M page memory." "Using the 2M memory page may result in performance degradation. " "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g configuration " @@ -542,6 +582,8 @@ aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t size, const aclrt {prop->location.id, prop->location.type}, prop->reserve}; + size = segmentSizeLevel[0]; + *segmentUsedPageType = 1; ret = func(handle, size, &prop_update, flags); } return ret; @@ -819,7 +861,7 @@ aclError AclsysGetCANNVersion(aclCANNPackageName name, aclCANNPackageVersion *ve if (func == nullptr) { func = (aclsysGetCANNVersionFunc)GET_FUNC(aclsysGetCANNVersion); if (func == nullptr) { - return ACL_ERROR_RT_FEATURE_NOT_SUPPORT; + return ACL_ERROR_RT_FEATURE_NOT_SUPPORT; } } diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.h b/torch_npu/csrc/core/npu/interface/AclInterface.h index 9cdad2663bd438107c409c0d0afe542193db6a75..3eae5bb3f020acf39e76a66a76385bb8cf7671c9 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.h +++ b/torch_npu/csrc/core/npu/interface/AclInterface.h @@ -137,7 +137,8 @@ aclError AclrtDestroyStreamForce(aclrtStream stream); aclError AclrtGetDeviceUtilizationRate(int32_t deviceId, aclrtUtilizationInfo *utilizationInfo); -aclError AclrtMallocAlign32(void **devPtr, size_t 
size, aclrtMemMallocPolicy policy); +aclError AclrtMallocAlign32(void **devPtr, size_t &size, aclrtMemMallocPolicy policy, + std::vector allocSizeLevel, uint32_t *segmentUsedPageType); aclError AclrtStreamQuery(aclrtStream stream, aclrtStreamStatus *status); @@ -148,7 +149,8 @@ aclError AclrtReserveMemAddress(void **virPtr, size_t size, size_t alignment, vo aclError AclrtReleaseMemAddress(void *virPtr, HcclComm hcclComm = nullptr); -aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t size, const aclrtPhysicalMemProp *prop, uint64_t flags); +aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t &size, const aclrtPhysicalMemProp *prop, uint64_t flags, + std::vector segmentSizeLevel, uint32_t *segmentUsedPageType); aclError AclrtFreePhysical(aclrtDrvMemHandle handle); diff --git a/torch_npu/csrc/npu/Stress_detect.cpp b/torch_npu/csrc/npu/Stress_detect.cpp index 3fcade819bea6a6f011b04b5a5257c87b002021d..63301c5bc683ae82e3ffa5ad53157a668359354f 100644 --- a/torch_npu/csrc/npu/Stress_detect.cpp +++ b/torch_npu/csrc/npu/Stress_detect.cpp @@ -109,10 +109,12 @@ int StressDetector::perform_stress_detect(int deviceid) uint64_t size = 10; workspaceSize = size << 10 << 10 << 10; // Assume memory size if (workspaceSize > 0) { - auto ret = c10_npu::acl::AclrtMallocAlign32(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST); + auto ret = c10_npu::acl::AclrtMallocAlign32(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST, {}, + nullptr); if (ret != ACL_ERROR_NONE) { c10_npu::NPUCachingAllocator::emptyCache(); - ret = c10_npu::acl::AclrtMallocAlign32(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST); + ret = c10_npu::acl::AclrtMallocAlign32(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST, {}, + nullptr); if (ret != ACL_ERROR_NONE) { ASCEND_LOGW("call AclrtMallocAlign32 failed, ERROR : %d. Skip StressDetect.", ret); task_in_progress.store(false); // Task ends