diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index e3c3a327be49710fd9a9d7ce5540c320a0cc6746..b81750695fe0338b1e55e1c83274aaf676f9804c 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -1554,7 +1554,7 @@ public: stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, stats.active_bytes[static_cast(StatType::AGGREGATE)].current, - reinterpret_cast(block->stream) }); + block->stream }); #endif return block; @@ -1619,7 +1619,7 @@ public: stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, stats.active_bytes[static_cast(StatType::AGGREGATE)].current, - reinterpret_cast(block->stream) }); + block->stream }); #endif } @@ -2434,7 +2434,7 @@ private: stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, stats.active_bytes[static_cast(StatType::AGGREGATE)].current, - reinterpret_cast(block->stream) }); + block->stream }); #endif } diff --git a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp index 7d5173dec8d19f2f167609a32c5f02b0c199ae6a..660089b0fb600f6c452f67c89766c98ea9660114 100644 --- a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp @@ -113,7 +113,7 @@ public: stats.allocated_bytes.current, stats.reserved_bytes.current, stats.allocated_bytes.current, - reinterpret_cast(stream)} + stream } ); #endif block->data_ptr = nullptr; @@ -154,7 +154,7 @@ public: stats.allocated_bytes.current, stats.reserved_bytes.current, stats.allocated_bytes.current, - reinterpret_cast(stream)} + stream } ); this->last_block = block; this->last_stream = stream; @@ -180,7 +180,7 @@ public: stats.allocated_bytes.current, 
stats.reserved_bytes.current, stats.allocated_bytes.current, - reinterpret_cast(stream)} + stream } ); this->last_block = block; this->last_stream = stream; @@ -204,7 +204,7 @@ public: stats.allocated_bytes.current, stats.reserved_bytes.current, stats.allocated_bytes.current, - reinterpret_cast(this->last_stream)} + this->last_stream } ); } #endif @@ -254,7 +254,7 @@ public: stats.allocated_bytes.current, stats.reserved_bytes.current, stats.allocated_bytes.current, - reinterpret_cast(block_pair.first)} + block_pair.first } ); #endif } diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index 30a4280edc8ccb71e202f57ba1901010fd241c49..ef5e7eeb13b7754245ec24318389c66730324991 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -89,6 +89,7 @@ LOAD_FUNCTION(aclrtIpcMemClose) LOAD_FUNCTION(aclrtMemExportToShareableHandle) LOAD_FUNCTION(aclrtMemSetPidToShareableHandle) LOAD_FUNCTION(aclrtMemImportFromShareableHandle) +LOAD_FUNCTION(aclrtStreamGetId) LOAD_FUNCTION(aclrtDeviceGetBareTgid) aclprofStepInfoPtr init_stepinfo() { @@ -1033,5 +1034,16 @@ aclError AclrtDeviceGetBareTgid(int32_t *pid) return func(pid); } +aclError AclrtStreamGetId(aclrtStream stream, int32_t* stream_id) /* Wrapper for the runtime symbol aclrtStreamGetId: forwards stream and the stream_id out-pointer unchanged and returns the callee's aclError. */ +{ + typedef aclError(*AclrtStreamGetIdFunc)(aclrtStream, int32_t*); + static AclrtStreamGetIdFunc func = nullptr; /* cached in a function-local static, so GET_FUNC is only evaluated while the pointer is still null */ + if (func == nullptr) { + func = (AclrtStreamGetIdFunc)GET_FUNC(aclrtStreamGetId); /* presumably resolves the symbol from the loaded ACL library - confirm against the GET_FUNC macro */ + } + TORCH_CHECK(func, "Failed to find function ", "AclrtStreamGetId", PROF_ERROR(ErrCode::NOT_FOUND)); /* the check fails when the lookup produced no pointer, so the call below never goes through null */ + return func(stream, stream_id); +} + } // namespace acl } // namespace c10 diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.h b/torch_npu/csrc/core/npu/interface/AclInterface.h index 373aca671f5e5e0a73de73c72826c5c17c229c4f..46dd1180200b0ab598efad49c84251abf2cd0843 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.h +++ 
b/torch_npu/csrc/core/npu/interface/AclInterface.h @@ -243,6 +243,8 @@ aclError AclrtMemSetPidToShareableHandle(uint64_t shareableHandle, int32_t *pid, aclError AclrtMemImportFromShareableHandle(uint64_t shareableHandle, int32_t deviceId, aclrtDrvMemHandle *handle); +aclError AclrtStreamGetId(aclrtStream stream, int32_t* stream_id); /* forwards to the runtime's aclrtStreamGetId; the result is written through stream_id and the aclError is returned to the caller */ + aclError AclrtDeviceGetBareTgid(int32_t *pid); } // namespace acl diff --git a/torch_npu/csrc/profiler/npu_profiler.cpp b/torch_npu/csrc/profiler/npu_profiler.cpp index 295eda9aea1f2a425c21caaf037e4f60713a463e..3678da07550d017db863d53ff2d7d05cf12f4edd 100644 --- a/torch_npu/csrc/profiler/npu_profiler.cpp +++ b/torch_npu/csrc/profiler/npu_profiler.cpp @@ -6,6 +6,7 @@ #include "torch_npu/csrc/core/npu/npu_log.h" #include "torch_npu/csrc/core/npu/NPUException.h" +#include "torch_npu/csrc/core/npu/interface/AclInterface.h" #include "torch_npu/csrc/profiler/npu_profiler.h" #include "torch_npu/csrc/toolkit/profiler/common/utils.h" @@ -380,6 +381,8 @@ void reportMemoryDataToNpuProfiler(const MemoryUsage& data) if (!ProfilerMgr::GetInstance()->ReportMemEnable().load()) { return; } + int32_t stream_id = -1; /* -1 marks "stream id unavailable", so an indeterminate value is never uploaded */ + if (c10_npu::acl::AclrtStreamGetId(data.stream, &stream_id) != ACL_ERROR_NONE) { stream_id = -1; } /* a failed query must not leak a partially written id into the record. NOTE(review): AclrtStreamGetId fails its TORCH_CHECK when the symbol is missing - confirm that is acceptable on this reporting path */ ProfilerMgr::GetInstance()->UploadWithLock(std::make_unique( data.ptr, static_cast(Utils::GetClockTime()), @@ -387,7 +390,7 @@ void reportMemoryDataToNpuProfiler(const MemoryUsage& data) data.total_allocated, data.total_reserved, data.total_active, - data.stream_ptr, + stream_id, data.device_type, data.device_index, data.component_type, diff --git a/torch_npu/csrc/profiler/npu_profiler.h b/torch_npu/csrc/profiler/npu_profiler.h index 2127825bc134e0b49178fe00890cdff58011e62c..715660f2e9fd90191e1a0061ab090f7599f730a0 100644 --- a/torch_npu/csrc/profiler/npu_profiler.h +++ b/torch_npu/csrc/profiler/npu_profiler.h @@ -7,6 +7,8 @@ #include +#include "third_party/acl/inc/acl/acl_base.h" +#include "third_party/acl/inc/acl/acl_rt.h" #include "torch_npu/csrc/toolkit/profiler/inc/data_reporter.h" 
#include "torch_npu/csrc/profiler/profiler_mgr.h" #include "torch_npu/csrc/profiler/mstx_mgr.h" @@ -55,7 +57,7 @@ struct MemoryUsage { int64_t total_allocated{0}; int64_t total_reserved{0}; int64_t total_active{0}; - int64_t stream_ptr{0}; + aclrtStream stream{nullptr}; /* stream handle for this memory record, null by default; reportMemoryDataToNpuProfiler passes it to AclrtStreamGetId to obtain the numeric stream id */ }; struct ExperimentalConfig {