From e1dfca93af4423706fc704df645f16060f856c8e Mon Sep 17 00:00:00 2001 From: SCh-zx <1325467101@qq.com> Date: Thu, 28 Aug 2025 21:01:29 +0800 Subject: [PATCH 1/2] Use AclrtMemcpyAsyncWithCondition for D2H async copies; add get_base_data_ptr --- torch_npu/csrc/aten/common/CopyKernel.cpp | 2 +- torch_npu/csrc/aten/common/InnerNpuNativeFunction.h | 8 ++++++++ torch_npu/csrc/aten/ops/op_api/CopyKernelOpApi.cpp | 2 +- torch_npu/csrc/framework/OpParamMaker.cpp | 8 ++++++-- torch_npu/csrc/framework/utils/CalcuOpUtil.cpp | 9 +++++++-- 5 files changed, 23 insertions(+), 6 deletions(-) diff --git a/torch_npu/csrc/aten/common/CopyKernel.cpp b/torch_npu/csrc/aten/common/CopyKernel.cpp index ef624a7b36..65f5e39eb7 100644 --- a/torch_npu/csrc/aten/common/CopyKernel.cpp +++ b/torch_npu/csrc/aten/common/CopyKernel.cpp @@ -98,7 +98,7 @@ void copy_between_host_and_device( auto ret = CalcuOpUtil::LaunchAsyncCopyTaskWithModeSwitch(dst, nbytes, src, nbytes, kind); NPU_CHECK_ERROR(ret); ASCEND_LOGD("non_blocking copy without StreamSynchronize."); - void* ptr = torch_npu::utils::is_npu(dst) ? src.storage().mutable_data() : dst.storage().mutable_data(); + void* ptr = torch_npu::utils::is_npu(dst) ? 
get_base_data_ptr(src) : get_base_data_ptr(dst); NPU_CHECK_ERROR(CachingHostAllocator_recordEvent(ptr, kind, stream), "aclrtSynchronizeStreamWithTimeout"); } else { aclError error = c10_npu::acl::AclrtSynchronizeStreamWithTimeout(stream); diff --git a/torch_npu/csrc/aten/common/InnerNpuNativeFunction.h b/torch_npu/csrc/aten/common/InnerNpuNativeFunction.h index 56727a5299..234f361fba 100644 --- a/torch_npu/csrc/aten/common/InnerNpuNativeFunction.h +++ b/torch_npu/csrc/aten/common/InnerNpuNativeFunction.h @@ -20,6 +20,14 @@ at::Tensor matmul_by_bmmV2(const at::Tensor& tensor1, const at::Tensor& tensor2) */ void npu_fast_reshape_(at::Tensor& tensor); +inline void* get_base_data_ptr(const at::Tensor &t) +{ + if (!t.is_view()) { + return t.data_ptr(); + } + return t._base().data_ptr(); +} + } // namespace native } // namespace at_npu diff --git a/torch_npu/csrc/aten/ops/op_api/CopyKernelOpApi.cpp b/torch_npu/csrc/aten/ops/op_api/CopyKernelOpApi.cpp index b4d5909158..4b999c937b 100644 --- a/torch_npu/csrc/aten/ops/op_api/CopyKernelOpApi.cpp +++ b/torch_npu/csrc/aten/ops/op_api/CopyKernelOpApi.cpp @@ -44,7 +44,7 @@ void copy_between_host_and_device_opapi(at::Tensor& dst, const at::Tensor& src, auto ret = CalcuOpUtil::LaunchAsyncCopyTaskWithModeSwitch(dst, nbytes, src, nbytes, kind); NPU_CHECK_ERROR(ret); ASCEND_LOGD("non_blocking copy without StreamSynchronize."); - void* ptr = torch_npu::utils::is_npu(dst) ? src.storage().mutable_data() : dst.storage().mutable_data(); + void* ptr = torch_npu::utils::is_npu(dst) ? 
get_base_data_ptr(src) : get_base_data_ptr(dst); NPU_CHECK_ERROR(CachingHostAllocator_recordEvent(ptr, kind, stream), "aclrtSynchronizeStreamWithTimeout"); } else { aclError error = aclrtSynchronizeStream(stream); diff --git a/torch_npu/csrc/framework/OpParamMaker.cpp b/torch_npu/csrc/framework/OpParamMaker.cpp index 3abd06421a..e06d68eedc 100644 --- a/torch_npu/csrc/framework/OpParamMaker.cpp +++ b/torch_npu/csrc/framework/OpParamMaker.cpp @@ -451,8 +451,12 @@ int MemcopyAsyncFunc(c10_npu::queue::QueueParas *in, aclrtStream stream) { auto cur_paras = static_cast(in->paramVal); logger->debug("MemcopyAsyncFunc Run."); - aclError ret = - aclrtMemcpyAsync(cur_paras->dst, cur_paras->dstLen, cur_paras->src, cur_paras->srcLen, cur_paras->kind, stream); + aclError ret; + if (c10_npu::acl::AclrtMemcpyAsyncWithConditionExist() && cur_paras->kind == aclrtMemcpyKind::ACL_MEMCPY_DEVICE_TO_HOST) { + ret = c10_npu::acl::AclrtMemcpyAsyncWithCondition(cur_paras->dst, cur_paras->dstLen, cur_paras->src, cur_paras->srcLen, cur_paras->kind, stream); + } else { + ret = aclrtMemcpyAsync(cur_paras->dst, cur_paras->dstLen, cur_paras->src, cur_paras->srcLen, cur_paras->kind, stream); + } if (ret != ACL_ERROR_NONE) { auto ret_temp = c10_npu::acl::AclrtPeekAtLastError(ACL_RT_THREAD_LEVEL); if (ret_temp != ACL_ERROR_NONE) { diff --git a/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp b/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp index 5aaaf2b809..2a073c8d61 100644 --- a/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp +++ b/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp @@ -118,8 +118,13 @@ static std::unordered_map aclError AclrtMemcpyAsyncParamCheck( void *dst, size_t destMax, const void *src, size_t count, aclrtMemcpyKind kind, aclrtStream stream) { - auto ret = aclrtMemcpyAsync(dst, destMax, src, count, kind, stream); - return ret; + if (c10_npu::acl::AclrtMemcpyAsyncWithConditionExist() && kind == aclrtMemcpyKind::ACL_MEMCPY_DEVICE_TO_HOST) { + auto ret = 
c10_npu::acl::AclrtMemcpyAsyncWithCondition(dst, destMax, src, count, kind, stream); + return ret; + } else { + auto ret = aclrtMemcpyAsync(dst, destMax, src, count, kind, stream); + return ret; + } } aclError AclrtMemcpyParamCheck(void *dst, size_t destMax, const void *src, size_t count, aclrtMemcpyKind kind) -- Gitee From 803384a70e3c0b662e3150f171f5447421b07105 Mon Sep 17 00:00:00 2001 From: SCh-zx <1325467101@qq.com> Date: Mon, 1 Sep 2025 15:08:14 +0800 Subject: [PATCH 2/2] Revert get_base_data_ptr; restore storage().mutable_data() in copy kernels --- torch_npu/csrc/aten/common/CopyKernel.cpp | 2 +- torch_npu/csrc/aten/common/InnerNpuNativeFunction.h | 8 -------- torch_npu/csrc/aten/ops/op_api/CopyKernelOpApi.cpp | 2 +- 3 files changed, 2 insertions(+), 10 deletions(-) diff --git a/torch_npu/csrc/aten/common/CopyKernel.cpp b/torch_npu/csrc/aten/common/CopyKernel.cpp index 65f5e39eb7..ef624a7b36 100644 --- a/torch_npu/csrc/aten/common/CopyKernel.cpp +++ b/torch_npu/csrc/aten/common/CopyKernel.cpp @@ -98,7 +98,7 @@ void copy_between_host_and_device( auto ret = CalcuOpUtil::LaunchAsyncCopyTaskWithModeSwitch(dst, nbytes, src, nbytes, kind); NPU_CHECK_ERROR(ret); ASCEND_LOGD("non_blocking copy without StreamSynchronize."); - void* ptr = torch_npu::utils::is_npu(dst) ? get_base_data_ptr(src) : get_base_data_ptr(dst); + void* ptr = torch_npu::utils::is_npu(dst) ? 
src.storage().mutable_data() : dst.storage().mutable_data(); NPU_CHECK_ERROR(CachingHostAllocator_recordEvent(ptr, kind, stream), "aclrtSynchronizeStreamWithTimeout"); } else { aclError error = c10_npu::acl::AclrtSynchronizeStreamWithTimeout(stream); diff --git a/torch_npu/csrc/aten/common/InnerNpuNativeFunction.h b/torch_npu/csrc/aten/common/InnerNpuNativeFunction.h index 234f361fba..56727a5299 100644 --- a/torch_npu/csrc/aten/common/InnerNpuNativeFunction.h +++ b/torch_npu/csrc/aten/common/InnerNpuNativeFunction.h @@ -20,14 +20,6 @@ at::Tensor matmul_by_bmmV2(const at::Tensor& tensor1, const at::Tensor& tensor2) */ void npu_fast_reshape_(at::Tensor& tensor); -inline void* get_base_data_ptr(const at::Tensor &t) -{ - if (!t.is_view()) { - return t.data_ptr(); - } - return t._base().data_ptr(); -} - } // namespace native } // namespace at_npu diff --git a/torch_npu/csrc/aten/ops/op_api/CopyKernelOpApi.cpp b/torch_npu/csrc/aten/ops/op_api/CopyKernelOpApi.cpp index 4b999c937b..b4d5909158 100644 --- a/torch_npu/csrc/aten/ops/op_api/CopyKernelOpApi.cpp +++ b/torch_npu/csrc/aten/ops/op_api/CopyKernelOpApi.cpp @@ -44,7 +44,7 @@ void copy_between_host_and_device_opapi(at::Tensor& dst, const at::Tensor& src, auto ret = CalcuOpUtil::LaunchAsyncCopyTaskWithModeSwitch(dst, nbytes, src, nbytes, kind); NPU_CHECK_ERROR(ret); ASCEND_LOGD("non_blocking copy without StreamSynchronize."); - void* ptr = torch_npu::utils::is_npu(dst) ? get_base_data_ptr(src) : get_base_data_ptr(dst); + void* ptr = torch_npu::utils::is_npu(dst) ? src.storage().mutable_data() : dst.storage().mutable_data(); NPU_CHECK_ERROR(CachingHostAllocator_recordEvent(ptr, kind, stream), "aclrtSynchronizeStreamWithTimeout"); } else { aclError error = aclrtSynchronizeStream(stream); -- Gitee