// Review notes for this patch. Commentary only: the patch text that follows
// the notes is unchanged.
//
// NOTE(review): line breaks and indentation appear to have been collapsed in
// this copy of the diff, so it may not apply as-is -- confirm against the
// original commit before using it with a patch tool.
//
// The patch touches three files:
//
// 1. third_party/acl/inc/acl/acl_rt.h (comment-only hunk)
//    The doc comment now states that the asynchronous Host/Device memory
//    replication would be synchronous if the memory is not allocated via an
//    acl or rts api, and the reminder to call aclrtSynchronizeStream is
//    qualified with the same allocation condition.
//    NOTE(review): the declaration this comment documents is outside the hunk.
//
// 2. torch_npu/csrc/core/npu/interface/AclInterface.cpp
//    AclrtMemcpyAsyncWithConditionExist() used to return whether
//    GET_FUNC(aclrtMemcpyAsyncWithCondition) was non-null. With this patch it
//    returns true only when that lookup is non-null AND
//    c10_npu::GetSocVersion() >= c10_npu::SocVersion::Ascend910B1. The value
//    is still produced by the lambda that initializes the function-local
//    `const static bool`, and the info log is still emitted whenever the
//    lookup is non-null.
//    NOTE(review): `<comparison> ? true : false` is redundant; the added
//    lines are equivalent to returning `socCheck && func != nullptr`.
//    NOTE(review): `auto func = GET_FUNC(...)` has no trailing semicolon in
//    both the removed and the added line -- presumably the macro supplies
//    it; confirm.
//
// 3. torch_npu/csrc/framework/OpParamMaker.cpp
//    MemcopyAsyncFunc now calls c10_npu::acl::AclrtMemcpyAsyncWithCondition
//    when AclrtMemcpyAsyncWithConditionExist() is true and cur_paras->kind is
//    aclrtMemcpyKind::ACL_MEMCPY_DEVICE_TO_HOST; otherwise it calls
//    aclrtMemcpyAsync as before. Both calls receive the same arguments. On
//    failure the error log names whichever of the two APIs was called.
//    NOTE(review): `bool flag;` is declared without an initializer (both
//    branches assign it before it is read), and the two ASCEND_LOGE calls
//    differ only in the API name inside the format string.
//    NOTE(review): `static_cast(in->paramVal)` in the context lines has no
//    template argument -- presumably lost when this copy of the diff was
//    produced; confirm against the repository.
//    NOTE(review): the closing brace of MemcopyAsyncFunc is not part of the
//    hunk shown here.
diff --git a/third_party/acl/inc/acl/acl_rt.h b/third_party/acl/inc/acl/acl_rt.h index 6d97252ba45482cc4592ff707264a74fab44f67f..4b897a33ec37eadd0cdfb4479454e32a45b59fe9 100755 --- a/third_party/acl/inc/acl/acl_rt.h +++ b/third_party/acl/inc/acl/acl_rt.h @@ -941,10 +941,11 @@ ACL_FUNC_VISIBILITY aclError aclrtMemcpyAsync(void *dst, /** * @ingroup AscendCL - * @brief Asynchronous memory replication between Host and Device + * @brief Asynchronous memory replication between Host and Device, would + * be synchronous if memory is not allocated via calling acl or rts api. * * @par Function - * After calling this interface, + * After calling this interface and memory is allocated via calling acl or rts api, * be sure to call the aclrtSynchronizeStream interface to ensure that * the task of memory replication has been completed * diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index e918d04ae73f2c5197533a53aaa8f117c6281da1..c97693eeb5bfddbec5eb57b9637f3bc5070d8360 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -1127,11 +1127,14 @@ aclError AclrtUnSubscribeReport(uint64_t theadId, aclrtStream stream) bool AclrtMemcpyAsyncWithConditionExist() { const static bool isAclrtMemcpyAsyncWithConditionExist = []() -> bool { - auto func = GET_FUNC(aclrtMemcpyAsyncWithCondition) + auto func = GET_FUNC(aclrtMemcpyAsyncWithCondition) + bool flag = c10_npu::GetSocVersion() >= c10_npu::SocVersion::Ascend910B1 ? 
true : false; if (func != nullptr) { ASCEND_LOGI("Successfully to find function aclrtMemcpyAsyncWithCondition"); + } else { + flag = false; } - return func != nullptr; + return flag; }(); return isAclrtMemcpyAsyncWithConditionExist; } diff --git a/torch_npu/csrc/framework/OpParamMaker.cpp b/torch_npu/csrc/framework/OpParamMaker.cpp index 3abd06421a90eee27edfd0d95756546d3af2709d..9a949332c85e1f8a1918409dfc75bcb5db6eefe2 100644 --- a/torch_npu/csrc/framework/OpParamMaker.cpp +++ b/torch_npu/csrc/framework/OpParamMaker.cpp @@ -451,19 +451,35 @@ int MemcopyAsyncFunc(c10_npu::queue::QueueParas *in, aclrtStream stream) { auto cur_paras = static_cast(in->paramVal); logger->debug("MemcopyAsyncFunc Run."); - aclError ret = - aclrtMemcpyAsync(cur_paras->dst, cur_paras->dstLen, cur_paras->src, cur_paras->srcLen, cur_paras->kind, stream); + aclError ret; + bool flag; + if (c10_npu::acl::AclrtMemcpyAsyncWithConditionExist() && cur_paras->kind == aclrtMemcpyKind::ACL_MEMCPY_DEVICE_TO_HOST) { + flag = true; + ret = c10_npu::acl::AclrtMemcpyAsyncWithCondition(cur_paras->dst, cur_paras->dstLen, cur_paras->src, cur_paras->srcLen, cur_paras->kind, stream); + } else { + flag = false; + ret = aclrtMemcpyAsync(cur_paras->dst, cur_paras->dstLen, cur_paras->src, cur_paras->srcLen, cur_paras->kind, stream); + } if (ret != ACL_ERROR_NONE) { auto ret_temp = c10_npu::acl::AclrtPeekAtLastError(ACL_RT_THREAD_LEVEL); if (ret_temp != ACL_ERROR_NONE) { ret = ret_temp; } - ASCEND_LOGE( - "aclrtMemcpyAsync error! ret = %d, dstLen = %zu, srcLen = %zu, kind = %d", - ret, - cur_paras->dstLen, - cur_paras->srcLen, - cur_paras->kind); + if (flag) { + ASCEND_LOGE( + "aclrtMemcpyAsyncWithCondition error! ret = %d, dstLen = %zu, srcLen = %zu, kind = %d", + ret, + cur_paras->dstLen, + cur_paras->srcLen, + cur_paras->kind); + } else { + ASCEND_LOGE( + "aclrtMemcpyAsync error! 
ret = %d, dstLen = %zu, srcLen = %zu, kind = %d", + ret, + cur_paras->dstLen, + cur_paras->srcLen, + cur_paras->kind); + } } logger->debug("MemcopyAsyncFunc Run, ret = %d.", ret); return ret;